the do / gr-peach-opencv-project

Fork of gr-peach-opencv-project by the do



smooth.cpp Source File

smooth.cpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 #include "opencl_kernels_imgproc.hpp"
00046 
00047 /*
00048  * This file includes the code, contributed by Simon Perreault
00049  * (the function icvMedianBlur_8u_O1)
00050  *
00051  * Constant-time median filtering -- http://nomis80.org/ctmf.html
00052  * Copyright (C) 2006 Simon Perreault
00053  *
00054  * Contact:
00055  *  Laboratoire de vision et systemes numeriques
00056  *  Pavillon Adrien-Pouliot
00057  *  Universite Laval
00058  *  Sainte-Foy, Quebec, Canada
00059  *  G1K 7P4
00060  *
00061  *  perreaul@gel.ulaval.ca
00062  */
00063 
00064 namespace cv
00065 {
00066 
00067 /****************************************************************************************\
00068                                          Box Filter
00069 \****************************************************************************************/
00070 
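/*
 * Note: the box filter below is implemented as a separable moving sum.
 * RowSum computes, for every pixel of a row, the sum of `ksize` horizontally
 * adjacent values, sliding the window by adding the incoming sample and
 * subtracting the outgoing one. ColumnSum then accumulates `ksize` of those
 * row sums vertically in the same fashion, so the per-pixel cost stays
 * constant regardless of kernel size.
 */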
00071 template<typename T, typename ST>
00072 struct RowSum :
00073         public BaseRowFilter
00074 {
00075     RowSum( int _ksize, int _anchor ) :
00076         BaseRowFilter()
00077     {
00078         ksize = _ksize;
00079         anchor = _anchor;
00080     }
00081 
00082     virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
00083     {
00084         const T* S = (const T*)src;
00085         ST* D = (ST*)dst;
00086         int i = 0, k, ksz_cn = ksize*cn;
00087 
00088         width = (width - 1)*cn;
00089         for( k = 0; k < cn; k++, S++, D++ )
00090         {
00091             ST s = 0;
00092             for( i = 0; i < ksz_cn; i += cn )
00093                 s += S[i];
00094             D[0] = s;
00095             for( i = 0; i < width; i += cn )
00096             {
00097                 s += S[i + ksz_cn] - S[i];
00098                 D[i+cn] = s;
00099             }
00100         }
00101     }
00102 };
00103 
00104 
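/*
 * Generic vertical accumulator: `sum` holds the running column totals of the
 * last (ksize-1) row-sum buffers. For each output row the newest row (Sp) is
 * added, the (optionally scaled) result is written through saturate_cast, and
 * the oldest row (Sm) is subtracted so the window slides down by one row.
 */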
00105 template<typename ST, typename T>
00106 struct ColumnSum :
00107         public BaseColumnFilter
00108 {
00109     ColumnSum( int _ksize, int _anchor, double _scale ) :
00110         BaseColumnFilter()
00111     {
00112         ksize = _ksize;
00113         anchor = _anchor;
00114         scale = _scale;
00115         sumCount = 0;
00116     }
00117 
00118     virtual void reset() { sumCount = 0; }
00119 
00120     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00121     {
00122         int i;
00123         ST* SUM;
00124         bool haveScale = scale != 1;
00125         double _scale = scale;
00126 
00127         if( width != (int)sum.size() )
00128         {
00129             sum.resize(width);
00130             sumCount = 0;
00131         }
00132 
00133         SUM = &sum[0];
00134         if( sumCount == 0 )
00135         {
00136             memset((void*)SUM, 0, width*sizeof(ST));
00137 
00138             for( ; sumCount < ksize - 1; sumCount++, src++ )
00139             {
00140                 const ST* Sp = (const ST*)src[0];
00141                 for( i = 0; i <= width - 2; i += 2 )
00142                 {
00143                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
00144                     SUM[i] = s0; SUM[i+1] = s1;
00145                 }
00146 
00147                 for( ; i < width; i++ )
00148                     SUM[i] += Sp[i];
00149             }
00150         }
00151         else
00152         {
00153             CV_Assert( sumCount == ksize-1 );
00154             src += ksize-1;
00155         }
00156 
00157         for( ; count--; src++ )
00158         {
00159             const ST* Sp = (const ST*)src[0];
00160             const ST* Sm = (const ST*)src[1-ksize];
00161             T* D = (T*)dst;
00162             if( haveScale )
00163             {
00164                 for( i = 0; i <= width - 2; i += 2 )
00165                 {
00166                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
00167                     D[i] = saturate_cast<T>(s0*_scale);
00168                     D[i+1] = saturate_cast<T>(s1*_scale);
00169                     s0 -= Sm[i]; s1 -= Sm[i+1];
00170                     SUM[i] = s0; SUM[i+1] = s1;
00171                 }
00172 
00173                 for( ; i < width; i++ )
00174                 {
00175                     ST s0 = SUM[i] + Sp[i];
00176                     D[i] = saturate_cast<T>(s0*_scale);
00177                     SUM[i] = s0 - Sm[i];
00178                 }
00179             }
00180             else
00181             {
00182                 for( i = 0; i <= width - 2; i += 2 )
00183                 {
00184                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
00185                     D[i] = saturate_cast<T>(s0);
00186                     D[i+1] = saturate_cast<T>(s1);
00187                     s0 -= Sm[i]; s1 -= Sm[i+1];
00188                     SUM[i] = s0; SUM[i+1] = s1;
00189                 }
00190 
00191                 for( ; i < width; i++ )
00192                 {
00193                     ST s0 = SUM[i] + Sp[i];
00194                     D[i] = saturate_cast<T>(s0);
00195                     SUM[i] = s0 - Sm[i];
00196                 }
00197             }
00198             dst += dststep;
00199         }
00200     }
00201 
00202     double scale;
00203     int sumCount;
00204     std::vector<ST> sum;
00205 };
00206 
00207 
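/*
 * The specializations below (int->uchar, int->short, int->ushort, int->int,
 * int->float) perform the same sliding-window update as the generic version,
 * adding SSE2 and NEON vectorized inner loops where the hardware supports them.
 */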
00208 template<>
00209 struct ColumnSum<int, uchar> :
00210         public BaseColumnFilter
00211 {
00212     ColumnSum( int _ksize, int _anchor, double _scale ) :
00213         BaseColumnFilter()
00214     {
00215         ksize = _ksize;
00216         anchor = _anchor;
00217         scale = _scale;
00218         sumCount = 0;
00219     }
00220 
00221     virtual void reset() { sumCount = 0; }
00222 
00223     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00224     {
00225         int i;
00226         int* SUM;
00227         bool haveScale = scale != 1;
00228         double _scale = scale;
00229 
00230         #if CV_SSE2
00231             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
00232         #endif
00233 
00234         if( width != (int)sum.size() )
00235         {
00236             sum.resize(width);
00237             sumCount = 0;
00238         }
00239 
00240         SUM = &sum[0];
00241         if( sumCount == 0 )
00242         {
00243             memset((void*)SUM, 0, width*sizeof(int));
00244             for( ; sumCount < ksize - 1; sumCount++, src++ )
00245             {
00246                 const int* Sp = (const int*)src[0];
00247                 i = 0;
00248                 #if CV_SSE2
00249                 if(haveSSE2)
00250                 {
00251                     for( ; i <= width-4; i+=4 )
00252                     {
00253                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00254                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00255                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
00256                     }
00257                 }
00258                 #elif CV_NEON
00259                 for( ; i <= width - 4; i+=4 )
00260                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00261                 #endif
00262                 for( ; i < width; i++ )
00263                     SUM[i] += Sp[i];
00264             }
00265         }
00266         else
00267         {
00268             CV_Assert( sumCount == ksize-1 );
00269             src += ksize-1;
00270         }
00271 
00272         for( ; count--; src++ )
00273         {
00274             const int* Sp = (const int*)src[0];
00275             const int* Sm = (const int*)src[1-ksize];
00276             uchar* D = (uchar*)dst;
00277             if( haveScale )
00278             {
00279                 i = 0;
00280                 #if CV_SSE2
00281                 if(haveSSE2)
00282                 {
00283                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00284                     for( ; i <= width-8; i+=8 )
00285                     {
00286                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00287                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00288 
00289                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00290                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00291                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00292                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00293 
00294                         __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00295                         __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
00296 
00297                         _s0T = _mm_packs_epi32(_s0T, _s0T1);
00298 
00299                         _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
00300 
00301                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00302                         _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
00303                     }
00304                 }
00305                 #elif CV_NEON
00306                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00307                 for( ; i <= width-8; i+=8 )
00308                 {
00309                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00310                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00311 
00312                     uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00313                     uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00314 
00315                     uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
00316                     vst1_u8(D + i, vqmovn_u16(v_dst));
00317 
00318                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00319                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00320                 }
00321                 #endif
00322                 for( ; i < width; i++ )
00323                 {
00324                     int s0 = SUM[i] + Sp[i];
00325                     D[i] = saturate_cast<uchar>(s0*_scale);
00326                     SUM[i] = s0 - Sm[i];
00327                 }
00328             }
00329             else
00330             {
00331                 i = 0;
00332                 #if CV_SSE2
00333                 if(haveSSE2)
00334                 {
00335                     for( ; i <= width-8; i+=8 )
00336                     {
00337                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00338                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00339 
00340                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00341                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00342                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00343                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00344 
00345                         __m128i _s0T = _mm_packs_epi32(_s0, _s01);
00346 
00347                         _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
00348 
00349                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00350                         _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
00351                     }
00352                 }
00353                 #elif CV_NEON
00354                 for( ; i <= width-8; i+=8 )
00355                 {
00356                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00357                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00358 
00359                     uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
00360                     vst1_u8(D + i, vqmovn_u16(v_dst));
00361 
00362                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00363                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00364                 }
00365                 #endif
00366 
00367                 for( ; i < width; i++ )
00368                 {
00369                     int s0 = SUM[i] + Sp[i];
00370                     D[i] = saturate_cast<uchar>(s0);
00371                     SUM[i] = s0 - Sm[i];
00372                 }
00373             }
00374             dst += dststep;
00375         }
00376     }
00377 
00378     double scale;
00379     int sumCount;
00380     std::vector<int> sum;
00381 };
00382 
00383 template<>
00384 struct ColumnSum<int, short> :
00385         public BaseColumnFilter
00386 {
00387     ColumnSum( int _ksize, int _anchor, double _scale ) :
00388         BaseColumnFilter()
00389     {
00390         ksize = _ksize;
00391         anchor = _anchor;
00392         scale = _scale;
00393         sumCount = 0;
00394     }
00395 
00396     virtual void reset() { sumCount = 0; }
00397 
00398     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00399     {
00400         int i;
00401         int* SUM;
00402         bool haveScale = scale != 1;
00403         double _scale = scale;
00404 
00405         #if CV_SSE2
00406             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
00407         #endif
00408 
00409         if( width != (int)sum.size() )
00410         {
00411             sum.resize(width);
00412             sumCount = 0;
00413         }
00414         SUM = &sum[0];
00415         if( sumCount == 0 )
00416         {
00417             memset((void*)SUM, 0, width*sizeof(int));
00418             for( ; sumCount < ksize - 1; sumCount++, src++ )
00419             {
00420                 const int* Sp = (const int*)src[0];
00421                 i = 0;
00422                 #if CV_SSE2
00423                 if(haveSSE2)
00424                 {
00425                     for( ; i <= width-4; i+=4 )
00426                     {
00427                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00428                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00429                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
00430                     }
00431                 }
00432                 #elif CV_NEON
00433                 for( ; i <= width - 4; i+=4 )
00434                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00435                 #endif
00436                 for( ; i < width; i++ )
00437                     SUM[i] += Sp[i];
00438             }
00439         }
00440         else
00441         {
00442             CV_Assert( sumCount == ksize-1 );
00443             src += ksize-1;
00444         }
00445 
00446         for( ; count--; src++ )
00447         {
00448             const int* Sp = (const int*)src[0];
00449             const int* Sm = (const int*)src[1-ksize];
00450             short* D = (short*)dst;
00451             if( haveScale )
00452             {
00453                 i = 0;
00454                 #if CV_SSE2
00455                 if(haveSSE2)
00456                 {
00457                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00458                     for( ; i <= width-8; i+=8 )
00459                     {
00460                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00461                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00462 
00463                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00464                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00465                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00466                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00467 
00468                         __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00469                         __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
00470 
00471                         _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
00472 
00473                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
00474                         _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
00475                     }
00476                 }
00477                 #elif CV_NEON
00478                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00479                 for( ; i <= width-8; i+=8 )
00480                 {
00481                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00482                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00483 
00484                     int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00485                     int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00486                     vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));
00487 
00488                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00489                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00490                 }
00491                 #endif
00492                 for( ; i < width; i++ )
00493                 {
00494                     int s0 = SUM[i] + Sp[i];
00495                     D[i] = saturate_cast<short>(s0*_scale);
00496                     SUM[i] = s0 - Sm[i];
00497                 }
00498             }
00499             else
00500             {
00501                 i = 0;
00502                 #if CV_SSE2
00503                 if(haveSSE2)
00504                 {
00505                     for( ; i <= width-8; i+=8 )
00506                     {
00507 
00508                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00509                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00510 
00511                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00512                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00513                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00514                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00515 
00516                         _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
00517 
00518                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00519                         _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
00520                     }
00521                 }
00522                 #elif CV_NEON
00523                 for( ; i <= width-8; i+=8 )
00524                 {
00525                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00526                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00527 
00528                     vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));
00529 
00530                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00531                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00532                 }
00533                 #endif
00534 
00535                 for( ; i < width; i++ )
00536                 {
00537                     int s0 = SUM[i] + Sp[i];
00538                     D[i] = saturate_cast<short>(s0);
00539                     SUM[i] = s0 - Sm[i];
00540                 }
00541             }
00542             dst += dststep;
00543         }
00544     }
00545 
00546     double scale;
00547     int sumCount;
00548     std::vector<int> sum;
00549 };
00550 
00551 
00552 template<>
00553 struct ColumnSum<int, ushort> :
00554         public BaseColumnFilter
00555 {
00556     ColumnSum( int _ksize, int _anchor, double _scale ) :
00557         BaseColumnFilter()
00558     {
00559         ksize = _ksize;
00560         anchor = _anchor;
00561         scale = _scale;
00562         sumCount = 0;
00563     }
00564 
00565     virtual void reset() { sumCount = 0; }
00566 
00567     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00568     {
00569         int i;
00570         int* SUM;
00571         bool haveScale = scale != 1;
00572         double _scale = scale;
00573         #if CV_SSE2
00574                 bool haveSSE2 =  checkHardwareSupport(CV_CPU_SSE2);
00575         #endif
00576 
00577         if( width != (int)sum.size() )
00578         {
00579             sum.resize(width);
00580             sumCount = 0;
00581         }
00582         SUM = &sum[0];
00583         if( sumCount == 0 )
00584         {
00585             memset((void*)SUM, 0, width*sizeof(int));
00586             for( ; sumCount < ksize - 1; sumCount++, src++ )
00587             {
00588                 const int* Sp = (const int*)src[0];
00589                 i = 0;
00590                 #if CV_SSE2
00591                 if(haveSSE2)
00592                 {
00593                     for( ; i < width-4; i+=4 )
00594                     {
00595                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00596                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00597                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
00598                     }
00599                 }
00600                 #elif CV_NEON
00601                 for( ; i <= width - 4; i+=4 )
00602                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00603                 #endif
00604                 for( ; i < width; i++ )
00605                     SUM[i] += Sp[i];
00606             }
00607         }
00608         else
00609         {
00610             CV_Assert( sumCount == ksize-1 );
00611             src += ksize-1;
00612         }
00613 
00614         for( ; count--; src++ )
00615         {
00616             const int* Sp = (const int*)src[0];
00617             const int* Sm = (const int*)src[1-ksize];
00618             ushort* D = (ushort*)dst;
00619             if( haveScale )
00620             {
00621                 i = 0;
00622                 #if CV_SSE2
00623                 if(haveSSE2)
00624                 {
00625                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00626                     const __m128i delta0 = _mm_set1_epi32(0x8000);
00627                     const __m128i delta1 = _mm_set1_epi32(0x80008000);
00628 
00629                     for( ; i < width-4; i+=4)
00630                     {
00631                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00632                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00633                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00634 
00635                         __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00636 
00637                         _res = _mm_sub_epi32(_res, delta0);
00638                         _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
00639 
00640                         _mm_storel_epi64((__m128i*)(D+i), _res);
00641                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00642                     }
00643                 }
00644                 #elif CV_NEON
00645                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00646                 for( ; i <= width-8; i+=8 )
00647                 {
00648                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00649                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00650 
00651                     uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00652                     uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00653                     vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
00654 
00655                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00656                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00657                 }
00658                 #endif
00659                 for( ; i < width; i++ )
00660                 {
00661                     int s0 = SUM[i] + Sp[i];
00662                     D[i] = saturate_cast<ushort>(s0*_scale);
00663                     SUM[i] = s0 - Sm[i];
00664                 }
00665             }
00666             else
00667             {
00668                 i = 0;
00669                 #if  CV_SSE2
00670                 if(haveSSE2)
00671                 {
00672                     const __m128i delta0 = _mm_set1_epi32(0x8000);
00673                     const __m128i delta1 = _mm_set1_epi32(0x80008000);
00674 
00675                     for( ; i < width-4; i+=4 )
00676                     {
00677                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00678                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00679                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00680 
00681                         __m128i _res = _mm_sub_epi32(_s0, delta0);
00682                         _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
00683 
00684                         _mm_storel_epi64((__m128i*)(D+i), _res);
00685                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00686                     }
00687                 }
00688                 #elif CV_NEON
00689                 for( ; i <= width-8; i+=8 )
00690                 {
00691                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00692                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00693 
00694                     vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
00695 
00696                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00697                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00698                 }
00699                 #endif
00700 
00701                 for( ; i < width; i++ )
00702                 {
00703                     int s0 = SUM[i] + Sp[i];
00704                     D[i] = saturate_cast<ushort>(s0);
00705                     SUM[i] = s0 - Sm[i];
00706                 }
00707             }
00708             dst += dststep;
00709         }
00710     }
00711 
00712     double scale;
00713     int sumCount;
00714     std::vector<int> sum;
00715 };
00716 
00717 template<>
00718 struct ColumnSum<int, int> :
00719         public BaseColumnFilter
00720 {
00721     ColumnSum( int _ksize, int _anchor, double _scale ) :
00722         BaseColumnFilter()
00723     {
00724         ksize = _ksize;
00725         anchor = _anchor;
00726         scale = _scale;
00727         sumCount = 0;
00728     }
00729 
00730     virtual void reset() { sumCount = 0; }
00731 
00732     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00733     {
00734         int i;
00735         int* SUM;
00736         bool haveScale = scale != 1;
00737         double _scale = scale;
00738 
00739         #if CV_SSE2
00740             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
00741         #endif
00742 
00743         if( width != (int)sum.size() )
00744         {
00745             sum.resize(width);
00746             sumCount = 0;
00747         }
00748         SUM = &sum[0];
00749         if( sumCount == 0 )
00750         {
00751             memset((void*)SUM, 0, width*sizeof(int));
00752             for( ; sumCount < ksize - 1; sumCount++, src++ )
00753             {
00754                 const int* Sp = (const int*)src[0];
00755                 i = 0;
00756                 #if CV_SSE2
00757                 if(haveSSE2)
00758                 {
00759                     for( ; i <= width-4; i+=4 )
00760                     {
00761                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00762                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00763                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
00764                     }
00765                 }
00766                 #elif CV_NEON
00767                 for( ; i <= width - 4; i+=4 )
00768                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00769                 #endif
00770                 for( ; i < width; i++ )
00771                     SUM[i] += Sp[i];
00772             }
00773         }
00774         else
00775         {
00776             CV_Assert( sumCount == ksize-1 );
00777             src += ksize-1;
00778         }
00779 
00780         for( ; count--; src++ )
00781         {
00782             const int* Sp = (const int*)src[0];
00783             const int* Sm = (const int*)src[1-ksize];
00784             int* D = (int*)dst;
00785             if( haveScale )
00786             {
00787                 i = 0;
00788                 #if CV_SSE2
00789                 if(haveSSE2)
00790                 {
00791                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00792                     for( ; i <= width-4; i+=4 )
00793                     {
00794                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00795 
00796                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00797                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00798 
00799                         __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00800 
00801                         _mm_storeu_si128((__m128i*)(D+i), _s0T);
00802                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
00803                     }
00804                 }
00805                 #elif CV_NEON
00806                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00807                 for( ; i <= width-4; i+=4 )
00808                 {
00809                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00810 
00811                     int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00812                     vst1q_s32(D + i, v_s0d);
00813 
00814                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00815                 }
00816                 #endif
00817                 for( ; i < width; i++ )
00818                 {
00819                     int s0 = SUM[i] + Sp[i];
00820                     D[i] = saturate_cast<int>(s0*_scale);
00821                     SUM[i] = s0 - Sm[i];
00822                 }
00823             }
00824             else
00825             {
00826                 i = 0;
00827                 #if CV_SSE2
00828                 if(haveSSE2)
00829                 {
00830                     for( ; i <= width-4; i+=4 )
00831                     {
00832                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00833                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00834                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00835 
00836                         _mm_storeu_si128((__m128i*)(D+i), _s0);
00837                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00838                     }
00839                 }
00840                 #elif CV_NEON
00841                 for( ; i <= width-4; i+=4 )
00842                 {
00843                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00844 
00845                     vst1q_s32(D + i, v_s0);
00846                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00847                 }
00848                 #endif
00849 
00850                 for( ; i < width; i++ )
00851                 {
00852                     int s0 = SUM[i] + Sp[i];
00853                     D[i] = s0;
00854                     SUM[i] = s0 - Sm[i];
00855                 }
00856             }
00857             dst += dststep;
00858         }
00859     }
00860 
00861     double scale;
00862     int sumCount;
00863     std::vector<int> sum;
00864 };
00865 
00866 
00867 template<>
00868 struct ColumnSum<int, float> :
00869         public BaseColumnFilter
00870 {
00871     ColumnSum( int _ksize, int _anchor, double _scale ) :
00872         BaseColumnFilter()
00873     {
00874         ksize = _ksize;
00875         anchor = _anchor;
00876         scale = _scale;
00877         sumCount = 0;
00878     }
00879 
00880     virtual void reset() { sumCount = 0; }
00881 
00882     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00883     {
00884         int i;
00885         int* SUM;
00886         bool haveScale = scale != 1;
00887         double _scale = scale;
00888 
00889         #if CV_SSE2
00890         bool haveSSE2 =  checkHardwareSupport(CV_CPU_SSE2);
00891         #endif
00892 
00893         if( width != (int)sum.size() )
00894         {
00895             sum.resize(width);
00896             sumCount = 0;
00897         }
00898 
00899         SUM = &sum[0];
00900         if( sumCount == 0 )
00901         {
00902             memset((void *)SUM, 0, sizeof(int) * width);
00903 
00904             for( ; sumCount < ksize - 1; sumCount++, src++ )
00905             {
00906                 const int* Sp = (const int*)src[0];
00907                 i = 0;
00908 
00909                 #if CV_SSE2
00910                 if(haveSSE2)
00911                 {
00912                     for( ; i < width-4; i+=4 )
00913                     {
00914                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00915                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00916                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
00917                     }
00918                 }
00919                 #elif CV_NEON
00920                 for( ; i <= width - 4; i+=4 )
00921                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00922                 #endif
00923 
00924                 for( ; i < width; i++ )
00925                     SUM[i] += Sp[i];
00926             }
00927         }
00928         else
00929         {
00930             CV_Assert( sumCount == ksize-1 );
00931             src += ksize-1;
00932         }
00933 
00934         for( ; count--; src++ )
00935         {
00936             const int * Sp = (const int*)src[0];
00937             const int * Sm = (const int*)src[1-ksize];
00938             float* D = (float*)dst;
00939             if( haveScale )
00940             {
00941                 i = 0;
00942 
00943                 #if CV_SSE2
00944                 if(haveSSE2)
00945                 {
00946                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00947 
00948                     for( ; i < width-4; i+=4)
00949                     {
00950                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00951                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00952                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00953 
00954                         _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00955                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00956                     }
00957                 }
00958                 #elif CV_NEON
00959                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00960                 for( ; i <= width-8; i+=8 )
00961                 {
00962                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00963                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00964 
00965                     vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00966                     vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00967 
00968                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00969                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00970                 }
00971                 #endif
00972 
00973                 for( ; i < width; i++ )
00974                 {
00975                     int s0 = SUM[i] + Sp[i];
00976                     D[i] = (float)(s0*_scale);
00977                     SUM[i] = s0 - Sm[i];
00978                 }
00979             }
00980             else
00981             {
00982                 i = 0;
00983 
00984                 #if CV_SSE2
00985                 if(haveSSE2)
00986                 {
00987                     for( ; i < width-4; i+=4)
00988                     {
00989                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00990                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00991                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00992 
00993                         _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
00994                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00995                     }
00996                 }
00997                 #elif CV_NEON
00998                 for( ; i <= width-8; i+=8 )
00999                 {
01000                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
01001                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
01002 
01003                     vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
01004                     vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
01005 
01006                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
01007                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
01008                 }
01009                 #endif
01010 
01011                 for( ; i < width; i++ )
01012                 {
01013                     int s0 = SUM[i] + Sp[i];
01014                     D[i] = (float)(s0);
01015                     SUM[i] = s0 - Sm[i];
01016                 }
01017             }
01018             dst += dststep;
01019         }
01020     }
01021 
01022     double scale;
01023     int sumCount;
01024     std::vector<int> sum;
01025 };
01026 
01027 #ifdef HAVE_OPENCL
01028 
01029 #define DIVUP(total, grain) ((total + grain - 1) / (grain))
01030 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
01031 
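// DIVUP(total, grain) is ceiling division; ROUNDUP(sz, n) rounds sz up to the
// next multiple of n. Both are used below to derive OpenCL global work sizes.
//
// ocl_boxFilter builds and runs an OpenCL version of the box filter: either a
// specialized "filterSmall" kernel for small kernels on Intel GPUs, or the
// general block-based "boxFilter" kernel, whose work-group size is tuned in a
// retry loop until it fits the device limits.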
01032 static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
01033                            Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
01034 {
01035     const ocl::Device & dev = ocl::Device::getDefault();
01036     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
01037     bool doubleSupport = dev.doubleFPConfig() > 0;
01038 
01039     if (ddepth < 0)
01040         ddepth = sdepth;
01041 
01042     if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
01043         _src.offset() % esz != 0 || _src.step() % esz != 0)
01044         return false;
01045 
01046     if (anchor.x < 0)
01047         anchor.x = ksize.width / 2;
01048     if (anchor.y < 0)
01049         anchor.y = ksize.height / 2;
01050 
01051     int computeUnits = ocl::Device::getDefault().maxComputeUnits();
01052     float alpha = 1.0f / (ksize.height * ksize.width);
01053     Size size = _src.size(), wholeSize;
01054     bool isolated = (borderType & BORDER_ISOLATED) != 0;
01055     borderType &= ~BORDER_ISOLATED;
01056     int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
01057         wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);
01058 
01059     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
01060     size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
01061     size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;
01062 
01063     UMat src = _src.getUMat();
01064     if (!isolated)
01065     {
01066         Point ofs;
01067         src.locateROI(wholeSize, ofs);
01068     }
01069 
01070     int h = isolated ? size.height : wholeSize.height;
01071     int w = isolated ? size.width : wholeSize.width;
01072 
01073     size_t maxWorkItemSizes[32];
01074     ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
01075     int tryWorkItems = (int)maxWorkItemSizes[0];
01076 
01077     ocl::Kernel kernel;
01078 
01079     if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
01080         ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
01081          (ksize.width == 5 && ksize.height == 5 && cn == 1)))
01082     {
01083         if (w < ksize.width || h < ksize.height)
01084             return false;
01085 
01086         // Figure out what vector size to use for loading the pixels.
01087         int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
01088         int pxLoadVecSize = cn * pxLoadNumPixels;
01089 
01090         // Figure out how many pixels per work item to compute in X and Y
01091         // directions.  Too many and we run out of registers.
01092         int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
01093         if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
01094         {
01095             pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
01096             pxPerWorkItemY = size.height % 2 ? 1 : 2;
01097         }
01098         else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
01099         {
01100             pxPerWorkItemX = size.width % 2 ? 1 : 2;
01101             pxPerWorkItemY = size.height % 2 ? 1 : 2;
01102         }
01103         globalsize[0] = size.width / pxPerWorkItemX;
01104         globalsize[1] = size.height / pxPerWorkItemY;
01105 
01106         // Need some padding in the private array for pixels
01107         int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
01108 
01109         // Make the global size a nice round number so the runtime can pick
01110         // from reasonable choices for the workgroup size
01111         const int wgRound = 256;
01112         globalsize[0] = ROUNDUP(globalsize[0], wgRound);
01113 
01114         char build_options[1024], cvt[2][40];
01115         sprintf(build_options, "-D cn=%d "
01116                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
01117                 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
01118                 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
01119                 "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
01120                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
01121                 "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
01122                 cn, anchor.x, anchor.y, ksize.width, ksize.height,
01123                 pxLoadVecSize, pxLoadNumPixels,
01124                 pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
01125                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
01126                 privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
01127                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
01128                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
01129                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
01130                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
01131                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
01132                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
01133                 );
01134 
01135 
01136         if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
01137             return false;
01138     }
01139     else
01140     {
01141         localsize = localsize_general;
01142         for ( ; ; )
01143         {
01144             int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
01145 
01146             while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
01147                 BLOCK_SIZE_X /= 2;
01148             while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
01149                 BLOCK_SIZE_Y *= 2;
01150 
01151             if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
01152                 return false;
01153 
01154             char cvt[2][50];
01155             String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
01156                                  " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
01157                                  " -D ST1=%s -D DT1=%s -D cn=%d",
01158                                  BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
01159                                  ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
01160                                  ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
01161                                  ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
01162                                  anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
01163                                  isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
01164                                  normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
01165                                  ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
01166 
01167             localsize[0] = BLOCK_SIZE_X;
01168             globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
01169             globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);
01170 
01171             kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
01172             if (kernel.empty())
01173                 return false;
01174 
01175             size_t kernelWorkGroupSize = kernel.workGroupSize();
01176             if (localsize[0] <= kernelWorkGroupSize)
01177                 break;
01178             if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
01179                 return false;
01180 
01181             tryWorkItems = (int)kernelWorkGroupSize;
01182         }
01183     }
01184 
01185     _dst.create(size, CV_MAKETYPE(ddepth, cn));
01186     UMat dst = _dst.getUMat();
01187 
01188     int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
01189     idxArg = kernel.set(idxArg, (int)src.step);
01190     int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
01191     int srcOffsetY = (int)(src.offset / src.step);
01192     int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
01193     int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
01194     idxArg = kernel.set(idxArg, srcOffsetX);
01195     idxArg = kernel.set(idxArg, srcOffsetY);
01196     idxArg = kernel.set(idxArg, srcEndX);
01197     idxArg = kernel.set(idxArg, srcEndY);
01198     idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
01199     if (normalize)
01200         idxArg = kernel.set(idxArg, (float)alpha);
01201 
01202     return kernel.run(2, globalsize, localsize, false);
01203 }
01204 
01205 #undef ROUNDUP
01206 
01207 #endif
01208 
01209 }
01210 
01211 
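/*
 * The two factory functions below map a (source depth, sum depth) or
 * (sum depth, destination depth) pair onto the matching RowSum/ColumnSum
 * template instantiation, defaulting the anchor to the kernel centre and
 * raising CV_StsNotImplemented for unsupported combinations.
 */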
01212 cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
01213 {
01214     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
01215     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
01216 
01217     if( anchor < 0 )
01218         anchor = ksize/2;
01219 
01220     if( sdepth == CV_8U && ddepth == CV_32S )
01221         return makePtr<RowSum<uchar, int> >(ksize, anchor);
01222     if( sdepth == CV_8U && ddepth == CV_64F )
01223         return makePtr<RowSum<uchar, double> >(ksize, anchor);
01224     if( sdepth == CV_16U && ddepth == CV_32S )
01225         return makePtr<RowSum<ushort, int> >(ksize, anchor);
01226     if( sdepth == CV_16U && ddepth == CV_64F )
01227         return makePtr<RowSum<ushort, double> >(ksize, anchor);
01228     if( sdepth == CV_16S && ddepth == CV_32S )
01229         return makePtr<RowSum<short, int> >(ksize, anchor);
01230     if( sdepth == CV_32S && ddepth == CV_32S )
01231         return makePtr<RowSum<int, int> >(ksize, anchor);
01232     if( sdepth == CV_16S && ddepth == CV_64F )
01233         return makePtr<RowSum<short, double> >(ksize, anchor);
01234     if( sdepth == CV_32F && ddepth == CV_64F )
01235         return makePtr<RowSum<float, double> >(ksize, anchor);
01236     if( sdepth == CV_64F && ddepth == CV_64F )
01237         return makePtr<RowSum<double, double> >(ksize, anchor);
01238 
01239     CV_Error_( CV_StsNotImplemented,
01240         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
01241         srcType, sumType));
01242 
01243     return Ptr<BaseRowFilter> ();
01244 }
01245 
01246 
01247 cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
01248                                                      int anchor, double scale)
01249 {
01250     int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
01251     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
01252 
01253     if( anchor < 0 )
01254         anchor = ksize/2;
01255 
01256     if( ddepth == CV_8U && sdepth == CV_32S )
01257         return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
01258     if( ddepth == CV_8U && sdepth == CV_64F )
01259         return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
01260     if( ddepth == CV_16U && sdepth == CV_32S )
01261         return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
01262     if( ddepth == CV_16U && sdepth == CV_64F )
01263         return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
01264     if( ddepth == CV_16S && sdepth == CV_32S )
01265         return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
01266     if( ddepth == CV_16S && sdepth == CV_64F )
01267         return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
01268     if( ddepth == CV_32S && sdepth == CV_32S )
01269         return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
01270     if( ddepth == CV_32F && sdepth == CV_32S )
01271         return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
01272     if( ddepth == CV_32F && sdepth == CV_64F )
01273         return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
01274     if( ddepth == CV_64F && sdepth == CV_32S )
01275         return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
01276     if( ddepth == CV_64F && sdepth == CV_64F )
01277         return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
01278 
01279     CV_Error_( CV_StsNotImplemented,
01280         ("Unsupported combination of sum format (=%d), and destination format (=%d)",
01281         sumType, dstType));
01282 
01283     return Ptr<BaseColumnFilter> ();
01284 }
01285 
01286 
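/*
 * createBoxFilter selects the intermediate sum type: for integer sources,
 * 32-bit integer sums are used when the filter is unnormalized or the kernel
 * area stays below a per-depth overflow limit (2^23 for 8-bit, 2^15 for
 * unsigned 16-bit, 2^16 otherwise); everything else accumulates in CV_64F.
 */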
01287 cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
01288                     Point anchor, bool normalize, int borderType )
01289 {
01290     int sdepth = CV_MAT_DEPTH(srcType);
01291     int cn = CV_MAT_CN(srcType), sumType = CV_64F;
01292     if( sdepth <= CV_32S && (!normalize ||
01293         ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
01294             sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
01295         sumType = CV_32S;
01296     sumType = CV_MAKETYPE( sumType, cn );
01297 
01298     Ptr<BaseRowFilter>  rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
01299     Ptr<BaseColumnFilter>  columnFilter = getColumnSumFilter(sumType,
01300         dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
01301 
01302     return makePtr<FilterEngine>(Ptr<BaseFilter> (), rowFilter, columnFilter,
01303            srcType, dstType, sumType, borderType );
01304 }
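/*
 * Typical call path (illustrative note, not part of the upstream file): the
 * public cv::blur() and cv::boxFilter() entry points construct this engine on
 * the CPU fallback path, e.g.
 *
 *   cv::blur(src, dst, cv::Size(5, 5));                         // normalized box filter
 *   cv::boxFilter(src, dst, -1, cv::Size(5, 5),
 *                 cv::Point(-1, -1), false);                    // unnormalized sums
 */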
01305 
01306 #if defined(HAVE_IPP)
01307 namespace cv
01308 {
01309 static bool ipp_boxfilter( InputArray _src, OutputArray _dst, int ddepth,
01310                 Size ksize, Point anchor,
01311                 bool normalize, int borderType )
01312 {
01313     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
01314     if( ddepth < 0 )
01315         ddepth = sdepth;
01316     int ippBorderType = borderType & ~BORDER_ISOLATED;
01317     Point ocvAnchor, ippAnchor;
01318     ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
01319     ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
01320     ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
01321     ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
01322 
01323     Mat src = _src.getMat();
01324     _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
01325     Mat dst = _dst.getMat();
01326     if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
01327     {
01328         if( src.rows == 1 )
01329             ksize.height = 1;
01330         if( src.cols == 1 )
01331             ksize.width = 1;
01332     }
01333 
01334     {
01335         if (normalize && !src.isSubmatrix() && ddepth == sdepth &&
01336             (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
01337              ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
01338              dst.cols != ksize.width && dst.rows != ksize.height) // returns ippStsMaskSizeErr: mask has an illegal value
01339         {
01340             Ipp32s bufSize = 0;
01341             IppiSize roiSize = { dst.cols, dst.rows }, maskSize = { ksize.width, ksize.height };
01342 
01343 #define IPP_FILTER_BOX_BORDER(ippType, ippDataType, flavor) \
01344             do \
01345             { \
01346                 if (ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ippDataType, cn, &bufSize) >= 0) \
01347                 { \
01348                     Ipp8u * buffer = ippsMalloc_8u(bufSize); \
01349                     ippType borderValue[4] = { 0, 0, 0, 0 }; \
01350                     ippBorderType = ippBorderType == BORDER_CONSTANT ? ippBorderConst : ippBorderRepl; \
01351                     IppStatus status = ippiFilterBoxBorder_##flavor(src.ptr<ippType>(), (int)src.step, dst.ptr<ippType>(), \
01352                                                                     (int)dst.step, roiSize, maskSize, \
01353                                                                     (IppiBorderType)ippBorderType, borderValue, buffer); \
01354                     ippsFree(buffer); \
01355                     if (status >= 0) \
01356                     { \
01357                         CV_IMPL_ADD(CV_IMPL_IPP); \
01358                         return true; \
01359                     } \
01360                 } \
01361             } while ((void)0, 0)
01362 
01363             if (stype == CV_8UC1)
01364                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C1R);
01365             else if (stype == CV_8UC3)
01366                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C3R);
01367             else if (stype == CV_8UC4)
01368                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C4R);
01369 
01370             // Oct 2014: performance with BORDER_CONSTANT
01371             //else if (stype == CV_16UC1)
01372             //    IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C1R);
01373             else if (stype == CV_16UC3)
01374                 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C3R);
01375             else if (stype == CV_16UC4)
01376                 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C4R);
01377 
01378             // Oct 2014: performance with BORDER_CONSTANT
01379             //else if (stype == CV_16SC1)
01380             //    IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C1R);
01381             else if (stype == CV_16SC3)
01382                 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C3R);
01383             else if (stype == CV_16SC4)
01384                 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C4R);
01385 
01386             else if (stype == CV_32FC1)
01387                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C1R);
01388             else if (stype == CV_32FC3)
01389                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C3R);
01390             else if (stype == CV_32FC4)
01391                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C4R);
01392         }
01393 #undef IPP_FILTER_BOX_BORDER
01394     }
01395     return false;
01396 }
01397 }
01398 #endif
01399 
01400 
01401 void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
01402                 Size ksize, Point anchor,
01403                 bool normalize, int borderType )
01404 {
01405 #ifdef HAVE_OPENCL
01406     CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
01407 #endif
01408 
01409     Mat src = _src.getMat();
01410     int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
01411     if( ddepth < 0 )
01412         ddepth = sdepth;
01413     _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
01414     Mat dst = _dst.getMat();
01415     if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
01416     {
01417         if( src.rows == 1 )
01418             ksize.height = 1;
01419         if( src.cols == 1 )
01420             ksize.width = 1;
01421     }
01422 #ifdef HAVE_TEGRA_OPTIMIZATION
01423     if ( tegra::useTegra() && tegra::box(src, dst, ksize, anchor, normalize, borderType) )
01424         return;
01425 #endif
01426 
01427 #ifdef HAVE_IPP
01428     int ippBorderType = borderType & ~BORDER_ISOLATED;
01429 #endif
01430     Point ocvAnchor, ippAnchor;
01431     ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
01432     ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
01433     ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
01434     ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
01435     CV_IPP_RUN((normalize && !_src.isSubmatrix() && ddepth == sdepth &&
01436             (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
01437              ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
01438              _dst.cols() != ksize.width && _dst.rows() != ksize.height),
01439              ipp_boxfilter( _src,  _dst,  ddepth, ksize,  anchor, normalize,  borderType));
01440 
01441 
01442     Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
01443                         ksize, anchor, normalize, borderType );
01444     f->apply( src, dst );
01445 }
01446 
01447 
01448 void cv::blur( InputArray src, OutputArray dst,
01449            Size ksize, Point anchor, int borderType )
01450 {
01451     boxFilter( src, dst, -1, ksize, anchor, true, borderType );
01452 }
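// --- Editor's illustrative example, not part of the original smooth.cpp ---
// cv::blur() above is just the normalized box filter: each output pixel is the
// mean of the ksize.width x ksize.height window around it, while cv::boxFilter()
// with normalize == false returns the raw window sum.  A minimal usage sketch,
// assuming an 8-bit input; exampleBoxBlur is a hypothetical name.
static void exampleBoxBlur(const cv::Mat& src8u)
{
    cv::Mat mean8u, sum32s;
    // 5x5 mean with the default centered anchor and a reflected border.
    cv::blur(src8u, mean8u, cv::Size(5, 5), cv::Point(-1, -1), cv::BORDER_REFLECT_101);
    // The same window as an unnormalized sum, accumulated in 32-bit integers.
    cv::boxFilter(src8u, sum32s, CV_32S, cv::Size(5, 5), cv::Point(-1, -1),
                  false, cv::BORDER_REFLECT_101);
}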
01453 
01454 
01455 /****************************************************************************************\
01456                                     Squared Box Filter
01457 \****************************************************************************************/
01458 
01459 namespace cv
01460 {
01461 
01462 template<typename T, typename ST>
01463 struct SqrRowSum :
01464         public BaseRowFilter
01465 {
01466     SqrRowSum( int _ksize, int _anchor ) :
01467         BaseRowFilter()
01468     {
01469         ksize = _ksize;
01470         anchor = _anchor;
01471     }
01472 
01473     virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
01474     {
01475         const T* S = (const T*)src;
01476         ST* D = (ST*)dst;
01477         int i = 0, k, ksz_cn = ksize*cn;
01478 
01479         width = (width - 1)*cn;
01480         for( k = 0; k < cn; k++, S++, D++ )
01481         {
01482             ST s = 0;
01483             for( i = 0; i < ksz_cn; i += cn )
01484             {
01485                 ST val = (ST)S[i];
01486                 s += val*val;
01487             }
01488             D[0] = s;
01489             for( i = 0; i < width; i += cn )
01490             {
01491                 ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
01492                 s += val1*val1 - val0*val0;
01493                 D[i+cn] = s;
01494             }
01495         }
01496     }
01497 };
01498 
01499 static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
01500 {
01501     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
01502     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
01503 
01504     if( anchor < 0 )
01505         anchor = ksize/2;
01506 
01507     if( sdepth == CV_8U && ddepth == CV_32S )
01508         return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
01509     if( sdepth == CV_8U && ddepth == CV_64F )
01510         return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
01511     if( sdepth == CV_16U && ddepth == CV_64F )
01512         return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
01513     if( sdepth == CV_16S && ddepth == CV_64F )
01514         return makePtr<SqrRowSum<short, double> >(ksize, anchor);
01515     if( sdepth == CV_32F && ddepth == CV_64F )
01516         return makePtr<SqrRowSum<float, double> >(ksize, anchor);
01517     if( sdepth == CV_64F && ddepth == CV_64F )
01518         return makePtr<SqrRowSum<double, double> >(ksize, anchor);
01519 
01520     CV_Error_( CV_StsNotImplemented,
01521               ("Unsupported combination of source format (=%d), and buffer format (=%d)",
01522                srcType, sumType));
01523 
01524     return Ptr<BaseRowFilter>();
01525 }
01526 
01527 }
01528 
01529 void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
01530                        Size ksize, Point anchor,
01531                        bool normalize, int borderType )
01532 {
01533     int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
01534     Size size = _src.size();
01535 
01536     if( ddepth < 0 )
01537         ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
01538 
01539     if( borderType != BORDER_CONSTANT && normalize )
01540     {
01541         if( size.height == 1 )
01542             ksize.height = 1;
01543         if( size.width == 1 )
01544             ksize.width = 1;
01545     }
01546 
01547 #ifdef HAVE_OPENCL
01548     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
01549                ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
01550 #endif
01551 
01552     int sumDepth = CV_64F;
01553     if( sdepth == CV_8U )
01554         sumDepth = CV_32S;
01555     int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
01556 
01557     Mat src = _src.getMat();
01558     _dst.create( size, dstType );
01559     Mat dst = _dst.getMat();
01560 
01561     Ptr<BaseRowFilter>  rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
01562     Ptr<BaseColumnFilter>  columnFilter = getColumnSumFilter(sumType,
01563                                                             dstType, ksize.height, anchor.y,
01564                                                             normalize ? 1./(ksize.width*ksize.height) : 1);
01565 
01566     Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter> (), rowFilter, columnFilter,
01567                                                 srcType, dstType, sumType, borderType );
01568     f->apply( src, dst );
01569 }
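// --- Editor's illustrative example, not part of the original smooth.cpp ---
// A common use of sqrBoxFilter() is a local variance map, Var = E[x^2] - E[x]^2
// over each window.  A minimal sketch assuming a single-channel CV_32F input;
// exampleLocalVariance is a hypothetical name.
static cv::Mat exampleLocalVariance(const cv::Mat& src32f, cv::Size win)
{
    cv::Mat mean, meanOfSquares;
    cv::boxFilter(src32f, mean, CV_32F, win);             // E[x]
    cv::sqrBoxFilter(src32f, meanOfSquares, CV_32F, win); // E[x^2]
    return meanOfSquares - mean.mul(mean);                // per-pixel variance
}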
01570 
01571 
01572 /****************************************************************************************\
01573                                      Gaussian Blur
01574 \****************************************************************************************/
01575 
01576 cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
01577 {
01578     const int SMALL_GAUSSIAN_SIZE = 7;
01579     static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
01580     {
01581         {1.f},
01582         {0.25f, 0.5f, 0.25f},
01583         {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
01584         {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
01585     };
01586 
01587     const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
01588         small_gaussian_tab[n>>1] : 0;
01589 
01590     CV_Assert( ktype == CV_32F || ktype == CV_64F );
01591     Mat kernel(n, 1, ktype);
01592     float* cf = kernel.ptr<float>();
01593     double* cd = kernel.ptr<double>();
01594 
01595     double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8;
01596     double scale2X = -0.5/(sigmaX*sigmaX);
01597     double sum = 0;
01598 
01599     int i;
01600     for( i = 0; i < n; i++ )
01601     {
01602         double x = i - (n-1)*0.5;
01603         double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
01604         if( ktype == CV_32F )
01605         {
01606             cf[i] = (float)t;
01607             sum += cf[i];
01608         }
01609         else
01610         {
01611             cd[i] = t;
01612             sum += cd[i];
01613         }
01614     }
01615 
01616     sum = 1./sum;
01617     for( i = 0; i < n; i++ )
01618     {
01619         if( ktype == CV_32F )
01620             cf[i] = (float)(cf[i]*sum);
01621         else
01622             cd[i] *= sum;
01623     }
01624 
01625     return kernel;
01626 }
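// --- Editor's illustrative example, not part of the original smooth.cpp ---
// getGaussianKernel() returns an n x 1 column vector of weights.  For odd
// n <= 7 with sigma <= 0 the hard-coded table above is used; otherwise the
// weights are exp(-x^2 / (2*sigma^2)) rescaled so that they sum to 1.
// exampleGaussianKernelSum is a hypothetical name used only for this sketch.
static double exampleGaussianKernelSum()
{
    cv::Mat k = cv::getGaussianKernel(5, 1.0, CV_64F);  // 5x1 kernel, CV_64F
    return cv::sum(k)[0];                                // ~1.0 by construction
}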
01627 
01628 namespace cv {
01629 
01630 static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize,
01631                                    double sigma1, double sigma2 )
01632 {
01633     int depth = CV_MAT_DEPTH(type);
01634     if( sigma2 <= 0 )
01635         sigma2 = sigma1;
01636 
01637     // automatic detection of kernel size from sigma
01638     if( ksize.width <= 0 && sigma1 > 0 )
01639         ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
01640     if( ksize.height <= 0 && sigma2 > 0 )
01641         ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
01642 
01643     CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
01644         ksize.height > 0 && ksize.height % 2 == 1 );
01645 
01646     sigma1 = std::max( sigma1, 0. );
01647     sigma2 = std::max( sigma2, 0. );
01648 
01649     kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
01650     if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
01651         ky = kx;
01652     else
01653         ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
01654 }
01655 
01656 }
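// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// When ksize is not given, createGaussianKernels() above derives it from sigma
// as cvRound(sigma * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1, i.e. roughly +/-3
// standard deviations for 8-bit data and +/-4 for deeper types, forced odd.
// For example, sigma = 1.5 on a CV_8U image gives cvRound(10.0) | 1 = 11, an
// 11-tap kernel.  exampleAutoKernelSize is a hypothetical helper restating
// that arithmetic.
static int exampleAutoKernelSize(double sigma, int depth)
{
    return cvRound(sigma * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}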
01657 
01658 cv::Ptr<cv::FilterEngine> cv::createGaussianFilter( int type, Size ksize,
01659                                         double sigma1, double sigma2,
01660                                         int borderType )
01661 {
01662     Mat kx, ky;
01663     createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
01664 
01665     return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType );
01666 }
01667 
01668 #ifdef HAVE_IPP
01669 namespace cv
01670 {
01671 static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
01672                    double sigma1, double sigma2,
01673                    int borderType )
01674 {
01675 #if IPP_VERSION_X100 >= 810
01676     int type = _src.type();
01677     Size size = _src.size();
01678 
01679     if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
01680     {
01681         if( size.height == 1 )
01682             ksize.height = 1;
01683         if( size.width == 1 )
01684             ksize.width = 1;
01685     }
01686 
01687     int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
01688 
01689     if ((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn == 1 || cn == 3) &&
01690             sigma1 == sigma2 && ksize.width == ksize.height && sigma1 != 0.0 )
01691     {
01692         IppiBorderType ippBorder = ippiGetBorderType(borderType);
01693         if (ippBorderConst == ippBorder || ippBorderRepl == ippBorder)
01694         {
01695             Mat src = _src.getMat(), dst = _dst.getMat();
01696             IppiSize roiSize = { src.cols, src.rows };
01697             IppDataType dataType = ippiGetDataType(depth);
01698             Ipp32s specSize = 0, bufferSize = 0;
01699 
01700             if (ippiFilterGaussianGetBufferSize(roiSize, (Ipp32u)ksize.width, dataType, cn, &specSize, &bufferSize) >= 0)
01701             {
01702                 IppAutoBuffer<IppFilterGaussianSpec> spec(specSize);
01703                 IppAutoBuffer<Ipp8u> buffer(bufferSize);
01704 
01705                 if (ippiFilterGaussianInit(roiSize, (Ipp32u)ksize.width, (Ipp32f)sigma1, ippBorder, dataType, cn, spec, buffer) >= 0)
01706                 {
01707 #define IPP_FILTER_GAUSS_C1(ippfavor) \
01708                     { \
01709                         Ipp##ippfavor borderValues = 0; \
01710                         status = ippiFilterGaussianBorder_##ippfavor##_C1R(src.ptr<Ipp##ippfavor>(), (int)src.step, \
01711                                 dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
01712                     }
01713 
01714 #define IPP_FILTER_GAUSS_CN(ippfavor, ippcn) \
01715                     { \
01716                         Ipp##ippfavor borderValues[] = { 0, 0, 0 }; \
01717                         status = ippiFilterGaussianBorder_##ippfavor##_C##ippcn##R(src.ptr<Ipp##ippfavor>(), (int)src.step, \
01718                                 dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
01719                     }
01720 
01721                     IppStatus status = ippStsErr;
01722 #if !HAVE_ICV
01723 #if IPP_VERSION_X100 > 900 // Buffer overflow may happen in IPP 9.0.0 and less
01724                     if (type == CV_8UC1)
01725                         IPP_FILTER_GAUSS_C1(8u)
01726                     else
01727 #endif
01728                     if (type == CV_8UC3)
01729                         IPP_FILTER_GAUSS_CN(8u, 3)
01730                     else if (type == CV_16UC1)
01731                         IPP_FILTER_GAUSS_C1(16u)
01732                     else if (type == CV_16UC3)
01733                         IPP_FILTER_GAUSS_CN(16u, 3)
01734                     else if (type == CV_16SC1)
01735                         IPP_FILTER_GAUSS_C1(16s)
01736                     else if (type == CV_16SC3)
01737                         IPP_FILTER_GAUSS_CN(16s, 3)
01738                     else if (type == CV_32FC3)
01739                         IPP_FILTER_GAUSS_CN(32f, 3)
01740                     else
01741 #endif
01742                     if (type == CV_32FC1)
01743                         IPP_FILTER_GAUSS_C1(32f)
01744 
01745                     if(status >= 0)
01746                         return true;
01747 
01748 #undef IPP_FILTER_GAUSS_C1
01749 #undef IPP_FILTER_GAUSS_CN
01750                 }
01751             }
01752         }
01753     }
01754 #else
01755     CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
01756 #endif
01757     return false;
01758 }
01759 }
01760 #endif
01761 
01762 
01763 void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
01764                    double sigma1, double sigma2,
01765                    int borderType )
01766 {
01767     int type = _src.type();
01768     Size size = _src.size();
01769     _dst.create( size, type );
01770 
01771     if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
01772     {
01773         if( size.height == 1 )
01774             ksize.height = 1;
01775         if( size.width == 1 )
01776             ksize.width = 1;
01777     }
01778 
01779     if( ksize.width == 1 && ksize.height == 1 )
01780     {
01781         _src.copyTo(_dst);
01782         return;
01783     }
01784 
01785 #ifdef HAVE_TEGRA_OPTIMIZATION
01786     Mat src = _src.getMat();
01787     Mat dst = _dst.getMat();
01788     if(sigma1 == 0 && sigma2 == 0 && tegra::useTegra() && tegra::gaussian(src, dst, ksize, borderType))
01789         return;
01790 #endif
01791 
01792     CV_IPP_RUN(true, ipp_GaussianBlur( _src,  _dst,  ksize, sigma1,  sigma2, borderType));
01793 
01794     Mat kx, ky;
01795     createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
01796     sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
01797 }
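// --- Editor's illustrative example, not part of the original smooth.cpp ---
// Typical calls into the public cv::GaussianBlur() entry point above: either
// the kernel size or sigma may be left to be derived from the other.
// exampleGaussianBlur is a hypothetical name used only for this sketch.
static void exampleGaussianBlur(const cv::Mat& src, cv::Mat& dst)
{
    // Explicit 7x7 kernel; sigma is derived from the kernel size.
    cv::GaussianBlur(src, dst, cv::Size(7, 7), 0);
    // Explicit sigma; the kernel size is derived automatically from Size(0, 0).
    cv::GaussianBlur(src, dst, cv::Size(0, 0), 2.0);
}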
01798 
01799 /****************************************************************************************\
01800                                       Median Filter
01801 \****************************************************************************************/
01802 
01803 namespace cv
01804 {
01805 typedef ushort HT;
01806 
01807 /**
01808  * This structure represents a two-tier histogram. The first tier (known as the
01809  * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
01810  * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
01811  * coarse bucket designated by the 4 MSBs of the fine bucket value.
01812  *
01813  * The structure is aligned on 16 bytes, which is a prerequisite for SIMD
01814  * instructions. Each bucket is 16 bit wide, which means that extra care must be
01815  * taken to prevent overflow.
01816  */
01817 typedef struct
01818 {
01819     HT coarse[16];
01820     HT fine[16][16];
01821 } Histogram;
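// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// Bucket layout of the two-tier histogram above: for a pixel value such as
// 0xA7 (167), the top 4 bits select coarse[0xA] and the low 4 bits select
// fine[0xA][0x7], which is exactly what the HOP macro below does.
// exampleHistogramInsert is a hypothetical helper showing one insertion.
static inline void exampleHistogramInsert(Histogram& h, uchar v)
{
    h.coarse[v >> 4]++;
    h.fine[v >> 4][v & 0xF]++;
}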
01822 
01823 
01824 #if CV_SSE2
01825 #define MEDIAN_HAVE_SIMD 1
01826 
01827 static inline void histogram_add_simd( const HT x[16], HT y[16] )
01828 {
01829     const __m128i* rx = (const __m128i*)x;
01830     __m128i* ry = (__m128i*)y;
01831     __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
01832     __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
01833     _mm_store_si128(ry+0, r0);
01834     _mm_store_si128(ry+1, r1);
01835 }
01836 
01837 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
01838 {
01839     const __m128i* rx = (const __m128i*)x;
01840     __m128i* ry = (__m128i*)y;
01841     __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
01842     __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
01843     _mm_store_si128(ry+0, r0);
01844     _mm_store_si128(ry+1, r1);
01845 }
01846 
01847 #elif CV_NEON
01848 #define MEDIAN_HAVE_SIMD 1
01849 
01850 static inline void histogram_add_simd( const HT x[16], HT y[16] )
01851 {
01852     vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
01853     vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
01854 }
01855 
01856 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
01857 {
01858     vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
01859     vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
01860 }
01861 
01862 #else
01863 #define MEDIAN_HAVE_SIMD 0
01864 #endif
01865 
01866 
01867 static inline void histogram_add( const HT x[16], HT y[16] )
01868 {
01869     int i;
01870     for( i = 0; i < 16; ++i )
01871         y[i] = (HT)(y[i] + x[i]);
01872 }
01873 
01874 static inline void histogram_sub( const HT x[16], HT y[16] )
01875 {
01876     int i;
01877     for( i = 0; i < 16; ++i )
01878         y[i] = (HT)(y[i] - x[i]);
01879 }
01880 
01881 static inline void histogram_muladd( int a, const HT x[16],
01882         HT y[16] )
01883 {
01884     for( int i = 0; i < 16; ++i )
01885         y[i] = (HT)(y[i] + a * x[i]);
01886 }
01887 
01888 static void
01889 medianBlur_8u_O1 ( const Mat& _src, Mat& _dst, int ksize )
01890 {
01891 /**
01892  * HOP is short for Histogram OPeration. This macro makes an operation \a op on
01893  * histogram \a h for pixel value \a x. It takes care of handling both levels.
01894  */
01895 #define HOP(h,x,op) \
01896     h.coarse[x>>4] op, \
01897     *((HT*)h.fine + x) op
01898 
01899 #define COP(c,j,x,op) \
01900     h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
01901     h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op
01902 
01903     int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
01904     size_t sstep = _src.step, dstep = _dst.step;
01905     Histogram CV_DECL_ALIGNED(16) H[4];
01906     HT CV_DECL_ALIGNED(16) luc[4][16];
01907 
01908     int STRIPE_SIZE = std::min( _dst.cols, 512/cn );
01909 
01910     std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
01911     std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
01912     HT* h_coarse = alignPtr(&_h_coarse[0], 16);
01913     HT* h_fine = alignPtr(&_h_fine[0], 16);
01914 #if MEDIAN_HAVE_SIMD
01915     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
01916 #endif
01917 
01918     for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
01919     {
01920         int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
01921         const uchar* src = _src.ptr() + x*cn;
01922         uchar* dst = _dst.ptr() + (x - r)*cn;
01923 
01924         memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
01925         memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );
01926 
01927         // First row initialization
01928         for( c = 0; c < cn; c++ )
01929         {
01930             for( j = 0; j < n; j++ )
01931                 COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );
01932 
01933             for( i = 1; i < r; i++ )
01934             {
01935                 const uchar* p = src + sstep*std::min(i, m-1);
01936                 for ( j = 0; j < n; j++ )
01937                     COP( c, j, p[cn*j+c], ++ );
01938             }
01939         }
01940 
01941         for( i = 0; i < m; i++ )
01942         {
01943             const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
01944             const uchar* p1 = src + sstep * std::min( m-1, i+r );
01945 
01946             memset( H, 0, cn*sizeof(H[0]) );
01947             memset( luc, 0, cn*sizeof(luc[0]) );
01948             for( c = 0; c < cn; c++ )
01949             {
01950                 // Update column histograms for the entire row.
01951                 for( j = 0; j < n; j++ )
01952                 {
01953                     COP( c, j, p0[j*cn + c], -- );
01954                     COP( c, j, p1[j*cn + c], ++ );
01955                 }
01956 
01957                 // First column initialization
01958                 for( k = 0; k < 16; ++k )
01959                     histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );
01960 
01961             #if MEDIAN_HAVE_SIMD
01962                 if( useSIMD )
01963                 {
01964                     for( j = 0; j < 2*r; ++j )
01965                         histogram_add_simd( &h_coarse[16*(n*c+j)], H[c].coarse );
01966 
01967                     for( j = r; j < n-r; j++ )
01968                     {
01969                         int t = 2*r*r + 2*r, b, sum = 0;
01970                         HT* segment;
01971 
01972                         histogram_add_simd( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
01973 
01974                         // Find median at coarse level
01975                         for ( k = 0; k < 16 ; ++k )
01976                         {
01977                             sum += H[c].coarse[k];
01978                             if ( sum > t )
01979                             {
01980                                 sum -= H[c].coarse[k];
01981                                 break;
01982                             }
01983                         }
01984                         assert( k < 16 );
01985 
01986                         /* Update corresponding histogram segment */
01987                         if ( luc[c][k] <= j-r )
01988                         {
01989                             memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
01990                             for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
01991                                 histogram_add_simd( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
01992 
01993                             if ( luc[c][k] < j+r+1 )
01994                             {
01995                                 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
01996                                 luc[c][k] = (HT)(j+r+1);
01997                             }
01998                         }
01999                         else
02000                         {
02001                             for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
02002                             {
02003                                 histogram_sub_simd( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
02004                                 histogram_add_simd( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
02005                             }
02006                         }
02007 
02008                         histogram_sub_simd( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
02009 
02010                         /* Find median in segment */
02011                         segment = H[c].fine[k];
02012                         for ( b = 0; b < 16 ; b++ )
02013                         {
02014                             sum += segment[b];
02015                             if ( sum > t )
02016                             {
02017                                 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
02018                                 break;
02019                             }
02020                         }
02021                         assert( b < 16 );
02022                     }
02023                 }
02024                 else
02025             #endif
02026                 {
02027                     for( j = 0; j < 2*r; ++j )
02028                         histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );
02029 
02030                     for( j = r; j < n-r; j++ )
02031                     {
02032                         int t = 2*r*r + 2*r, b, sum = 0;
02033                         HT* segment;
02034 
02035                         histogram_add( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
02036 
02037                         // Find median at coarse level
02038                         for ( k = 0; k < 16 ; ++k )
02039                         {
02040                             sum += H[c].coarse[k];
02041                             if ( sum > t )
02042                             {
02043                                 sum -= H[c].coarse[k];
02044                                 break;
02045                             }
02046                         }
02047                         assert( k < 16 );
02048 
02049                         /* Update corresponding histogram segment */
02050                         if ( luc[c][k] <= j-r )
02051                         {
02052                             memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
02053                             for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
02054                                 histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
02055 
02056                             if ( luc[c][k] < j+r+1 )
02057                             {
02058                                 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
02059                                 luc[c][k] = (HT)(j+r+1);
02060                             }
02061                         }
02062                         else
02063                         {
02064                             for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
02065                             {
02066                                 histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
02067                                 histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
02068                             }
02069                         }
02070 
02071                         histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
02072 
02073                         /* Find median in segment */
02074                         segment = H[c].fine[k];
02075                         for ( b = 0; b < 16 ; b++ )
02076                         {
02077                             sum += segment[b];
02078                             if ( sum > t )
02079                             {
02080                                 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
02081                                 break;
02082                             }
02083                         }
02084                         assert( b < 16 );
02085                     }
02086                 }
02087             }
02088         }
02089     }
02090 
02091 #undef HOP
02092 #undef COP
02093 }
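// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// In medianBlur_8u_O1() above, the median is the first value whose cumulative
// histogram count exceeds t = 2*r*r + 2*r, i.e. ((2r+1)^2 - 1) / 2.  For
// ksize = 5 that is r = 2, a 25-pixel window, and t = 12, so the median is the
// 13th smallest pixel.  exampleMedianRank is a hypothetical helper restating
// that arithmetic.
static inline int exampleMedianRank(int ksize)
{
    int r = (ksize - 1) / 2;
    return 2*r*r + 2*r;              // == ((2*r + 1)*(2*r + 1) - 1) / 2
}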
02094 
02095 static void
02096 medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
02097 {
02098     #define N  16
02099     int     zone0[4][N];
02100     int     zone1[4][N*N];
02101     int     x, y;
02102     int     n2 = m*m/2;
02103     Size    size = _dst.size();
02104     const uchar* src = _src.ptr();
02105     uchar*  dst = _dst.ptr();
02106     int     src_step = (int)_src.step, dst_step = (int)_dst.step;
02107     int     cn = _src.channels();
02108     const uchar*  src_max = src + size.height*src_step;
02109 
02110     #define UPDATE_ACC01( pix, cn, op ) \
02111     {                                   \
02112         int p = (pix);                  \
02113         zone1[cn][p] op;                \
02114         zone0[cn][p >> 4] op;           \
02115     }
02116 
02117     //CV_Assert( size.height >= nx && size.width >= nx );
02118     for( x = 0; x < size.width; x++, src += cn, dst += cn )
02119     {
02120         uchar* dst_cur = dst;
02121         const uchar* src_top = src;
02122         const uchar* src_bottom = src;
02123         int k, c;
02124         int src_step1 = src_step, dst_step1 = dst_step;
02125 
02126         if( x % 2 != 0 )
02127         {
02128             src_bottom = src_top += src_step*(size.height-1);
02129             dst_cur += dst_step*(size.height-1);
02130             src_step1 = -src_step1;
02131             dst_step1 = -dst_step1;
02132         }
02133 
02134         // init accumulator
02135         memset( zone0, 0, sizeof(zone0[0])*cn );
02136         memset( zone1, 0, sizeof(zone1[0])*cn );
02137 
02138         for( y = 0; y <= m/2; y++ )
02139         {
02140             for( c = 0; c < cn; c++ )
02141             {
02142                 if( y > 0 )
02143                 {
02144                     for( k = 0; k < m*cn; k += cn )
02145                         UPDATE_ACC01( src_bottom[k+c], c, ++ );
02146                 }
02147                 else
02148                 {
02149                     for( k = 0; k < m*cn; k += cn )
02150                         UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
02151                 }
02152             }
02153 
02154             if( (src_step1 > 0 && y < size.height-1) ||
02155                 (src_step1 < 0 && size.height-y-1 > 0) )
02156                 src_bottom += src_step1;
02157         }
02158 
02159         for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
02160         {
02161             // find median
02162             for( c = 0; c < cn; c++ )
02163             {
02164                 int s = 0;
02165                 for( k = 0; ; k++ )
02166                 {
02167                     int t = s + zone0[c][k];
02168                     if( t > n2 ) break;
02169                     s = t;
02170                 }
02171 
02172                 for( k *= N; ;k++ )
02173                 {
02174                     s += zone1[c][k];
02175                     if( s > n2 ) break;
02176                 }
02177 
02178                 dst_cur[c] = (uchar)k;
02179             }
02180 
02181             if( y+1 == size.height )
02182                 break;
02183 
02184             if( cn == 1 )
02185             {
02186                 for( k = 0; k < m; k++ )
02187                 {
02188                     int p = src_top[k];
02189                     int q = src_bottom[k];
02190                     zone1[0][p]--;
02191                     zone0[0][p>>4]--;
02192                     zone1[0][q]++;
02193                     zone0[0][q>>4]++;
02194                 }
02195             }
02196             else if( cn == 3 )
02197             {
02198                 for( k = 0; k < m*3; k += 3 )
02199                 {
02200                     UPDATE_ACC01( src_top[k], 0, -- );
02201                     UPDATE_ACC01( src_top[k+1], 1, -- );
02202                     UPDATE_ACC01( src_top[k+2], 2, -- );
02203 
02204                     UPDATE_ACC01( src_bottom[k], 0, ++ );
02205                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
02206                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
02207                 }
02208             }
02209             else
02210             {
02211                 assert( cn == 4 );
02212                 for( k = 0; k < m*4; k += 4 )
02213                 {
02214                     UPDATE_ACC01( src_top[k], 0, -- );
02215                     UPDATE_ACC01( src_top[k+1], 1, -- );
02216                     UPDATE_ACC01( src_top[k+2], 2, -- );
02217                     UPDATE_ACC01( src_top[k+3], 3, -- );
02218 
02219                     UPDATE_ACC01( src_bottom[k], 0, ++ );
02220                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
02221                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
02222                     UPDATE_ACC01( src_bottom[k+3], 3, ++ );
02223                 }
02224             }
02225 
02226             if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
02227                 (src_step1 < 0 && src_bottom + src_step1 >= src) )
02228                 src_bottom += src_step1;
02229 
02230             if( y >= m/2 )
02231                 src_top += src_step1;
02232         }
02233     }
02234 #undef N
02235 #undef UPDATE_ACC01
02236 }
02237 
02238 
02239 struct MinMax8u
02240 {
02241     typedef uchar value_type;
02242     typedef int arg_type;
02243     enum { SIZE = 1 };
02244     arg_type load(const uchar* ptr) { return *ptr; }
02245     void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
02246     void operator()(arg_type& a, arg_type& b) const
02247     {
02248         int t = CV_FAST_CAST_8U(a - b);
02249         b += t; a -= t;
02250     }
02251 };
02252 
02253 struct MinMax16u
02254 {
02255     typedef ushort value_type;
02256     typedef int arg_type;
02257     enum { SIZE = 1 };
02258     arg_type load(const ushort* ptr) { return *ptr; }
02259     void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
02260     void operator()(arg_type& a, arg_type& b) const
02261     {
02262         arg_type t = a;
02263         a = std::min(a, b);
02264         b = std::max(b, t);
02265     }
02266 };
02267 
02268 struct MinMax16s
02269 {
02270     typedef short value_type;
02271     typedef int arg_type;
02272     enum { SIZE = 1 };
02273     arg_type load(const short* ptr) { return *ptr; }
02274     void store(short* ptr, arg_type val) { *ptr = (short)val; }
02275     void operator()(arg_type& a, arg_type& b) const
02276     {
02277         arg_type t = a;
02278         a = std::min(a, b);
02279         b = std::max(b, t);
02280     }
02281 };
02282 
02283 struct MinMax32f
02284 {
02285     typedef float value_type;
02286     typedef float arg_type;
02287     enum { SIZE = 1 };
02288     arg_type load(const float* ptr) { return *ptr; }
02289     void store(float* ptr, arg_type val) { *ptr = val; }
02290     void operator()(arg_type& a, arg_type& b) const
02291     {
02292         arg_type t = a;
02293         a = std::min(a, b);
02294         b = std::max(b, t);
02295     }
02296 };
02297 
02298 #if CV_SSE2
02299 
02300 struct MinMaxVec8u
02301 {
02302     typedef uchar value_type;
02303     typedef __m128i arg_type;
02304     enum { SIZE = 16 };
02305     arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
02306     void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
02307     void operator()(arg_type& a, arg_type& b) const
02308     {
02309         arg_type t = a;
02310         a = _mm_min_epu8(a, b);
02311         b = _mm_max_epu8(b, t);
02312     }
02313 };
02314 
02315 
02316 struct MinMaxVec16u
02317 {
02318     typedef ushort value_type;
02319     typedef __m128i arg_type;
02320     enum { SIZE = 8 };
02321     arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
02322     void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
02323     void operator()(arg_type& a, arg_type& b) const
02324     {
02325         arg_type t = _mm_subs_epu16(a, b);
02326         a = _mm_subs_epu16(a, t);
02327         b = _mm_adds_epu16(b, t);
02328     }
02329 };
02330 
02331 
02332 struct MinMaxVec16s
02333 {
02334     typedef short value_type;
02335     typedef __m128i arg_type;
02336     enum { SIZE = 8 };
02337     arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
02338     void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
02339     void operator()(arg_type& a, arg_type& b) const
02340     {
02341         arg_type t = a;
02342         a = _mm_min_epi16(a, b);
02343         b = _mm_max_epi16(b, t);
02344     }
02345 };
02346 
02347 
02348 struct MinMaxVec32f
02349 {
02350     typedef float value_type;
02351     typedef __m128 arg_type;
02352     enum { SIZE = 4 };
02353     arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
02354     void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
02355     void operator()(arg_type& a, arg_type& b) const
02356     {
02357         arg_type t = a;
02358         a = _mm_min_ps(a, b);
02359         b = _mm_max_ps(b, t);
02360     }
02361 };
02362 
02363 #elif CV_NEON
02364 
02365 struct MinMaxVec8u
02366 {
02367     typedef uchar value_type;
02368     typedef uint8x16_t arg_type;
02369     enum { SIZE = 16 };
02370     arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
02371     void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
02372     void operator()(arg_type& a, arg_type& b) const
02373     {
02374         arg_type t = a;
02375         a = vminq_u8(a, b);
02376         b = vmaxq_u8(b, t);
02377     }
02378 };
02379 
02380 
02381 struct MinMaxVec16u
02382 {
02383     typedef ushort value_type;
02384     typedef uint16x8_t arg_type;
02385     enum { SIZE = 8 };
02386     arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
02387     void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
02388     void operator()(arg_type& a, arg_type& b) const
02389     {
02390         arg_type t = a;
02391         a = vminq_u16(a, b);
02392         b = vmaxq_u16(b, t);
02393     }
02394 };
02395 
02396 
02397 struct MinMaxVec16s
02398 {
02399     typedef short value_type;
02400     typedef int16x8_t arg_type;
02401     enum { SIZE = 8 };
02402     arg_type load(const short* ptr) { return vld1q_s16(ptr); }
02403     void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
02404     void operator()(arg_type& a, arg_type& b) const
02405     {
02406         arg_type t = a;
02407         a = vminq_s16(a, b);
02408         b = vmaxq_s16(b, t);
02409     }
02410 };
02411 
02412 
02413 struct MinMaxVec32f
02414 {
02415     typedef float value_type;
02416     typedef float32x4_t arg_type;
02417     enum { SIZE = 4 };
02418     arg_type load(const float* ptr) { return vld1q_f32(ptr); }
02419     void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
02420     void operator()(arg_type& a, arg_type& b) const
02421     {
02422         arg_type t = a;
02423         a = vminq_f32(a, b);
02424         b = vmaxq_f32(b, t);
02425     }
02426 };
02427 
02428 
02429 #else
02430 
02431 typedef MinMax8u MinMaxVec8u;
02432 typedef MinMax16u MinMaxVec16u;
02433 typedef MinMax16s MinMaxVec16s;
02434 typedef MinMax32f MinMaxVec32f;
02435 
02436 #endif
02437 
02438 template<class Op, class VecOp>
02439 static void
02440 medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
02441 {
02442     typedef typename Op::value_type T;
02443     typedef typename Op::arg_type WT;
02444     typedef typename VecOp::arg_type VT;
02445 
02446     const T* src = _src.ptr<T>();
02447     T* dst = _dst.ptr<T>();
02448     int sstep = (int)(_src.step/sizeof(T));
02449     int dstep = (int)(_dst.step/sizeof(T));
02450     Size size = _dst.size();
02451     int i, j, k, cn = _src.channels();
02452     Op op;
02453     VecOp vop;
02454     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
02455 
02456     if( m == 3 )
02457     {
02458         if( size.width == 1 || size.height == 1 )
02459         {
02460             int len = size.width + size.height - 1;
02461             int sdelta = size.height == 1 ? cn : sstep;
02462             int sdelta0 = size.height == 1 ? 0 : sstep - cn;
02463             int ddelta = size.height == 1 ? cn : dstep;
02464 
02465             for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
02466                 for( j = 0; j < cn; j++, src++ )
02467                 {
02468                     WT p0 = src[i > 0 ? -sdelta : 0];
02469                     WT p1 = src[0];
02470                     WT p2 = src[i < len - 1 ? sdelta : 0];
02471 
02472                     op(p0, p1); op(p1, p2); op(p0, p1);
02473                     dst[j] = (T)p1;
02474                 }
02475             return;
02476         }
02477 
02478         size.width *= cn;
02479         for( i = 0; i < size.height; i++, dst += dstep )
02480         {
02481             const T* row0 = src + std::max(i - 1, 0)*sstep;
02482             const T* row1 = src + i*sstep;
02483             const T* row2 = src + std::min(i + 1, size.height-1)*sstep;
02484             int limit = useSIMD ? cn : size.width;
02485 
02486             for(j = 0;; )
02487             {
02488                 for( ; j < limit; j++ )
02489                 {
02490                     int j0 = j >= cn ? j - cn : j;
02491                     int j2 = j < size.width - cn ? j + cn : j;
02492                     WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
02493                     WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
02494                     WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
02495 
02496                     op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
02497                     op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
02498                     op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
02499                     op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
02500                     op(p4, p2); op(p6, p4); op(p4, p2);
02501                     dst[j] = (T)p4;
02502                 }
02503 
02504                 if( limit == size.width )
02505                     break;
02506 
02507                 for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
02508                 {
02509                     VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
02510                     VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
02511                     VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn);
02512 
02513                     vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
02514                     vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
02515                     vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
02516                     vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
02517                     vop(p4, p2); vop(p6, p4); vop(p4, p2);
02518                     vop.store(dst+j, p4);
02519                 }
02520 
02521                 limit = size.width;
02522             }
02523         }
02524     }
02525     else if( m == 5 )
02526     {
02527         if( size.width == 1 || size.height == 1 )
02528         {
02529             int len = size.width + size.height - 1;
02530             int sdelta = size.height == 1 ? cn : sstep;
02531             int sdelta0 = size.height == 1 ? 0 : sstep - cn;
02532             int ddelta = size.height == 1 ? cn : dstep;
02533 
02534             for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
02535                 for( j = 0; j < cn; j++, src++ )
02536                 {
02537                     int i1 = i > 0 ? -sdelta : 0;
02538                     int i0 = i > 1 ? -sdelta*2 : i1;
02539                     int i3 = i < len-1 ? sdelta : 0;
02540                     int i4 = i < len-2 ? sdelta*2 : i3;
02541                     WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
02542 
02543                     op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2);
02544                     op(p2, p4); op(p1, p3); op(p1, p2);
02545                     dst[j] = (T)p2;
02546                 }
02547             return;
02548         }
02549 
02550         size.width *= cn;
02551         for( i = 0; i < size.height; i++, dst += dstep )
02552         {
02553             const T* row[5];
02554             row[0] = src + std::max(i - 2, 0)*sstep;
02555             row[1] = src + std::max(i - 1, 0)*sstep;
02556             row[2] = src + i*sstep;
02557             row[3] = src + std::min(i + 1, size.height-1)*sstep;
02558             row[4] = src + std::min(i + 2, size.height-1)*sstep;
02559             int limit = useSIMD ? cn*2 : size.width;
02560 
02561             for(j = 0;; )
02562             {
02563                 for( ; j < limit; j++ )
02564                 {
02565                     WT p[25];
02566                     int j1 = j >= cn ? j - cn : j;
02567                     int j0 = j >= cn*2 ? j - cn*2 : j1;
02568                     int j3 = j < size.width - cn ? j + cn : j;
02569                     int j4 = j < size.width - cn*2 ? j + cn*2 : j3;
02570                     for( k = 0; k < 5; k++ )
02571                     {
02572                         const T* rowk = row[k];
02573                         p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
02574                         p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
02575                         p[k*5+4] = rowk[j4];
02576                     }
02577 
02578                     op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]);
02579                     op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]);
02580                     op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]);
02581                     op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]);
02582                     op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]);
02583                     op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]);
02584                     op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]);
02585                     op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]);
02586                     op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]);
02587                     op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]);
02588                     op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]);
02589                     op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]);
02590                     op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]);
02591                     op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]);
02592                     op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]);
02593                     op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]);
02594                     op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]);
02595                     op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]);
02596                     op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]);
02597                     op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]);
02598                     op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]);
02599                     op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]);
02600                     op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]);
02601                     dst[j] = (T)p[12];
02602                 }
02603 
02604                 if( limit == size.width )
02605                     break;
02606 
02607                 for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
02608                 {
02609                     VT p[25];
02610                     for( k = 0; k < 5; k++ )
02611                     {
02612                         const T* rowk = row[k];
02613                         p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
02614                         p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
02615                         p[k*5+4] = vop.load(rowk+j+cn*2);
02616                     }
02617 
02618                     vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
02619                     vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
02620                     vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
02621                     vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
02622                     vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
02623                     vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
02624                     vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
02625                     vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
02626                     vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
02627                     vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
02628                     vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
02629                     vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
02630                     vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
02631                     vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
02632                     vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
02633                     vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
02634                     vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
02635                     vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
02636                     vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
02637                     vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
02638                     vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
02639                     vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
02640                     vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
02641                     vop.store(dst+j, p[12]);
02642                 }
02643 
02644                 limit = size.width;
02645             }
02646         }
02647     }
02648 }
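// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// medianBlur_SortNet() above evaluates a fixed sorting network: each op(a, b)
// is a branch-free compare-exchange that leaves the minimum in a and the
// maximum in b, so the same operation sequence works per scalar and per SIMD
// lane.  The smallest case, a median of three, needs only the three exchanges
// used in the 1-D branch above; exampleMedianOf3 is a hypothetical helper.
static inline int exampleMedianOf3(int p0, int p1, int p2)
{
    int t;
    if (p0 > p1) { t = p0; p0 = p1; p1 = t; }   // op(p0, p1)
    if (p1 > p2) { t = p1; p1 = p2; p2 = t; }   // op(p1, p2)
    if (p0 > p1) { t = p0; p0 = p1; p1 = t; }   // op(p0, p1)
    return p1;                                   // p1 now holds the median
}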
02649 
02650 #ifdef HAVE_OPENCL
02651 
02652 static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m)
02653 {
02654     size_t localsize[2] = { 16, 16 };
02655     size_t globalsize[2];
02656     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
02657 
02658     if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) )
02659         return false;
02660 
02661     Size imgSize = _src.size();
02662     bool useOptimized = (1 == cn) &&
02663                         (size_t)imgSize.width >= localsize[0] * 8  &&
02664                         (size_t)imgSize.height >= localsize[1] * 8 &&
02665                         imgSize.width % 4 == 0 &&
02666                         imgSize.height % 4 == 0 &&
02667                         (ocl::Device::getDefault().isIntel());
02668 
02669     cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m);
02670     cv::String kdefs = useOptimized ?
02671                          format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type),
02672                          ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn)
02673                          :
02674                          format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn);
02675 
02676     ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() );
02677 
02678     if (k.empty())
02679         return false;
02680 
02681     UMat src = _src.getUMat();
02682     _dst.create(src.size(), type);
02683     UMat dst = _dst.getUMat();
02684 
02685     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
02686 
02687     if( useOptimized )
02688     {
02689         globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0];
02690         globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1];
02691     }
02692     else
02693     {
02694         globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0];
02695         globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1];
02696     }
02697 
02698     return k.run(2, globalsize, localsize, false);
02699 }
02700 
02701 #endif
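
In the optimized path above, each work-item covers a 4x4 block, so the global size is the block count rounded up to a whole number of work-groups (DIVUP, defined elsewhere in the library, is used here as integer ceiling division). A minimal standalone sketch of that rounding, with a hypothetical helper name:

    // Sketch only: round 'items' up to the next multiple of the work-group size 'wg'.
    // Same effect as DIVUP(items, wg) * wg in ocl_medianFilter above.
    static inline size_t roundUpToGroup(size_t items, size_t wg)
    {
        return (items + wg - 1) / wg * wg;
    }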
02702 
02703 }
02704 
02705 #ifdef HAVE_IPP
02706 namespace cv
02707 {
02708 static bool ipp_medianFilter( InputArray _src0, OutputArray _dst, int ksize )
02709 {
02710 #if IPP_VERSION_X100 >= 810
02711     Mat src0 = _src0.getMat();
02712     _dst.create( src0.size(), src0.type() );
02713     Mat dst = _dst.getMat();
02714 
02715 #define IPP_FILTER_MEDIAN_BORDER(ippType, ippDataType, flavor) \
02716     do \
02717     { \
02718         if (ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, \
02719         ippDataType, CV_MAT_CN(type), &bufSize) >= 0) \
02720         { \
02721             Ipp8u * buffer = ippsMalloc_8u(bufSize); \
02722             IppStatus status = ippiFilterMedianBorder_##flavor(src.ptr<ippType>(), (int)src.step, \
02723             dst.ptr<ippType>(), (int)dst.step, dstRoiSize, maskSize, \
02724             ippBorderRepl, (ippType)0, buffer); \
02725             ippsFree(buffer); \
02726             if (status >= 0) \
02727             { \
02728                 CV_IMPL_ADD(CV_IMPL_IPP); \
02729                 return true; \
02730             } \
02731         } \
02732     } \
02733     while ((void)0, 0)
02734 
02735     if( ksize <= 5 )
02736     {
02737         Ipp32s bufSize;
02738         IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
02739         Mat src;
02740         if( dst.data != src0.data )
02741             src = src0;
02742         else
02743             src0.copyTo(src);
02744 
02745         int type = src0.type();
02746         if (type == CV_8UC1)
02747             IPP_FILTER_MEDIAN_BORDER(Ipp8u, ipp8u, 8u_C1R);
02748         else if (type == CV_16UC1)
02749             IPP_FILTER_MEDIAN_BORDER(Ipp16u, ipp16u, 16u_C1R);
02750         else if (type == CV_16SC1)
02751             IPP_FILTER_MEDIAN_BORDER(Ipp16s, ipp16s, 16s_C1R);
02752         else if (type == CV_32FC1)
02753             IPP_FILTER_MEDIAN_BORDER(Ipp32f, ipp32f, 32f_C1R);
02754     }
02755 #undef IPP_FILTER_MEDIAN_BORDER
02756 #else
02757     CV_UNUSED(_src0); CV_UNUSED(_dst); CV_UNUSED(ksize);
02758 #endif
02759     return false;
02760 }
02761 }
02762 #endif
02763 
02764 void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
02765 {
02766     CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));
02767 
02768     if( ksize <= 1 )
02769     {
02770         _src0.copyTo(_dst);
02771         return;
02772     }
02773 
02774 #ifdef HAVE_OPENCL
02775     CV_OCL_RUN(_dst.isUMat(),
02776                ocl_medianFilter(_src0,_dst, ksize))
02777 #endif
02778 
02779     Mat src0 = _src0.getMat();
02780     _dst.create( src0.size(), src0.type() );
02781     Mat dst = _dst.getMat();
02782 
02783     CV_IPP_RUN(IPP_VERSION_X100 >= 810 && ksize <= 5, ipp_medianFilter(_src0,_dst, ksize));
02784 
02785 #ifdef HAVE_TEGRA_OPTIMIZATION
02786     if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
02787         return;
02788 #endif
02789 
02790     bool useSortNet = ksize == 3 || (ksize == 5
02791 #if !(CV_SSE2 || CV_NEON)
02792             && src0.depth() > CV_8U
02793 #endif
02794         );
02795 
02796     Mat src;
02797     if( useSortNet )
02798     {
02799         if( dst.data != src0.data )
02800             src = src0;
02801         else
02802             src0.copyTo(src);
02803 
02804         if( src.depth() == CV_8U )
02805             medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
02806         else if( src.depth() == CV_16U )
02807             medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
02808         else if( src.depth() == CV_16S )
02809             medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
02810         else if( src.depth() == CV_32F )
02811             medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
02812         else
02813             CV_Error(CV_StsUnsupportedFormat, "");
02814 
02815         return;
02816     }
02817     else
02818     {
02819         cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE );
02820 
02821         int cn = src0.channels();
02822         CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );
02823 
02824         double img_size_mp = (double)(src0.total())/(1 << 20);
02825         if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
02826             (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
02827             medianBlur_8u_Om( src, dst, ksize );
02828         else
02829             medianBlur_8u_O1 ( src, dst, ksize );
02830     }
02831 }
02832 
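A minimal usage sketch for the median filter entry point above (not part of smooth.cpp; assumes an 8-bit image loaded from a hypothetical file name):

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>

    int main()
    {
        cv::Mat src = cv::imread("input.png");   // hypothetical input file
        if (src.empty()) return 1;
        cv::Mat dst;
        cv::medianBlur(src, dst, 5);             // ksize must be odd and greater than 1
        return 0;
    }
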
02833 /****************************************************************************************\
02834                                    Bilateral Filtering
02835 \****************************************************************************************/
02836 
02837 namespace cv
02838 {
02839 
02840 class BilateralFilter_8u_Invoker :
02841     public ParallelLoopBody
02842 {
02843 public:
02844     BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
02845         int* _space_ofs, float *_space_weight, float *_color_weight) :
02846         temp(&_temp), dest(&_dest), radius(_radius),
02847         maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
02848     {
02849     }
02850 
02851     virtual void operator() (const Range& range) const
02852     {
02853         int i, j, cn = dest->channels(), k;
02854         Size size = dest->size();
02855         #if CV_SSE3
02856         int CV_DECL_ALIGNED(16) buf[4];
02857         float CV_DECL_ALIGNED(16) bufSum[4];
02858         static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
02859         bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
02860         #endif
02861 
02862         for( i = range.start; i < range.end; i++ )
02863         {
02864             const uchar* sptr = temp->ptr(i+radius) + radius*cn;
02865             uchar* dptr = dest->ptr(i);
02866 
02867             if( cn == 1 )
02868             {
02869                 for( j = 0; j < size.width; j++ )
02870                 {
02871                     float sum = 0, wsum = 0;
02872                     int val0 = sptr[j];
02873                     k = 0;
02874                     #if CV_SSE3
02875                     if( haveSSE3 )
02876                     {
02877                         __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
02878                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
02879 
02880                         for( ; k <= maxk - 4; k += 4 )
02881                         {
02882                             __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
02883                                                       sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
02884 
02885                             __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
02886                             _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
02887 
02888                             __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
02889                                                     color_weight[buf[1]],color_weight[buf[0]]);
02890                             __m128 _sw = _mm_loadu_ps(space_weight+k);
02891                             __m128 _w = _mm_mul_ps(_cw, _sw);
02892                              _cw = _mm_mul_ps(_w, _valF);
02893 
02894                              _sw = _mm_hadd_ps(_w, _cw);
02895                              _sw = _mm_hadd_ps(_sw, _sw);
02896                              _mm_storel_pi((__m64*)bufSum, _sw);
02897 
02898                              sum += bufSum[1];
02899                              wsum += bufSum[0];
02900                         }
02901                     }
02902                     #endif
02903                     for( ; k < maxk; k++ )
02904                     {
02905                         int val = sptr[j + space_ofs[k]];
02906                         float w = space_weight[k]*color_weight[std::abs(val - val0)];
02907                         sum += val*w;
02908                         wsum += w;
02909                     }
02910                     // overflow is not possible here => there is no need to use cv::saturate_cast
02911                     dptr[j] = (uchar)cvRound(sum/wsum);
02912                 }
02913             }
02914             else
02915             {
02916                 assert( cn == 3 );
02917                 for( j = 0; j < size.width*3; j += 3 )
02918                 {
02919                     float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
02920                     int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
02921                     k = 0;
02922                     #if CV_SSE3
02923                     if( haveSSE3 )
02924                     {
02925                         const __m128i izero = _mm_setzero_si128();
02926                         const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
02927                         const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
02928                         const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
02929                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
02930 
02931                         for( ; k <= maxk - 4; k += 4 )
02932                         {
02933                             const int* const sptr_k0  = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
02934                             const int* const sptr_k1  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
02935                             const int* const sptr_k2  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
02936                             const int* const sptr_k3  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
02937 
02938                             __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
02939                             __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
02940                             __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
02941                             __m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
02942 
02943                             _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
02944 
02945                             __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
02946                             __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
02947                             __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
02948 
02949                             bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
02950                             _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
02951 
02952                             __m128 _w  = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
02953                                                     color_weight[buf[1]],color_weight[buf[0]]);
02954                             __m128 _sw = _mm_loadu_ps(space_weight+k);
02955 
02956                             _w = _mm_mul_ps(_w,_sw);
02957                             _b = _mm_mul_ps(_b, _w);
02958                             _g = _mm_mul_ps(_g, _w);
02959                             _r = _mm_mul_ps(_r, _w);
02960 
02961                              _w = _mm_hadd_ps(_w, _b);
02962                              _g = _mm_hadd_ps(_g, _r);
02963 
02964                              _w = _mm_hadd_ps(_w, _g);
02965                              _mm_store_ps(bufSum, _w);
02966 
02967                              wsum  += bufSum[0];
02968                              sum_b += bufSum[1];
02969                              sum_g += bufSum[2];
02970                              sum_r += bufSum[3];
02971                          }
02972                     }
02973                     #endif
02974 
02975                     for( ; k < maxk; k++ )
02976                     {
02977                         const uchar* sptr_k = sptr + j + space_ofs[k];
02978                         int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
02979                         float w = space_weight[k]*color_weight[std::abs(b - b0) +
02980                                                                std::abs(g - g0) + std::abs(r - r0)];
02981                         sum_b += b*w; sum_g += g*w; sum_r += r*w;
02982                         wsum += w;
02983                     }
02984                     wsum = 1.f/wsum;
02985                     b0 = cvRound(sum_b*wsum);
02986                     g0 = cvRound(sum_g*wsum);
02987                     r0 = cvRound(sum_r*wsum);
02988                     dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
02989                 }
02990             }
02991         }
02992     }
02993 
02994 private:
02995     const Mat *temp;
02996     Mat *dest;
02997     int radius, maxk, *space_ofs;
02998     float *space_weight, *color_weight;
02999 };
03000 
03001 #if defined (HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) && IPP_DISABLE_BLOCK
03002 class IPPBilateralFilter_8u_Invoker :
03003     public ParallelLoopBody
03004 {
03005 public:
03006     IPPBilateralFilter_8u_Invoker(Mat &_src, Mat &_dst, double _sigma_color, double _sigma_space, int _radius, bool *_ok) :
03007       ParallelLoopBody(), src(_src), dst(_dst), sigma_color(_sigma_color), sigma_space(_sigma_space), radius(_radius), ok(_ok)
03008       {
03009           *ok = true;
03010       }
03011 
03012       virtual void operator() (const Range& range) const
03013       {
03014           int d = radius * 2 + 1;
03015           IppiSize kernel = {d, d};
03016           IppiSize roi={dst.cols, range.end - range.start};
03017           int bufsize=0;
03018           if (0 > ippiFilterBilateralGetBufSize_8u_C1R( ippiFilterBilateralGauss, roi, kernel, &bufsize))
03019           {
03020               *ok = false;
03021               return;
03022           }
03023           AutoBuffer<uchar> buf(bufsize);
03024           IppiFilterBilateralSpec *pSpec = (IppiFilterBilateralSpec *)alignPtr(&buf[0], 32);
03025           if (0 > ippiFilterBilateralInit_8u_C1R( ippiFilterBilateralGauss, kernel, (Ipp32f)sigma_color, (Ipp32f)sigma_space, 1, pSpec ))
03026           {
03027               *ok = false;
03028               return;
03029           }
03030           if (0 > ippiFilterBilateral_8u_C1R( src.ptr<uchar>(range.start) + radius * ((int)src.step[0] + 1), (int)src.step[0], dst.ptr<uchar>(range.start), (int)dst.step[0], roi, kernel, pSpec ))
03031               *ok = false;
03032           else
03033           {
03034             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
03035           }
03036       }
03037 private:
03038     Mat &src;
03039     Mat &dst;
03040     double sigma_color;
03041     double sigma_space;
03042     int radius;
03043     bool *ok;
03044     const IPPBilateralFilter_8u_Invoker& operator= (const IPPBilateralFilter_8u_Invoker&);
03045 };
03046 #endif
03047 
03048 #ifdef HAVE_OPENCL
03049 
03050 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
03051                                    double sigma_color, double sigma_space,
03052                                    int borderType)
03053 {
03054 #ifdef ANDROID
03055     if (ocl::Device::getDefault().isNVidia())
03056         return false;
03057 #endif
03058 
03059     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
03060     int i, j, maxk, radius;
03061 
03062     if (depth != CV_8U || cn > 4)
03063         return false;
03064 
03065     if (sigma_color <= 0)
03066         sigma_color = 1;
03067     if (sigma_space <= 0)
03068         sigma_space = 1;
03069 
03070     double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
03071     double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
03072 
03073     if ( d <= 0 )
03074         radius = cvRound(sigma_space * 1.5);
03075     else
03076         radius = d / 2;
03077     radius = MAX(radius, 1);
03078     d = radius * 2 + 1;
03079 
03080     UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
03081     if (src.u == dst.u)
03082         return false;
03083 
03084     copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
03085     std::vector<float> _space_weight(d * d);
03086     std::vector<int> _space_ofs(d * d);
03087     float * const space_weight = &_space_weight[0];
03088     int * const space_ofs = &_space_ofs[0];
03089 
03090     // initialize space-related bilateral filter coefficients
03091     for( i = -radius, maxk = 0; i <= radius; i++ )
03092         for( j = -radius; j <= radius; j++ )
03093         {
03094             double r = std::sqrt((double)i * i + (double)j * j);
03095             if ( r > radius )
03096                 continue;
03097             space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
03098             space_ofs[maxk++] = (int)(i * temp.step + j * cn);
03099         }
03100 
03101     char cvt[3][40];
03102     String cnstr = cn > 1 ? format("%d", cn) : "";
03103     String kernelName("bilateral");
03104     size_t sizeDiv = 1;
03105     if ((ocl::Device::getDefault().isIntel()) &&
03106         (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
03107     {
03108             // Intel GPU
03109             if (dst.cols % 4 == 0 && cn == 1) // single-channel images whose width is a multiple of 4
03110             {
03111                 kernelName = "bilateral_float4";
03112                 sizeDiv = 4;
03113             }
03114      }
03115      ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
03116             format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
03117             " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
03118             radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
03119             ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
03120             ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
03121             ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
03122             ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
03123     if (k.empty())
03124         return false;
03125 
03126     Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
03127     Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
03128     UMat ucolor_weight, uspace_weight, uspace_ofs;
03129 
03130     mspace_weight.copyTo(uspace_weight);
03131     mspace_ofs.copyTo(uspace_ofs);
03132 
03133     k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
03134            ocl::KernelArg::PtrReadOnly(uspace_weight),
03135            ocl::KernelArg::PtrReadOnly(uspace_ofs));
03136 
03137     size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
03138     return k.run(2, globalsize, NULL, false);
03139 }
03140 
03141 #endif
03142 static void
03143 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
03144     double sigma_color, double sigma_space,
03145     int borderType )
03146 {
03147     int cn = src.channels();
03148     int i, j, maxk, radius;
03149     Size size = src.size();
03150 
03151     CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
03152 
03153     if( sigma_color <= 0 )
03154         sigma_color = 1;
03155     if( sigma_space <= 0 )
03156         sigma_space = 1;
03157 
03158     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
03159     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
03160 
03161     if( d <= 0 )
03162         radius = cvRound(sigma_space*1.5);
03163     else
03164         radius = d/2;
03165     radius = MAX(radius, 1);
03166     d = radius*2 + 1;
03167 
03168     Mat temp;
03169     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
03170 
03171 #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
03172     CV_IPP_CHECK()
03173     {
03174         if( cn == 1 )
03175         {
03176             bool ok;
03177             IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok );
03178             parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
03179             if( ok )
03180             {
03181                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
03182                 return;
03183             }
03184             setIppErrorStatus();
03185         }
03186     }
03187 #endif
03188 
03189     std::vector<float> _color_weight(cn*256);
03190     std::vector<float> _space_weight(d*d);
03191     std::vector<int> _space_ofs(d*d);
03192     float* color_weight = &_color_weight[0];
03193     float* space_weight = &_space_weight[0];
03194     int* space_ofs = &_space_ofs[0];
03195 
03196     // initialize color-related bilateral filter coefficients
03197 
03198     for( i = 0; i < 256*cn; i++ )
03199         color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
03200 
03201     // initialize space-related bilateral filter coefficients
03202     for( i = -radius, maxk = 0; i <= radius; i++ )
03203     {
03204         j = -radius;
03205 
03206         for( ; j <= radius; j++ )
03207         {
03208             double r = std::sqrt((double)i*i + (double)j*j);
03209             if( r > radius )
03210                 continue;
03211             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
03212             space_ofs[maxk++] = (int)(i*temp.step + j*cn);
03213         }
03214     }
03215 
03216     BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
03217     parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
03218 }
03219 
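For orientation, the weight accumulated for each neighbour above is the product of a spatial Gaussian (precomputed into space_weight) and an intensity Gaussian (looked up in color_weight). A standalone sketch of that product, computed directly instead of through the lookup tables (the helper name is hypothetical):

    #include <cmath>

    // Sketch: bilateral weight of one single-channel neighbour at spatial distance
    // 'dist' whose intensity differs from the centre pixel by 'diff' grey levels.
    static inline float bilateralWeight(int diff, double dist,
                                        double sigma_color, double sigma_space)
    {
        double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
        double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
        return (float)(std::exp(dist * dist * gauss_space_coeff) *
                       std::exp((double)diff * diff * gauss_color_coeff));
    }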
03220 
03221 class BilateralFilter_32f_Invoker :
03222     public ParallelLoopBody
03223 {
03224 public:
03225 
03226     BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
03227         const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
03228         cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
03229         temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
03230     {
03231     }
03232 
03233     virtual void operator() (const Range& range) const
03234     {
03235         int i, j, k;
03236         Size size = dest->size();
03237         #if CV_SSE3
03238         int CV_DECL_ALIGNED(16) idxBuf[4];
03239         float CV_DECL_ALIGNED(16) bufSum32[4];
03240         static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
03241         bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
03242         #endif
03243 
03244         for( i = range.start; i < range.end; i++ )
03245         {
03246             const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
03247             float* dptr = dest->ptr<float>(i);
03248 
03249             if( cn == 1 )
03250             {
03251                 for( j = 0; j < size.width; j++ )
03252                 {
03253                     float sum = 0, wsum = 0;
03254                     float val0 = sptr[j];
03255                     k = 0;
03256                     #if CV_SSE3
03257                     if( haveSSE3 )
03258                     {
03259                         __m128 psum = _mm_setzero_ps();
03260                         const __m128 _val0 = _mm_set1_ps(sptr[j]);
03261                         const __m128 _scale_index = _mm_set1_ps(scale_index);
03262                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
03263 
03264                         for( ; k <= maxk - 4 ; k += 4 )
03265                         {
03266                             __m128 _sw    = _mm_loadu_ps(space_weight + k);
03267                             __m128 _val   = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
03268                                                        sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
03269                             __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
03270 
03271                             __m128i _idx = _mm_cvtps_epi32(_alpha);
03272                             _mm_store_si128((__m128i*)idxBuf, _idx);
03273                             _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
03274 
03275                             __m128 _explut  = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
03276                                                          expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
03277                             __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
03278                                                          expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
03279 
03280                             __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
03281                             _val = _mm_mul_ps(_w, _val);
03282 
03283                              _sw = _mm_hadd_ps(_w, _val);
03284                              _sw = _mm_hadd_ps(_sw, _sw);
03285                              psum = _mm_add_ps(_sw, psum);
03286                         }
03287                         _mm_storel_pi((__m64*)bufSum32, psum);
03288 
03289                         sum = bufSum32[1];
03290                         wsum = bufSum32[0];
03291                     }
03292                     #endif
03293 
03294                     for( ; k < maxk; k++ )
03295                     {
03296                         float val = sptr[j + space_ofs[k]];
03297                         float alpha = (float)(std::abs(val - val0)*scale_index);
03298                         int idx = cvFloor(alpha);
03299                         alpha -= idx;
03300                         float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
03301                         sum += val*w;
03302                         wsum += w;
03303                     }
03304                     dptr[j] = (float)(sum/wsum);
03305                 }
03306             }
03307             else
03308             {
03309                 CV_Assert( cn == 3 );
03310                 for( j = 0; j < size.width*3; j += 3 )
03311                 {
03312                     float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
03313                     float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
03314                     k = 0;
03315                     #if  CV_SSE3
03316                     if( haveSSE3 )
03317                     {
03318                         __m128 sum = _mm_setzero_ps();
03319                         const __m128 _b0 = _mm_set1_ps(b0);
03320                         const __m128 _g0 = _mm_set1_ps(g0);
03321                         const __m128 _r0 = _mm_set1_ps(r0);
03322                         const __m128 _scale_index = _mm_set1_ps(scale_index);
03323                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
03324 
03325                         for( ; k <= maxk-4; k += 4 )
03326                         {
03327                             __m128 _sw = _mm_loadu_ps(space_weight + k);
03328 
03329                             const float* const sptr_k0 = sptr + j + space_ofs[k];
03330                             const float* const sptr_k1 = sptr + j + space_ofs[k+1];
03331                             const float* const sptr_k2 = sptr + j + space_ofs[k+2];
03332                             const float* const sptr_k3 = sptr + j + space_ofs[k+3];
03333 
03334                             __m128 _b = _mm_loadu_ps(sptr_k0);
03335                             __m128 _g = _mm_loadu_ps(sptr_k1);
03336                             __m128 _r = _mm_loadu_ps(sptr_k2);
03337                             __m128 _z = _mm_loadu_ps(sptr_k3);
03338                             _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
03339 
03340                             __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
03341                             __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
03342                             __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
03343 
03344                             __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
03345 
03346                             __m128i _idx  = _mm_cvtps_epi32(_alpha);
03347                             _mm_store_si128((__m128i*)idxBuf, _idx);
03348                             _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
03349 
03350                             __m128 _explut  = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
03351                             __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
03352 
03353                             __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
03354 
03355                             _b = _mm_mul_ps(_b, _w);
03356                             _g = _mm_mul_ps(_g, _w);
03357                             _r = _mm_mul_ps(_r, _w);
03358 
03359                              _w = _mm_hadd_ps(_w, _b);
03360                              _g = _mm_hadd_ps(_g, _r);
03361 
03362                              _w = _mm_hadd_ps(_w, _g);
03363                              sum = _mm_add_ps(sum, _w);
03364                         }
03365                         _mm_store_ps(bufSum32, sum);
03366                         wsum  = bufSum32[0];
03367                         sum_b = bufSum32[1];
03368                         sum_g = bufSum32[2];
03369                         sum_r = bufSum32[3];
03370                     }
03371                     #endif
03372 
03373                     for(; k < maxk; k++ )
03374                     {
03375                         const float* sptr_k = sptr + j + space_ofs[k];
03376                         float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
03377                         float alpha = (float)((std::abs(b - b0) +
03378                             std::abs(g - g0) + std::abs(r - r0))*scale_index);
03379                         int idx = cvFloor(alpha);
03380                         alpha -= idx;
03381                         float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
03382                         sum_b += b*w; sum_g += g*w; sum_r += r*w;
03383                         wsum += w;
03384                     }
03385                     wsum = 1.f/wsum;
03386                     b0 = sum_b*wsum;
03387                     g0 = sum_g*wsum;
03388                     r0 = sum_r*wsum;
03389                     dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
03390                 }
03391             }
03392         }
03393     }
03394 
03395 private:
03396     int cn, radius, maxk, *space_ofs;
03397     const Mat* temp;
03398     Mat *dest;
03399     float scale_index, *space_weight, *expLUT;
03400 };
03401 
03402 
03403 static void
03404 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
03405                      double sigma_color, double sigma_space,
03406                      int borderType )
03407 {
03408     int cn = src.channels();
03409     int i, j, maxk, radius;
03410     double minValSrc=-1, maxValSrc=1;
03411     const int kExpNumBinsPerChannel = 1 << 12;
03412     int kExpNumBins = 0;
03413     float lastExpVal = 1.f;
03414     float len, scale_index;
03415     Size size = src.size();
03416 
03417     CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
03418 
03419     if( sigma_color <= 0 )
03420         sigma_color = 1;
03421     if( sigma_space <= 0 )
03422         sigma_space = 1;
03423 
03424     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
03425     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
03426 
03427     if( d <= 0 )
03428         radius = cvRound(sigma_space*1.5);
03429     else
03430         radius = d/2;
03431     radius = MAX(radius, 1);
03432     d = radius*2 + 1;
03433     // compute the min/max range for the input image (even if multichannel)
03434 
03435     minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
03436     if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
03437     {
03438         src.copyTo(dst);
03439         return;
03440     }
03441 
03442     // temporary copy of the image with borders for easy processing
03443     Mat temp;
03444     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
03445     const double insteadNaNValue = -5. * sigma_color;
03446     patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
03447                                         // TODO: make insteadNaNValue available in the public function interface to control cases that break this assumption
03448     // allocate lookup tables
03449     std::vector<float> _space_weight(d*d);
03450     std::vector<int> _space_ofs(d*d);
03451     float* space_weight = &_space_weight[0];
03452     int* space_ofs = &_space_ofs[0];
03453 
03454     // assign a length which is slightly more than needed
03455     len = (float)(maxValSrc - minValSrc) * cn;
03456     kExpNumBins = kExpNumBinsPerChannel * cn;
03457     std::vector<float> _expLUT(kExpNumBins+2);
03458     float* expLUT = &_expLUT[0];
03459 
03460     scale_index = kExpNumBins/len;
03461 
03462     // initialize the exp LUT
03463     for( i = 0; i < kExpNumBins+2; i++ )
03464     {
03465         if( lastExpVal > 0.f )
03466         {
03467             double val =  i / scale_index;
03468             expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
03469             lastExpVal = expLUT[i];
03470         }
03471         else
03472             expLUT[i] = 0.f;
03473     }
03474 
03475     // initialize space-related bilateral filter coefficients
03476     for( i = -radius, maxk = 0; i <= radius; i++ )
03477         for( j = -radius; j <= radius; j++ )
03478         {
03479             double r = std::sqrt((double)i*i + (double)j*j);
03480             if( r > radius )
03481                 continue;
03482             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
03483             space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
03484         }
03485 
03486     // process the image rows in parallel via parallel_for_
03487 
03488     BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
03489     parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
03490 }
03491 
03492 }
03493 
03494 void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
03495                       double sigmaColor, double sigmaSpace,
03496                       int borderType )
03497 {
03498     _dst.create( _src.size(), _src.type() );
03499 
03500 #ifdef HAVE_OPENCL
03501     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
03502                ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
03503 #endif
03504 
03505     Mat src = _src.getMat(), dst = _dst.getMat();
03506 
03507     if( src.depth() == CV_8U )
03508         bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
03509     else if( src.depth() == CV_32F )
03510         bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
03511     else
03512         CV_Error( CV_StsUnsupportedFormat,
03513         "Bilateral filtering is only implemented for 8u and 32f images" );
03514 }
03515 
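A minimal usage sketch for the public entry point above (hypothetical file name; per the assertions above, the CPU paths handle only 8-bit and 32-bit float, 1- or 3-channel images):

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>

    int main()
    {
        cv::Mat src = cv::imread("photo.png");   // hypothetical input file, CV_8UC3
        if (src.empty()) return 1;
        cv::Mat dst;
        // d = 9 (neighbourhood diameter), sigmaColor = sigmaSpace = 75
        cv::bilateralFilter(src, dst, 9, 75, 75);
        return 0;
    }
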
03516 //////////////////////////////////////////////////////////////////////////////////////////
03517 
03518 CV_IMPL void
03519 cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
03520           int param1, int param2, double param3, double param4 )
03521 {
03522     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
03523 
03524     CV_Assert( dst.size() == src.size() &&
03525         (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) );
03526 
03527     if( param2 <= 0 )
03528         param2 = param1;
03529 
03530     if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
03531         cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point (-1,-1),
03532             smooth_type == CV_BLUR, cv::BORDER_REPLICATE );
03533     else if( smooth_type == CV_GAUSSIAN )
03534         cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE );
03535     else if( smooth_type == CV_MEDIAN )
03536         cv::medianBlur( src, dst, param1 );
03537     else
03538         cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );
03539 
03540     if( dst.data != dst0.data )
03541         CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );
03542 }
03543 
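As the branches above show, the legacy cvSmooth parameters map directly onto the C++ API. A sketch of the Gaussian case (synthetic input; the equivalence follows from the CV_GAUSSIAN branch above, with border handling fixed to BORDER_REPLICATE):

    #include <opencv2/imgproc.hpp>

    int main()
    {
        cv::Mat src(64, 64, CV_8UC1, cv::Scalar(0)), dst;
        // Matches cvSmooth(srcarr, dstarr, CV_GAUSSIAN, 5, 5, 1.2, 1.2)
        cv::GaussianBlur(src, dst, cv::Size(5, 5), 1.2, 1.2, cv::BORDER_REPLICATE);
        return 0;
    }
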
03544 /* End of file. */
03545