Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository's ZIP archive or clone it locally using Mercurial.
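For example, with the Mercurial client installed, running hg clone followed by this repository's URL (as shown on the repository page) produces a complete local copy of the history.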
It is also possible to export all your personal repositories from the account settings page.
Fork of gr-peach-opencv-project.
smooth.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

/*
 * This file includes the code, contributed by Simon Perreault
 * (the function icvMedianBlur_8u_O1)
 *
 * Constant-time median filtering -- http://nomis80.org/ctmf.html
 * Copyright (C) 2006 Simon Perreault
 *
 * Contact:
 *  Laboratoire de vision et systemes numeriques
 *  Pavillon Adrien-Pouliot
 *  Universite Laval
 *  Sainte-Foy, Quebec, Canada
 *  G1K 7P4
 *
 *  perreaul@gel.ulaval.ca
 */

namespace cv
{

/****************************************************************************************\
                                         Box Filter
\****************************************************************************************/

template<typename T, typename ST>
struct RowSum :
        public BaseRowFilter
{
    RowSum( int _ksize, int _anchor ) :
        BaseRowFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
    }

    virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        const T* S = (const T*)src;
        ST* D = (ST*)dst;
        int i = 0, k, ksz_cn = ksize*cn;

        width = (width - 1)*cn;
        for( k = 0; k < cn; k++, S++, D++ )
        {
            ST s = 0;
            for( i = 0; i < ksz_cn; i += cn )
                s += S[i];
            D[0] = s;
            for( i = 0; i < width; i += cn )
            {
                s += S[i + ksz_cn] - S[i];
                D[i+cn] = s;
            }
        }
    }
};


template<typename ST, typename T>
struct ColumnSum :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        ST* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            memset((void*)SUM, 0, width*sizeof(ST));

            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const ST* Sp = (const ST*)src[0];
                for( i = 0; i <= width - 2; i += 2 )
                {
                    ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
                    SUM[i] = s0; SUM[i+1] = s1;
                }

                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const ST* Sp = (const ST*)src[0];
            const ST* Sm = (const ST*)src[1-ksize];
            T* D = (T*)dst;
            if( haveScale )
            {
                for( i = 0; i <= width - 2; i += 2 )
                {
                    ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
                    D[i] = saturate_cast<T>(s0*_scale);
                    D[i+1] = saturate_cast<T>(s1*_scale);
                    s0 -= Sm[i]; s1 -= Sm[i+1];
                    SUM[i] = s0; SUM[i+1] = s1;
                }

                for( ; i < width; i++ )
                {
                    ST s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<T>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                for( i = 0; i <= width - 2; i += 2 )
                {
                    ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
                    D[i] = saturate_cast<T>(s0);
                    D[i+1] = saturate_cast<T>(s1);
                    s0 -= Sm[i]; s1 -= Sm[i+1];
                    SUM[i] = s0; SUM[i+1] = s1;
                }

                for( ; i < width; i++ )
                {
                    ST s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<T>(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;
    std::vector<ST> sum;
};
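/*
 * The two templates above implement the standard O(1)-per-pixel sliding-window
 * sum: once the first window is accumulated, each step adds the entering
 * sample and subtracts the leaving one.  A minimal sketch of the recurrence on
 * a plain array (illustrative only, with ksize = 3):
 *
 *   int src[] = { 1, 2, 3, 4, 5 };
 *   int s = src[0] + src[1] + src[2];   // first window {1,2,3} -> 6
 *   s += src[3] - src[0];               // slide to {2,3,4}     -> 9
 *   s += src[4] - src[1];               // slide to {3,4,5}     -> 12
 *
 * This is why the cost per output is independent of the kernel size.
 */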
template<>
struct ColumnSum<int, uchar> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

#if CV_SSE2
        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
                    }
                }
#elif CV_NEON
                for( ; i <= width - 4; i+=4 )
                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];
            const int* Sm = (const int*)src[1-ksize];
            uchar* D = (uchar*)dst;
            if( haveScale )
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
                        __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));

                        _s0T = _mm_packs_epi32(_s0T, _s0T1);

                        _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));

                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
                    }
                }
#elif CV_NEON
                float32x4_t v_scale = vdupq_n_f32((float)_scale);
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                    uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));

                    uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
                    vst1_u8(D + i, vqmovn_u16(v_dst));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<uchar>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        __m128i _s0T = _mm_packs_epi32(_s0, _s01);

                        _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));

                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
                    }
                }
#elif CV_NEON
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
                    vst1_u8(D + i, vqmovn_u16(v_dst));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<uchar>(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;
    std::vector<int> sum;
};

template<>
struct ColumnSum<int, short> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

#if CV_SSE2
        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }
        SUM = &sum[0];
        if( sumCount == 0 )
        {
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
                    }
                }
#elif CV_NEON
                for( ; i <= width - 4; i+=4 )
                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];
            const int* Sm = (const int*)src[1-ksize];
            short* D = (short*)dst;
            if( haveScale )
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
                        __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));

                        _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));

                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
                    }
                }
#elif CV_NEON
                float32x4_t v_scale = vdupq_n_f32((float)_scale);
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                    int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
                    vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<short>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-8; i+=8 )
                    {

                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));

                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
                    }
                }
#elif CV_NEON
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<short>(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;
    std::vector<int> sum;
};
template<>
struct ColumnSum<int, ushort> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;
#if CV_SSE2
        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }
        SUM = &sum[0];
        if( sumCount == 0 )
        {
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i < width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
                    }
                }
#elif CV_NEON
                for( ; i <= width - 4; i+=4 )
                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];
            const int* Sm = (const int*)src[1-ksize];
            ushort* D = (ushort*)dst;
            if( haveScale )
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    const __m128i delta0 = _mm_set1_epi32(0x8000);
                    const __m128i delta1 = _mm_set1_epi32(0x80008000);

                    for( ; i < width-4; i+=4)
                    {
                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));

                        __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));

                        _res = _mm_sub_epi32(_res, delta0);
                        _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);

                        _mm_storel_epi64((__m128i*)(D+i), _res);
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                    }
                }
#elif CV_NEON
                float32x4_t v_scale = vdupq_n_f32((float)_scale);
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                    uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
                    vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<ushort>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    const __m128i delta0 = _mm_set1_epi32(0x8000);
                    const __m128i delta1 = _mm_set1_epi32(0x80008000);

                    for( ; i < width-4; i+=4 )
                    {
                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));

                        __m128i _res = _mm_sub_epi32(_s0, delta0);
                        _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);

                        _mm_storel_epi64((__m128i*)(D+i), _res);
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                    }
                }
#elif CV_NEON
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<ushort>(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;
    std::vector<int> sum;
};
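/*
 * A note on the ushort SSE2 paths above (illustrative sketch):
 * _mm_packs_epi32 saturates to signed 16-bit, so packing raw sums would clip
 * any value above 32767.  Subtracting 0x8000 before the pack and adding it
 * back afterwards maps [0, 65535] onto [-32768, 32767] first:
 *
 *   int      s   = 40000;                          // > 32767, would clip
 *   short    t   = (short)(s - 0x8000);            // 7232, fits in a short
 *   unsigned out = (unsigned short)(t + 0x8000);   // 40000 recovered
 *
 * delta0 applies the bias on the 32-bit lanes and delta1 (0x80008000, i.e.
 * 0x8000 in each 16-bit half) restores it after _mm_packs_epi32.
 */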
template<>
struct ColumnSum<int, int> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

#if CV_SSE2
        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }
        SUM = &sum[0];
        if( sumCount == 0 )
        {
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
                    }
                }
#elif CV_NEON
                for( ; i <= width - 4; i+=4 )
                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];
            const int* Sm = (const int*)src[1-ksize];
            int* D = (int*)dst;
            if( haveScale )
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));

                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));

                        __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));

                        _mm_storeu_si128((__m128i*)(D+i), _s0T);
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
                    }
                }
#elif CV_NEON
                float32x4_t v_scale = vdupq_n_f32((float)_scale);
                for( ; i <= width-4; i+=4 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));

                    int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                    vst1q_s32(D + i, v_s0d);

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                }
#endif
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<int>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));

                        _mm_storeu_si128((__m128i*)(D+i), _s0);
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                    }
                }
#elif CV_NEON
                for( ; i <= width-4; i+=4 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));

                    vst1q_s32(D + i, v_s0);
                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                }
#endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = s0;
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;
    std::vector<int> sum;
};


template<>
struct ColumnSum<int, float> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

#if CV_SSE2
        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            memset((void *)SUM, 0, sizeof(int) * width);

            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                i = 0;

#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i < width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
                    }
                }
#elif CV_NEON
                for( ; i <= width - 4; i+=4 )
                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif

                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const int * Sp = (const int*)src[0];
            const int * Sm = (const int*)src[1-ksize];
            float* D = (float*)dst;
            if( haveScale )
            {
                i = 0;

#if CV_SSE2
                if(haveSSE2)
                {
                    const __m128 scale4 = _mm_set1_ps((float)_scale);

                    for( ; i < width-4; i+=4)
                    {
                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));

                        _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                    }
                }
#elif CV_NEON
                float32x4_t v_scale = vdupq_n_f32((float)_scale);
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                    vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = (float)(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                i = 0;

#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i < width-4; i+=4)
                    {
                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));

                        _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                    }
                }
#elif CV_NEON
                for( ; i <= width-8; i+=8 )
                {
                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                    vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
                    vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));

                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                }
#endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = (float)(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;
    std::vector<int> sum;
};
#ifdef HAVE_OPENCL

#define DIVUP(total, grain) ((total + grain - 1) / (grain))
#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
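/*
 * Quick worked examples of the two helpers (illustrative values):
 *
 *   DIVUP(1000, 256)   == 4     // ceil(1000 / 256): number of blocks needed
 *   ROUNDUP(1000, 256) == 1024  // smallest multiple of 256 that is >= 1000
 *
 * Both are used below to pad OpenCL global work sizes to workgroup-friendly
 * multiples.
 */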
static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
                           Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
{
    const ocl::Device & dev = ocl::Device::getDefault();
    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
    bool doubleSupport = dev.doubleFPConfig() > 0;

    if (ddepth < 0)
        ddepth = sdepth;

    if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
        _src.offset() % esz != 0 || _src.step() % esz != 0)
        return false;

    if (anchor.x < 0)
        anchor.x = ksize.width / 2;
    if (anchor.y < 0)
        anchor.y = ksize.height / 2;

    int computeUnits = ocl::Device::getDefault().maxComputeUnits();
    float alpha = 1.0f / (ksize.height * ksize.width);
    Size size = _src.size(), wholeSize;
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    borderType &= ~BORDER_ISOLATED;
    int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
        wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);

    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
    size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
    size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;

    UMat src = _src.getUMat();
    if (!isolated)
    {
        Point ofs;
        src.locateROI(wholeSize, ofs);
    }

    int h = isolated ? size.height : wholeSize.height;
    int w = isolated ? size.width : wholeSize.width;

    size_t maxWorkItemSizes[32];
    ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
    int tryWorkItems = (int)maxWorkItemSizes[0];

    ocl::Kernel kernel;

    if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
        ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
    {
        if (w < ksize.width || h < ksize.height)
            return false;

        // Figure out what vector size to use for loading the pixels.
        int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
        int pxLoadVecSize = cn * pxLoadNumPixels;

        // Figure out how many pixels per work item to compute in X and Y
        // directions.  Too many and we run out of registers.
        int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
        {
            pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
            pxPerWorkItemY = size.height % 2 ? 1 : 2;
        }
        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
        {
            pxPerWorkItemX = size.width % 2 ? 1 : 2;
            pxPerWorkItemY = size.height % 2 ? 1 : 2;
        }
        globalsize[0] = size.width / pxPerWorkItemX;
        globalsize[1] = size.height / pxPerWorkItemY;

        // Need some padding in the private array for pixels
        int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);

        // Make the global size a nice round number so the runtime can pick
        // from reasonable choices for the workgroup size
        const int wgRound = 256;
        globalsize[0] = ROUNDUP(globalsize[0], wgRound);

        char build_options[1024], cvt[2][40];
        sprintf(build_options, "-D cn=%d "
                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
                cn, anchor.x, anchor.y, ksize.width, ksize.height,
                pxLoadVecSize, pxLoadNumPixels,
                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
                normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
                ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
                );

        if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
            return false;
    }
    else
    {
        localsize = localsize_general;
        for ( ; ; )
        {
            int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);

            while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
                BLOCK_SIZE_X /= 2;
            while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
                BLOCK_SIZE_Y *= 2;

            if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
                return false;

            char cvt[2][50];
            String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
                                 " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
                                 " -D ST1=%s -D DT1=%s -D cn=%d",
                                 BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
                                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
                                 anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
                                 isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
                                 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);

            localsize[0] = BLOCK_SIZE_X;
            globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
            globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);

            kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
            if (kernel.empty())
                return false;

            size_t kernelWorkGroupSize = kernel.workGroupSize();
            if (localsize[0] <= kernelWorkGroupSize)
                break;
            if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
                return false;

            tryWorkItems = (int)kernelWorkGroupSize;
        }
    }

    _dst.create(size, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
    idxArg = kernel.set(idxArg, (int)src.step);
    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
    idxArg = kernel.set(idxArg, srcOffsetX);
    idxArg = kernel.set(idxArg, srcOffsetY);
    idxArg = kernel.set(idxArg, srcEndX);
    idxArg = kernel.set(idxArg, srcEndY);
    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
    if (normalize)
        idxArg = kernel.set(idxArg, (float)alpha);

    return kernel.run(2, globalsize, localsize, false);
}

#undef ROUNDUP

#endif

}
cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
{
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );

    if( anchor < 0 )
        anchor = ksize/2;

    if( sdepth == CV_8U && ddepth == CV_32S )
        return makePtr<RowSum<uchar, int> >(ksize, anchor);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<RowSum<uchar, double> >(ksize, anchor);
    if( sdepth == CV_16U && ddepth == CV_32S )
        return makePtr<RowSum<ushort, int> >(ksize, anchor);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<RowSum<ushort, double> >(ksize, anchor);
    if( sdepth == CV_16S && ddepth == CV_32S )
        return makePtr<RowSum<short, int> >(ksize, anchor);
    if( sdepth == CV_32S && ddepth == CV_32S )
        return makePtr<RowSum<int, int> >(ksize, anchor);
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<RowSum<short, double> >(ksize, anchor);
    if( sdepth == CV_32F && ddepth == CV_64F )
        return makePtr<RowSum<float, double> >(ksize, anchor);
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<RowSum<double, double> >(ksize, anchor);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
         srcType, sumType));

    return Ptr<BaseRowFilter>();
}


cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
                                                     int anchor, double scale)
{
    int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );

    if( anchor < 0 )
        anchor = ksize/2;

    if( ddepth == CV_8U && sdepth == CV_32S )
        return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
    if( ddepth == CV_8U && sdepth == CV_64F )
        return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
    if( ddepth == CV_16U && sdepth == CV_32S )
        return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
    if( ddepth == CV_16U && sdepth == CV_64F )
        return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
    if( ddepth == CV_16S && sdepth == CV_32S )
        return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
    if( ddepth == CV_16S && sdepth == CV_64F )
        return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
    if( ddepth == CV_32S && sdepth == CV_32S )
        return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
    if( ddepth == CV_32F && sdepth == CV_32S )
        return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
    if( ddepth == CV_32F && sdepth == CV_64F )
        return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
    if( ddepth == CV_64F && sdepth == CV_32S )
        return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
    if( ddepth == CV_64F && sdepth == CV_64F )
        return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of sum format (=%d), and destination format (=%d)",
         sumType, dstType));

    return Ptr<BaseColumnFilter>();
}


cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
                                               Point anchor, bool normalize, int borderType )
{
    int sdepth = CV_MAT_DEPTH(srcType);
    int cn = CV_MAT_CN(srcType), sumType = CV_64F;
    if( sdepth <= CV_32S && (!normalize ||
        ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
            sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
        sumType = CV_32S;
    sumType = CV_MAKETYPE( sumType, cn );

    Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
    Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
        dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);

    return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
           srcType, dstType, sumType, borderType );
}

#if defined(HAVE_IPP)
namespace cv
{
static bool ipp_boxfilter( InputArray _src, OutputArray _dst, int ddepth,
                           Size ksize, Point anchor,
                           bool normalize, int borderType )
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
    if( ddepth < 0 )
        ddepth = sdepth;
    int ippBorderType = borderType & ~BORDER_ISOLATED;
    Point ocvAnchor, ippAnchor;
    ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
    ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
    ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
    ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);

    Mat src = _src.getMat();
    _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
    Mat dst = _dst.getMat();
    if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( src.rows == 1 )
            ksize.height = 1;
        if( src.cols == 1 )
            ksize.width = 1;
    }

    {
        if (normalize && !src.isSubmatrix() && ddepth == sdepth &&
            (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
             ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
             dst.cols != ksize.width && dst.rows != ksize.height) // returns ippStsMaskSizeErr: mask has an illegal value
        {
            Ipp32s bufSize = 0;
            IppiSize roiSize = { dst.cols, dst.rows }, maskSize = { ksize.width, ksize.height };

#define IPP_FILTER_BOX_BORDER(ippType, ippDataType, flavor) \
            do \
            { \
                if (ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ippDataType, cn, &bufSize) >= 0) \
                { \
                    Ipp8u * buffer = ippsMalloc_8u(bufSize); \
                    ippType borderValue[4] = { 0, 0, 0, 0 }; \
                    ippBorderType = ippBorderType == BORDER_CONSTANT ? ippBorderConst : ippBorderRepl; \
                    IppStatus status = ippiFilterBoxBorder_##flavor(src.ptr<ippType>(), (int)src.step, dst.ptr<ippType>(), \
                                                                    (int)dst.step, roiSize, maskSize, \
                                                                    (IppiBorderType)ippBorderType, borderValue, buffer); \
                    ippsFree(buffer); \
                    if (status >= 0) \
                    { \
                        CV_IMPL_ADD(CV_IMPL_IPP); \
                        return true; \
                    } \
                } \
            } while ((void)0, 0)

            if (stype == CV_8UC1)
                IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C1R);
            else if (stype == CV_8UC3)
                IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C3R);
            else if (stype == CV_8UC4)
                IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C4R);

            // Oct 2014: performance with BORDER_CONSTANT
            //else if (stype == CV_16UC1)
            //    IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C1R);
            else if (stype == CV_16UC3)
                IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C3R);
            else if (stype == CV_16UC4)
                IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C4R);

            // Oct 2014: performance with BORDER_CONSTANT
            //else if (stype == CV_16SC1)
            //    IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C1R);
            else if (stype == CV_16SC3)
                IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C3R);
            else if (stype == CV_16SC4)
                IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C4R);

            else if (stype == CV_32FC1)
                IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C1R);
            else if (stype == CV_32FC3)
                IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C3R);
            else if (stype == CV_32FC4)
                IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C4R);
        }
#undef IPP_FILTER_BOX_BORDER
    }
    return false;
}
}
#endif
void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
                    Size ksize, Point anchor,
                    bool normalize, int borderType )
{
#ifdef HAVE_OPENCL
    CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
#endif

    Mat src = _src.getMat();
    int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
    if( ddepth < 0 )
        ddepth = sdepth;
    _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
    Mat dst = _dst.getMat();
    if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( src.rows == 1 )
            ksize.height = 1;
        if( src.cols == 1 )
            ksize.width = 1;
    }
#ifdef HAVE_TEGRA_OPTIMIZATION
    if ( tegra::useTegra() && tegra::box(src, dst, ksize, anchor, normalize, borderType) )
        return;
#endif

#ifdef HAVE_IPP
    int ippBorderType = borderType & ~BORDER_ISOLATED;
#endif
    Point ocvAnchor, ippAnchor;
    ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
    ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
    ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
    ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
    CV_IPP_RUN((normalize && !_src.isSubmatrix() && ddepth == sdepth &&
               (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
                ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
                _dst.cols() != ksize.width && _dst.rows() != ksize.height),
               ipp_boxfilter( _src, _dst, ddepth, ksize, anchor, normalize, borderType));


    Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
                        ksize, anchor, normalize, borderType );
    f->apply( src, dst );
}


void cv::blur( InputArray src, OutputArray dst,
               Size ksize, Point anchor, int borderType )
{
    boxFilter( src, dst, -1, ksize, anchor, true, borderType );
}
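/*
 * Minimal usage sketch for the two entry points above (illustrative; assumes
 * an 8-bit input image loaded elsewhere):
 *
 *   cv::Mat src = cv::imread("input.png"), mean8u, sums;
 *   cv::blur(src, mean8u, cv::Size(5, 5));        // normalized 5x5 box filter
 *   cv::boxFilter(src, sums, CV_32S, cv::Size(5, 5),
 *                 cv::Point(-1, -1), false);      // raw (unnormalized) window sums
 *
 * CV_32S is chosen for the unnormalized call so that sums of 8-bit samples
 * cannot saturate the destination type.
 */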
/****************************************************************************************\
                                    Squared Box Filter
\****************************************************************************************/

namespace cv
{

template<typename T, typename ST>
struct SqrRowSum :
        public BaseRowFilter
{
    SqrRowSum( int _ksize, int _anchor ) :
        BaseRowFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
    }

    virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        const T* S = (const T*)src;
        ST* D = (ST*)dst;
        int i = 0, k, ksz_cn = ksize*cn;

        width = (width - 1)*cn;
        for( k = 0; k < cn; k++, S++, D++ )
        {
            ST s = 0;
            for( i = 0; i < ksz_cn; i += cn )
            {
                ST val = (ST)S[i];
                s += val*val;
            }
            D[0] = s;
            for( i = 0; i < width; i += cn )
            {
                ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
                s += val1*val1 - val0*val0;
                D[i+cn] = s;
            }
        }
    }
};

static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
{
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );

    if( anchor < 0 )
        anchor = ksize/2;

    if( sdepth == CV_8U && ddepth == CV_32S )
        return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<SqrRowSum<short, double> >(ksize, anchor);
    if( sdepth == CV_32F && ddepth == CV_64F )
        return makePtr<SqrRowSum<float, double> >(ksize, anchor);
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<SqrRowSum<double, double> >(ksize, anchor);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
         srcType, sumType));

    return Ptr<BaseRowFilter>();
}

}

void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
                       Size ksize, Point anchor,
                       bool normalize, int borderType )
{
    int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
    Size size = _src.size();

    if( ddepth < 0 )
        ddepth = sdepth < CV_32F ? CV_32F : CV_64F;

    if( borderType != BORDER_CONSTANT && normalize )
    {
        if( size.height == 1 )
            ksize.height = 1;
        if( size.width == 1 )
            ksize.width = 1;
    }

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
#endif

    int sumDepth = CV_64F;
    if( sdepth == CV_8U )
        sumDepth = CV_32S;
    int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);

    Mat src = _src.getMat();
    _dst.create( size, dstType );
    Mat dst = _dst.getMat();

    Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
    Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
        dstType, ksize.height, anchor.y,
        normalize ? 1./(ksize.width*ksize.height) : 1);

    Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
                                                srcType, dstType, sumType, borderType );
    f->apply( src, dst );
}
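/*
 * sqrBoxFilter is typically paired with boxFilter to compute local image
 * statistics.  An illustrative sketch (src assumed defined; local variance
 * follows from E[x^2] - (E[x])^2):
 *
 *   cv::Mat mu, musq, sigma2;
 *   cv::boxFilter(src, mu, CV_32F, cv::Size(5, 5));      // local mean
 *   cv::sqrBoxFilter(src, musq, CV_32F, cv::Size(5, 5)); // local mean of squares
 *   sigma2 = musq - mu.mul(mu);                          // local variance
 */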
/****************************************************************************************\
                                     Gaussian Blur
\****************************************************************************************/

cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
{
    const int SMALL_GAUSSIAN_SIZE = 7;
    static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
    {
        {1.f},
        {0.25f, 0.5f, 0.25f},
        {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
        {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
    };

    const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
        small_gaussian_tab[n>>1] : 0;

    CV_Assert( ktype == CV_32F || ktype == CV_64F );
    Mat kernel(n, 1, ktype);
    float* cf = kernel.ptr<float>();
    double* cd = kernel.ptr<double>();

    double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8;
    double scale2X = -0.5/(sigmaX*sigmaX);
    double sum = 0;

    int i;
    for( i = 0; i < n; i++ )
    {
        double x = i - (n-1)*0.5;
        double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
        if( ktype == CV_32F )
        {
            cf[i] = (float)t;
            sum += cf[i];
        }
        else
        {
            cd[i] = t;
            sum += cd[i];
        }
    }

    sum = 1./sum;
    for( i = 0; i < n; i++ )
    {
        if( ktype == CV_32F )
            cf[i] = (float)(cf[i]*sum);
        else
            cd[i] *= sum;
    }

    return kernel;
}
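/*
 * Worked example: getGaussianKernel(3, -1, CV_32F).  Since sigma <= 0, n is
 * odd and n <= SMALL_GAUSSIAN_SIZE, fixed_kernel points at
 * small_gaussian_tab[1], so the result is exactly [0.25, 0.5, 0.25] (it
 * already sums to 1, so the normalization pass leaves it unchanged).  The
 * fallback sigma computed in that case would be ((3-1)*0.5 - 1)*0.3 + 0.8 = 0.8.
 */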

#ifdef HAVE_IPP
namespace cv
{
static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
                              double sigma1, double sigma2,
                              int borderType )
{
#if IPP_VERSION_X100 >= 810
    int type = _src.type();
    Size size = _src.size();

    if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( size.height == 1 )
            ksize.height = 1;
        if( size.width == 1 )
            ksize.width = 1;
    }

    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    if ((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn == 1 || cn == 3) &&
        sigma1 == sigma2 && ksize.width == ksize.height && sigma1 != 0.0 )
    {
        IppiBorderType ippBorder = ippiGetBorderType(borderType);
        if (ippBorderConst == ippBorder || ippBorderRepl == ippBorder)
        {
            Mat src = _src.getMat(), dst = _dst.getMat();
            IppiSize roiSize = { src.cols, src.rows };
            IppDataType dataType = ippiGetDataType(depth);
            Ipp32s specSize = 0, bufferSize = 0;

            if (ippiFilterGaussianGetBufferSize(roiSize, (Ipp32u)ksize.width, dataType, cn, &specSize, &bufferSize) >= 0)
            {
                IppAutoBuffer<IppFilterGaussianSpec> spec(specSize);
                IppAutoBuffer<Ipp8u> buffer(bufferSize);

                if (ippiFilterGaussianInit(roiSize, (Ipp32u)ksize.width, (Ipp32f)sigma1, ippBorder, dataType, cn, spec, buffer) >= 0)
                {
#define IPP_FILTER_GAUSS_C1(ippfavor) \
    { \
        Ipp##ippfavor borderValues = 0; \
        status = ippiFilterGaussianBorder_##ippfavor##_C1R(src.ptr<Ipp##ippfavor>(), (int)src.step, \
                dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
    }

#define IPP_FILTER_GAUSS_CN(ippfavor, ippcn) \
    { \
        Ipp##ippfavor borderValues[] = { 0, 0, 0 }; \
        status = ippiFilterGaussianBorder_##ippfavor##_C##ippcn##R(src.ptr<Ipp##ippfavor>(), (int)src.step, \
                dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
    }

                    IppStatus status = ippStsErr;
#if !HAVE_ICV
#if IPP_VERSION_X100 > 900 // Buffer overflow may happen in IPP 9.0.0 and less
                    if (type == CV_8UC1)
                        IPP_FILTER_GAUSS_C1(8u)
                    else
#endif
                    if (type == CV_8UC3)
                        IPP_FILTER_GAUSS_CN(8u, 3)
                    else if (type == CV_16UC1)
                        IPP_FILTER_GAUSS_C1(16u)
                    else if (type == CV_16UC3)
                        IPP_FILTER_GAUSS_CN(16u, 3)
                    else if (type == CV_16SC1)
                        IPP_FILTER_GAUSS_C1(16s)
                    else if (type == CV_16SC3)
                        IPP_FILTER_GAUSS_CN(16s, 3)
                    else if (type == CV_32FC3)
                        IPP_FILTER_GAUSS_CN(32f, 3)
                    else
#endif
                    if (type == CV_32FC1)
                        IPP_FILTER_GAUSS_C1(32f)

                    if(status >= 0)
                        return true;

#undef IPP_FILTER_GAUSS_C1
#undef IPP_FILTER_GAUSS_CN
                }
            }
        }
    }
#else
    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
#endif
    return false;
}
}
#endif


void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
                       double sigma1, double sigma2,
                       int borderType )
{
    int type = _src.type();
    Size size = _src.size();
    _dst.create( size, type );

    if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( size.height == 1 )
            ksize.height = 1;
        if( size.width == 1 )
            ksize.width = 1;
    }

    if( ksize.width == 1 && ksize.height == 1 )
    {
        _src.copyTo(_dst);
        return;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    Mat src = _src.getMat();
    Mat dst = _dst.getMat();
    if(sigma1 == 0 && sigma2 == 0 && tegra::useTegra() && tegra::gaussian(src, dst, ksize, borderType))
        return;
#endif

    CV_IPP_RUN(true, ipp_GaussianBlur( _src, _dst, ksize, sigma1, sigma2, borderType));

    Mat kx, ky;
    createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
    sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
}
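// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the library source): a typical call into
// cv::GaussianBlur() above. The file name and variable names are examples.
#if 0
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

static void demoGaussianBlur()
{
    cv::Mat src = cv::imread("input.png"), dst;
    // 5x5 kernel, sigma 1.5 in both directions; the separable row/column
    // kernels are built by createGaussianKernels() above.
    cv::GaussianBlur(src, dst, cv::Size(5, 5), 1.5, 1.5, cv::BORDER_REPLICATE);
}
#endif
// ---------------------------------------------------------------------------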

/****************************************************************************************\
                                      Median Filter
\****************************************************************************************/

namespace cv
{
typedef ushort HT;

/**
 * This structure represents a two-tier histogram. The first tier (known as the
 * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
 * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
 * coarse bucket designated by the 4 MSBs of the fine bucket value.
 *
 * The structure is aligned on 16 bytes, which is a prerequisite for SIMD
 * instructions. Each bucket is 16 bit wide, which means that extra care must be
 * taken to prevent overflow.
 */
typedef struct
{
    HT coarse[16];
    HT fine[16][16];
} Histogram;


#if CV_SSE2
#define MEDIAN_HAVE_SIMD 1

static inline void histogram_add_simd( const HT x[16], HT y[16] )
{
    const __m128i* rx = (const __m128i*)x;
    __m128i* ry = (__m128i*)y;
    __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
    __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
    _mm_store_si128(ry+0, r0);
    _mm_store_si128(ry+1, r1);
}

static inline void histogram_sub_simd( const HT x[16], HT y[16] )
{
    const __m128i* rx = (const __m128i*)x;
    __m128i* ry = (__m128i*)y;
    __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
    __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
    _mm_store_si128(ry+0, r0);
    _mm_store_si128(ry+1, r1);
}

#elif CV_NEON
#define MEDIAN_HAVE_SIMD 1

static inline void histogram_add_simd( const HT x[16], HT y[16] )
{
    vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
    vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
}

static inline void histogram_sub_simd( const HT x[16], HT y[16] )
{
    vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
    vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
}

#else
#define MEDIAN_HAVE_SIMD 0
#endif


static inline void histogram_add( const HT x[16], HT y[16] )
{
    int i;
    for( i = 0; i < 16; ++i )
        y[i] = (HT)(y[i] + x[i]);
}

static inline void histogram_sub( const HT x[16], HT y[16] )
{
    int i;
    for( i = 0; i < 16; ++i )
        y[i] = (HT)(y[i] - x[i]);
}

static inline void histogram_muladd( int a, const HT x[16],
                                     HT y[16] )
{
    for( int i = 0; i < 16; ++i )
        y[i] = (HT)(y[i] + a * x[i]);
}
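// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the library source): how a pixel value
// splits across the two histogram tiers above. A value v is counted in fine
// bucket fine[v >> 4][v & 0xF] and, in parallel, in coarse bucket
// coarse[v >> 4]; the median search scans the 16 coarse buckets first and
// then only the single 16-entry fine row that contains the median.
#if 0
static void demoTwoTierInsert( Histogram& h, unsigned char v )
{
    h.coarse[v >> 4]++;          // tier 1: the 4 MSBs select one of 16 buckets
    h.fine[v >> 4][v & 0xF]++;   // tier 2: full 8-bit resolution
}
#endif
// ---------------------------------------------------------------------------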

static void
medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
{
/**
 * HOP is short for Histogram OPeration. This macro makes an operation \a op on
 * histogram \a h for pixel value \a x. It takes care of handling both levels.
 */
#define HOP(h,x,op) \
    h.coarse[x>>4] op, \
    *((HT*)h.fine + x) op

#define COP(c,j,x,op) \
    h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
    h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op

    int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
    size_t sstep = _src.step, dstep = _dst.step;
    Histogram CV_DECL_ALIGNED(16) H[4];
    HT CV_DECL_ALIGNED(16) luc[4][16];

    int STRIPE_SIZE = std::min( _dst.cols, 512/cn );

    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
    HT* h_coarse = alignPtr(&_h_coarse[0], 16);
    HT* h_fine = alignPtr(&_h_fine[0], 16);
#if MEDIAN_HAVE_SIMD
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
#endif

    for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
    {
        int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
        const uchar* src = _src.ptr() + x*cn;
        uchar* dst = _dst.ptr() + (x - r)*cn;

        memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
        memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );

        // First row initialization
        for( c = 0; c < cn; c++ )
        {
            for( j = 0; j < n; j++ )
                COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );

            for( i = 1; i < r; i++ )
            {
                const uchar* p = src + sstep*std::min(i, m-1);
                for ( j = 0; j < n; j++ )
                    COP( c, j, p[cn*j+c], ++ );
            }
        }

        for( i = 0; i < m; i++ )
        {
            const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
            const uchar* p1 = src + sstep * std::min( m-1, i+r );

            memset( H, 0, cn*sizeof(H[0]) );
            memset( luc, 0, cn*sizeof(luc[0]) );
            for( c = 0; c < cn; c++ )
            {
                // Update column histograms for the entire row.
                for( j = 0; j < n; j++ )
                {
                    COP( c, j, p0[j*cn + c], -- );
                    COP( c, j, p1[j*cn + c], ++ );
                }

                // First column initialization
                for( k = 0; k < 16; ++k )
                    histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );

#if MEDIAN_HAVE_SIMD
                if( useSIMD )
                {
                    for( j = 0; j < 2*r; ++j )
                        histogram_add_simd( &h_coarse[16*(n*c+j)], H[c].coarse );

                    for( j = r; j < n-r; j++ )
                    {
                        int t = 2*r*r + 2*r, b, sum = 0;
                        HT* segment;

                        histogram_add_simd( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );

                        // Find median at coarse level
                        for ( k = 0; k < 16 ; ++k )
                        {
                            sum += H[c].coarse[k];
                            if ( sum > t )
                            {
                                sum -= H[c].coarse[k];
                                break;
                            }
                        }
                        assert( k < 16 );

                        /* Update corresponding histogram segment */
                        if ( luc[c][k] <= j-r )
                        {
                            memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
                            for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
                                histogram_add_simd( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );

                            if ( luc[c][k] < j+r+1 )
                            {
                                histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
                                luc[c][k] = (HT)(j+r+1);
                            }
                        }
                        else
                        {
                            for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                            {
                                histogram_sub_simd( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
                                histogram_add_simd( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
                            }
                        }

                        histogram_sub_simd( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );

                        /* Find median in segment */
                        segment = H[c].fine[k];
                        for ( b = 0; b < 16 ; b++ )
                        {
                            sum += segment[b];
                            if ( sum > t )
                            {
                                dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
                                break;
                            }
                        }
                        assert( b < 16 );
                    }
                }
                else
#endif
                {
                    for( j = 0; j < 2*r; ++j )
                        histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );

                    for( j = r; j < n-r; j++ )
                    {
                        int t = 2*r*r + 2*r, b, sum = 0;
                        HT* segment;

                        histogram_add( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );

                        // Find median at coarse level
                        for ( k = 0; k < 16 ; ++k )
                        {
                            sum += H[c].coarse[k];
                            if ( sum > t )
                            {
                                sum -= H[c].coarse[k];
                                break;
                            }
                        }
                        assert( k < 16 );

                        /* Update corresponding histogram segment */
                        if ( luc[c][k] <= j-r )
                        {
                            memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
                            for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
                                histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );

                            if ( luc[c][k] < j+r+1 )
                            {
                                histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
                                luc[c][k] = (HT)(j+r+1);
                            }
                        }
                        else
                        {
                            for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                            {
                                histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
                                histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
                            }
                        }

                        histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );

                        /* Find median in segment */
                        segment = H[c].fine[k];
                        for ( b = 0; b < 16 ; b++ )
                        {
                            sum += segment[b];
                            if ( sum > t )
                            {
                                dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
                                break;
                            }
                        }
                        assert( b < 16 );
                    }
                }
            }
        }
    }

#undef HOP
#undef COP
}

static void
medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
{
#define N  16
    int     zone0[4][N];
    int     zone1[4][N*N];
    int     x, y;
    int     n2 = m*m/2;
    Size    size = _dst.size();
    const uchar* src = _src.ptr();
    uchar*  dst = _dst.ptr();
    int     src_step = (int)_src.step, dst_step = (int)_dst.step;
    int     cn = _src.channels();
    const uchar*  src_max = src + size.height*src_step;

#define UPDATE_ACC01( pix, cn, op ) \
    { \
        int p = (pix); \
        zone1[cn][p] op; \
        zone0[cn][p >> 4] op; \
    }

    //CV_Assert( size.height >= nx && size.width >= nx );
    for( x = 0; x < size.width; x++, src += cn, dst += cn )
    {
        uchar* dst_cur = dst;
        const uchar* src_top = src;
        const uchar* src_bottom = src;
        int k, c;
        int src_step1 = src_step, dst_step1 = dst_step;

        if( x % 2 != 0 )
        {
            src_bottom = src_top += src_step*(size.height-1);
            dst_cur += dst_step*(size.height-1);
            src_step1 = -src_step1;
            dst_step1 = -dst_step1;
        }

        // init accumulator
        memset( zone0, 0, sizeof(zone0[0])*cn );
        memset( zone1, 0, sizeof(zone1[0])*cn );

        for( y = 0; y <= m/2; y++ )
        {
            for( c = 0; c < cn; c++ )
            {
                if( y > 0 )
                {
                    for( k = 0; k < m*cn; k += cn )
                        UPDATE_ACC01( src_bottom[k+c], c, ++ );
                }
                else
                {
                    for( k = 0; k < m*cn; k += cn )
                        UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
                }
            }

            if( (src_step1 > 0 && y < size.height-1) ||
                (src_step1 < 0 && size.height-y-1 > 0) )
                src_bottom += src_step1;
        }

        for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
        {
            // find median
            for( c = 0; c < cn; c++ )
            {
                int s = 0;
                for( k = 0; ; k++ )
                {
                    int t = s + zone0[c][k];
                    if( t > n2 ) break;
                    s = t;
                }

                for( k *= N; ;k++ )
                {
                    s += zone1[c][k];
                    if( s > n2 ) break;
                }

                dst_cur[c] = (uchar)k;
            }

            if( y+1 == size.height )
                break;

            if( cn == 1 )
            {
                for( k = 0; k < m; k++ )
                {
                    int p = src_top[k];
                    int q = src_bottom[k];
                    zone1[0][p]--;
                    zone0[0][p>>4]--;
                    zone1[0][q]++;
                    zone0[0][q>>4]++;
                }
            }
            else if( cn == 3 )
            {
                for( k = 0; k < m*3; k += 3 )
                {
                    UPDATE_ACC01( src_top[k], 0, -- );
                    UPDATE_ACC01( src_top[k+1], 1, -- );
                    UPDATE_ACC01( src_top[k+2], 2, -- );

                    UPDATE_ACC01( src_bottom[k], 0, ++ );
                    UPDATE_ACC01( src_bottom[k+1], 1, ++ );
                    UPDATE_ACC01( src_bottom[k+2], 2, ++ );
                }
            }
            else
            {
                assert( cn == 4 );
                for( k = 0; k < m*4; k += 4 )
                {
                    UPDATE_ACC01( src_top[k], 0, -- );
                    UPDATE_ACC01( src_top[k+1], 1, -- );
                    UPDATE_ACC01( src_top[k+2], 2, -- );
                    UPDATE_ACC01( src_top[k+3], 3, -- );

                    UPDATE_ACC01( src_bottom[k], 0, ++ );
                    UPDATE_ACC01( src_bottom[k+1], 1, ++ );
                    UPDATE_ACC01( src_bottom[k+2], 2, ++ );
                    UPDATE_ACC01( src_bottom[k+3], 3, ++ );
                }
            }

            if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
                (src_step1 < 0 && src_bottom + src_step1 >= src) )
                src_bottom += src_step1;

            if( y >= m/2 )
                src_top += src_step1;
        }
    }
#undef N
#undef UPDATE_ACC01
}


struct MinMax8u
{
    typedef uchar value_type;
    typedef int arg_type;
    enum { SIZE = 1 };
    arg_type load(const uchar* ptr) { return *ptr; }
    void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        int t = CV_FAST_CAST_8U(a - b);
        b += t; a -= t;
    }
};

struct MinMax16u
{
    typedef ushort value_type;
    typedef int arg_type;
    enum { SIZE = 1 };
    arg_type load(const ushort* ptr) { return *ptr; }
    void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = std::min(a, b);
        b = std::max(b, t);
    }
};

struct MinMax16s
{
    typedef short value_type;
    typedef int arg_type;
    enum { SIZE = 1 };
    arg_type load(const short* ptr) { return *ptr; }
    void store(short* ptr, arg_type val) { *ptr = (short)val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = std::min(a, b);
        b = std::max(b, t);
    }
};

struct MinMax32f
{
    typedef float value_type;
    typedef float arg_type;
    enum { SIZE = 1 };
    arg_type load(const float* ptr) { return *ptr; }
    void store(float* ptr, arg_type val) { *ptr = val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = std::min(a, b);
        b = std::max(b, t);
    }
};
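// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the library source): the operator() of the
// MinMax* structs above is a compare-exchange that leaves min(a,b) in a and
// max(a,b) in b. medianBlur_SortNet() below chains such exchanges into fixed
// sorting networks; e.g. three exchanges extract a 3-element median (values
// assumed to lie in the 0..255 range for MinMax8u):
#if 0
static int demoMedian3( int p0, int p1, int p2 )
{
    MinMax8u op;
    op(p0, p1);    // now p0 <= p1
    op(p1, p2);    // now p2 holds the maximum of all three
    op(p0, p1);    // now p1 holds the median
    return p1;
}
#endif
// ---------------------------------------------------------------------------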

#if CV_SSE2

struct MinMaxVec8u
{
    typedef uchar value_type;
    typedef __m128i arg_type;
    enum { SIZE = 16 };
    arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = _mm_min_epu8(a, b);
        b = _mm_max_epu8(b, t);
    }
};


struct MinMaxVec16u
{
    typedef ushort value_type;
    typedef __m128i arg_type;
    enum { SIZE = 8 };
    arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = _mm_subs_epu16(a, b);
        a = _mm_subs_epu16(a, t);
        b = _mm_adds_epu16(b, t);
    }
};


struct MinMaxVec16s
{
    typedef short value_type;
    typedef __m128i arg_type;
    enum { SIZE = 8 };
    arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = _mm_min_epi16(a, b);
        b = _mm_max_epi16(b, t);
    }
};


struct MinMaxVec32f
{
    typedef float value_type;
    typedef __m128 arg_type;
    enum { SIZE = 4 };
    arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
    void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = _mm_min_ps(a, b);
        b = _mm_max_ps(b, t);
    }
};

#elif CV_NEON

struct MinMaxVec8u
{
    typedef uchar value_type;
    typedef uint8x16_t arg_type;
    enum { SIZE = 16 };
    arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
    void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_u8(a, b);
        b = vmaxq_u8(b, t);
    }
};


struct MinMaxVec16u
{
    typedef ushort value_type;
    typedef uint16x8_t arg_type;
    enum { SIZE = 8 };
    arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
    void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_u16(a, b);
        b = vmaxq_u16(b, t);
    }
};


struct MinMaxVec16s
{
    typedef short value_type;
    typedef int16x8_t arg_type;
    enum { SIZE = 8 };
    arg_type load(const short* ptr) { return vld1q_s16(ptr); }
    void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_s16(a, b);
        b = vmaxq_s16(b, t);
    }
};


struct MinMaxVec32f
{
    typedef float value_type;
    typedef float32x4_t arg_type;
    enum { SIZE = 4 };
    arg_type load(const float* ptr) { return vld1q_f32(ptr); }
    void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_f32(a, b);
        b = vmaxq_f32(b, t);
    }
};


#else

typedef MinMax8u MinMaxVec8u;
typedef MinMax16u MinMaxVec16u;
typedef MinMax16s MinMaxVec16s;
typedef MinMax32f MinMaxVec32f;

#endif

template<class Op, class VecOp>
static void
medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
{
    typedef typename Op::value_type T;
    typedef typename Op::arg_type WT;
    typedef typename VecOp::arg_type VT;

    const T* src = _src.ptr<T>();
    T* dst = _dst.ptr<T>();
    int sstep = (int)(_src.step/sizeof(T));
    int dstep = (int)(_dst.step/sizeof(T));
    Size size = _dst.size();
    int i, j, k, cn = _src.channels();
    Op op;
    VecOp vop;
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);

    if( m == 3 )
    {
        if( size.width == 1 || size.height == 1 )
        {
            int len = size.width + size.height - 1;
            int sdelta = size.height == 1 ? cn : sstep;
            int sdelta0 = size.height == 1 ? 0 : sstep - cn;
            int ddelta = size.height == 1 ? cn : dstep;

            for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
                for( j = 0; j < cn; j++, src++ )
                {
                    WT p0 = src[i > 0 ? -sdelta : 0];
                    WT p1 = src[0];
                    WT p2 = src[i < len - 1 ? sdelta : 0];

                    op(p0, p1); op(p1, p2); op(p0, p1);
                    dst[j] = (T)p1;
                }
            return;
        }

        size.width *= cn;
        for( i = 0; i < size.height; i++, dst += dstep )
        {
            const T* row0 = src + std::max(i - 1, 0)*sstep;
            const T* row1 = src + i*sstep;
            const T* row2 = src + std::min(i + 1, size.height-1)*sstep;
            int limit = useSIMD ? cn : size.width;

            for(j = 0;; )
            {
                for( ; j < limit; j++ )
                {
                    int j0 = j >= cn ? j - cn : j;
                    int j2 = j < size.width - cn ? j + cn : j;
                    WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
                    WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
                    WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2];

                    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
                    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
                    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
                    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
                    op(p4, p2); op(p6, p4); op(p4, p2);
                    dst[j] = (T)p4;
                }

                if( limit == size.width )
                    break;

                for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
                {
                    VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
                    VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
                    VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn);

                    vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
                    vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
                    vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
                    vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
                    vop(p4, p2); vop(p6, p4); vop(p4, p2);
                    vop.store(dst+j, p4);
                }

                limit = size.width;
            }
        }
    }
    else if( m == 5 )
    {
        if( size.width == 1 || size.height == 1 )
        {
            int len = size.width + size.height - 1;
            int sdelta = size.height == 1 ? cn : sstep;
            int sdelta0 = size.height == 1 ? 0 : sstep - cn;
            int ddelta = size.height == 1 ? cn : dstep;

            for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
                for( j = 0; j < cn; j++, src++ )
                {
                    int i1 = i > 0 ? -sdelta : 0;
                    int i0 = i > 1 ? -sdelta*2 : i1;
                    int i3 = i < len-1 ? sdelta : 0;
                    int i4 = i < len-2 ? sdelta*2 : i3;
                    WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];

                    op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2);
                    op(p2, p4); op(p1, p3); op(p1, p2);
                    dst[j] = (T)p2;
                }
            return;
        }

        size.width *= cn;
        for( i = 0; i < size.height; i++, dst += dstep )
        {
            const T* row[5];
            row[0] = src + std::max(i - 2, 0)*sstep;
            row[1] = src + std::max(i - 1, 0)*sstep;
            row[2] = src + i*sstep;
            row[3] = src + std::min(i + 1, size.height-1)*sstep;
            row[4] = src + std::min(i + 2, size.height-1)*sstep;
            int limit = useSIMD ? cn*2 : size.width;

            for(j = 0;; )
            {
                for( ; j < limit; j++ )
                {
                    WT p[25];
                    int j1 = j >= cn ? j - cn : j;
                    int j0 = j >= cn*2 ? j - cn*2 : j1;
                    int j3 = j < size.width - cn ? j + cn : j;
                    int j4 = j < size.width - cn*2 ? j + cn*2 : j3;
                    for( k = 0; k < 5; k++ )
                    {
                        const T* rowk = row[k];
                        p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
                        p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
                        p[k*5+4] = rowk[j4];
                    }

                    op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]);
                    op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]);
                    op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]);
                    op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]);
                    op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]);
                    op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]);
                    op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]);
                    op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]);
                    op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]);
                    op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]);
                    op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]);
                    op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]);
                    op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]);
                    op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]);
                    op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]);
                    op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]);
                    op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]);
                    op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]);
                    op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]);
                    op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]);
                    op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]);
                    op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]);
                    op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]);
                    dst[j] = (T)p[12];
                }

                if( limit == size.width )
                    break;

                for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
                {
                    VT p[25];
                    for( k = 0; k < 5; k++ )
                    {
                        const T* rowk = row[k];
                        p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
                        p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
                        p[k*5+4] = vop.load(rowk+j+cn*2);
                    }

                    vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
                    vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
                    vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
                    vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
                    vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
                    vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
                    vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
                    vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
                    vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]);
                    vop(p[13], p[14]); vop(p[16], p[17]);
                    vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
                    vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
                    vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
                    vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
                    vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
                    vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
                    vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
                    vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
                    vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
                    vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
                    vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
                    vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
                    vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
                    vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
                    vop.store(dst+j, p[12]);
                }

                limit = size.width;
            }
        }
    }
}

#ifdef HAVE_OPENCL

static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m)
{
    size_t localsize[2] = { 16, 16 };
    size_t globalsize[2];
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) )
        return false;

    Size imgSize = _src.size();
    bool useOptimized = (1 == cn) &&
                        (size_t)imgSize.width >= localsize[0] * 8  &&
                        (size_t)imgSize.height >= localsize[1] * 8 &&
                        imgSize.width % 4 == 0 &&
                        imgSize.height % 4 == 0 &&
                        (ocl::Device::getDefault().isIntel());

    cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ;
    cv::String kdefs = useOptimized ?
                         format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type),
                                ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn)
                         :
                         format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ;

    ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() );

    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    _dst.create(src.size(), type);
    UMat dst = _dst.getUMat();

    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));

    if( useOptimized )
    {
        globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0];
        globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1];
    }
    else
    {
        globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0];
        globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1];
    }

    return k.run(2, globalsize, localsize, false);
}

#endif

}

#ifdef HAVE_IPP
namespace cv
{
static bool ipp_medianFilter( InputArray _src0, OutputArray _dst, int ksize )
{
#if IPP_VERSION_X100 >= 810
    Mat src0 = _src0.getMat();
    _dst.create( src0.size(), src0.type() );
    Mat dst = _dst.getMat();

#define IPP_FILTER_MEDIAN_BORDER(ippType, ippDataType, flavor) \
    do \
    { \
        if (ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, \
            ippDataType, CV_MAT_CN(type), &bufSize) >= 0) \
        { \
            Ipp8u * buffer = ippsMalloc_8u(bufSize); \
            IppStatus status = ippiFilterMedianBorder_##flavor(src.ptr<ippType>(), (int)src.step, \
                dst.ptr<ippType>(), (int)dst.step, dstRoiSize, maskSize, \
                ippBorderRepl, (ippType)0, buffer); \
            ippsFree(buffer); \
            if (status >= 0) \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return true; \
            } \
        } \
    } \
    while ((void)0, 0)

    if( ksize <= 5 )
    {
        Ipp32s bufSize;
        IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
        Mat src;
        if( dst.data != src0.data )
            src = src0;
        else
            src0.copyTo(src);

        int type = src0.type();
        if (type == CV_8UC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp8u, ipp8u, 8u_C1R);
        else if (type == CV_16UC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp16u, ipp16u, 16u_C1R);
        else if (type == CV_16SC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp16s, ipp16s, 16s_C1R);
        else if (type == CV_32FC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp32f, ipp32f, 32f_C1R);
    }
#undef IPP_FILTER_MEDIAN_BORDER
#else
    CV_UNUSED(_src0); CV_UNUSED(_dst); CV_UNUSED(ksize);
#endif
    return false;
}
}
#endif

void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
{
    CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));

    if( ksize <= 1 )
    {
        _src0.copyTo(_dst);
        return;
    }

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_dst.isUMat(),
               ocl_medianFilter(_src0,_dst, ksize))
#endif

    Mat src0 = _src0.getMat();
    _dst.create( src0.size(), src0.type() );
    Mat dst = _dst.getMat();

    CV_IPP_RUN(IPP_VERSION_X100 >= 810 && ksize <= 5, ipp_medianFilter(_src0,_dst, ksize));

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
        return;
#endif

    bool useSortNet = ksize == 3 || (ksize == 5
#if !(CV_SSE2 || CV_NEON)
            && src0.depth() > CV_8U
#endif
        );

    Mat src;
    if( useSortNet )
    {
        if( dst.data != src0.data )
            src = src0;
        else
            src0.copyTo(src);

        if( src.depth() == CV_8U )
            medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
        else if( src.depth() == CV_16U )
            medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
        else if( src.depth() == CV_16S )
            medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
        else if( src.depth() == CV_32F )
            medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
        else
            CV_Error(CV_StsUnsupportedFormat, "");

        return;
    }
    else
    {
        cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE );

        int cn = src0.channels();
        CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );

        double img_size_mp = (double)(src0.total())/(1 << 20);
        if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
            (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
            medianBlur_8u_Om( src, dst, ksize );
        else
            medianBlur_8u_O1( src, dst, ksize );
    }
}
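// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the library source): cv::medianBlur()
// above dispatches by kernel size and depth -- sorting networks for 3x3/5x5,
// the O(m) or O(1) histogram code for larger 8-bit kernels. The file name
// and variable names are examples.
#if 0
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

static void demoMedianBlur()
{
    cv::Mat noisy = cv::imread("noisy.png"), clean;
    cv::medianBlur(noisy, clean, 3);   // 3x3 -> medianBlur_SortNet
    cv::medianBlur(noisy, clean, 9);   // 9x9, 8-bit -> histogram-based path
}
#endif
// ---------------------------------------------------------------------------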

/****************************************************************************************\
                                   Bilateral Filtering
\****************************************************************************************/

namespace cv
{

class BilateralFilter_8u_Invoker :
    public ParallelLoopBody
{
public:
    BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
        int* _space_ofs, float *_space_weight, float *_color_weight) :
        temp(&_temp), dest(&_dest), radius(_radius),
        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
    {
    }

    virtual void operator() (const Range& range) const
    {
        int i, j, cn = dest->channels(), k;
        Size size = dest->size();
#if CV_SSE3
        int CV_DECL_ALIGNED(16) buf[4];
        float CV_DECL_ALIGNED(16) bufSum[4];
        static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
        bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
#endif

        for( i = range.start; i < range.end; i++ )
        {
            const uchar* sptr = temp->ptr(i+radius) + radius*cn;
            uchar* dptr = dest->ptr(i);

            if( cn == 1 )
            {
                for( j = 0; j < size.width; j++ )
                {
                    float sum = 0, wsum = 0;
                    int val0 = sptr[j];
                    k = 0;
#if CV_SSE3
                    if( haveSSE3 )
                    {
                        __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk - 4; k += 4 )
                        {
                            __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
                                                      sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);

                            __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
                            _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));

                            __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
                                                    color_weight[buf[1]],color_weight[buf[0]]);
                            __m128 _sw = _mm_loadu_ps(space_weight+k);
                            __m128 _w = _mm_mul_ps(_cw, _sw);
                            _cw = _mm_mul_ps(_w, _valF);

                            _sw = _mm_hadd_ps(_w, _cw);
                            _sw = _mm_hadd_ps(_sw, _sw);
                            _mm_storel_pi((__m64*)bufSum, _sw);

                            sum += bufSum[1];
                            wsum += bufSum[0];
                        }
                    }
#endif
                    for( ; k < maxk; k++ )
                    {
                        int val = sptr[j + space_ofs[k]];
                        float w = space_weight[k]*color_weight[std::abs(val - val0)];
                        sum += val*w;
                        wsum += w;
                    }
                    // overflow is not possible here => there is no need to use cv::saturate_cast
                    dptr[j] = (uchar)cvRound(sum/wsum);
                }
            }
            else
            {
                assert( cn == 3 );
                for( j = 0; j < size.width*3; j += 3 )
                {
                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
                    int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
                    k = 0;
#if CV_SSE3
                    if( haveSSE3 )
                    {
                        const __m128i izero = _mm_setzero_si128();
                        const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
                        const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
                        const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk - 4; k += 4 )
                        {
                            const int* const sptr_k0 = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
                            const int* const sptr_k1 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
                            const int* const sptr_k2 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
                            const int* const sptr_k3 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);

                            __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
                            __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
                            __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
                            __m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));

                            _MM_TRANSPOSE4_PS(_b, _g, _r, _z);

                            __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
                            __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
                            __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));

                            bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
                            _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));

                            __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
                                                   color_weight[buf[1]],color_weight[buf[0]]);
                            __m128 _sw = _mm_loadu_ps(space_weight+k);

                            _w = _mm_mul_ps(_w,_sw);
                            _b = _mm_mul_ps(_b, _w);
                            _g = _mm_mul_ps(_g, _w);
                            _r = _mm_mul_ps(_r, _w);

                            _w = _mm_hadd_ps(_w, _b);
                            _g = _mm_hadd_ps(_g, _r);

                            _w = _mm_hadd_ps(_w, _g);
                            _mm_store_ps(bufSum, _w);

                            wsum += bufSum[0];
                            sum_b += bufSum[1];
                            sum_g += bufSum[2];
                            sum_r += bufSum[3];
                        }
                    }
#endif

                    for( ; k < maxk; k++ )
                    {
                        const uchar* sptr_k = sptr + j + space_ofs[k];
                        int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
                        float w = space_weight[k]*color_weight[std::abs(b - b0) +
                                                               std::abs(g - g0) + std::abs(r - r0)];
                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
                        wsum += w;
                    }
                    wsum = 1.f/wsum;
                    b0 = cvRound(sum_b*wsum);
                    g0 = cvRound(sum_g*wsum);
                    r0 = cvRound(sum_r*wsum);
                    dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
                }
            }
        }
    }

private:
    const Mat *temp;
    Mat *dest;
    int radius, maxk, *space_ofs;
    float *space_weight, *color_weight;
};

#if defined (HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) && IPP_DISABLE_BLOCK
class IPPBilateralFilter_8u_Invoker :
    public ParallelLoopBody
{
public:
    IPPBilateralFilter_8u_Invoker(Mat &_src, Mat &_dst, double _sigma_color, double _sigma_space, int _radius, bool *_ok) :
        ParallelLoopBody(), src(_src), dst(_dst), sigma_color(_sigma_color), sigma_space(_sigma_space), radius(_radius), ok(_ok)
    {
        *ok = true;
    }

    virtual void operator() (const Range& range) const
    {
        int d = radius * 2 + 1;
        IppiSize kernel = {d, d};
        IppiSize roi={dst.cols, range.end - range.start};
        int bufsize=0;
        if (0 > ippiFilterBilateralGetBufSize_8u_C1R( ippiFilterBilateralGauss, roi, kernel, &bufsize))
        {
            *ok = false;
            return;
        }
        AutoBuffer<uchar> buf(bufsize);
        IppiFilterBilateralSpec *pSpec = (IppiFilterBilateralSpec *)alignPtr(&buf[0], 32);
        if (0 > ippiFilterBilateralInit_8u_C1R( ippiFilterBilateralGauss, kernel, (Ipp32f)sigma_color, (Ipp32f)sigma_space, 1, pSpec ))
        {
            *ok = false;
            return;
        }
        if (0 > ippiFilterBilateral_8u_C1R( src.ptr<uchar>(range.start) + radius * ((int)src.step[0] + 1), (int)src.step[0], dst.ptr<uchar>(range.start), (int)dst.step[0], roi, kernel, pSpec ))
            *ok = false;
        else
        {
            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
        }
    }
private:
    Mat &src;
    Mat &dst;
    double sigma_color;
    double sigma_space;
    int radius;
    bool *ok;
    const IPPBilateralFilter_8u_Invoker& operator= (const IPPBilateralFilter_8u_Invoker&);
};
#endif

#ifdef HAVE_OPENCL

static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
                                   double sigma_color, double sigma_space,
                                   int borderType)
{
#ifdef ANDROID
    if (ocl::Device::getDefault().isNVidia())
        return false;
#endif

    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    int i, j, maxk, radius;

    if (depth != CV_8U || cn > 4)
        return false;

    if (sigma_color <= 0)
        sigma_color = 1;
    if (sigma_space <= 0)
        sigma_space = 1;

    double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
    double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);

    if ( d <= 0 )
        radius = cvRound(sigma_space * 1.5);
    else
        radius = d / 2;
    radius = MAX(radius, 1);
    d = radius * 2 + 1;

    UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
    if (src.u == dst.u)
        return false;

    copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
    std::vector<float> _space_weight(d * d);
    std::vector<int> _space_ofs(d * d);
    float * const space_weight = &_space_weight[0];
    int * const space_ofs = &_space_ofs[0];

    // initialize space-related bilateral filter coefficients
    for( i = -radius, maxk = 0; i <= radius; i++ )
        for( j = -radius; j <= radius; j++ )
        {
            double r = std::sqrt((double)i * i + (double)j * j);
            if ( r > radius )
                continue;
            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
            space_ofs[maxk++] = (int)(i * temp.step + j * cn);
        }

    char cvt[3][40];
    String cnstr = cn > 1 ? format("%d", cn) : "";
format("%d", cn) : ""; 03103 String kernelName("bilateral"); 03104 size_t sizeDiv = 1; 03105 if ((ocl::Device::getDefault().isIntel()) && 03106 (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) 03107 { 03108 //Intel GPU 03109 if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. 03110 { 03111 kernelName = "bilateral_float4"; 03112 sizeDiv = 4; 03113 } 03114 } 03115 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, 03116 format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" 03117 " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", 03118 radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), 03119 ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), 03120 ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), 03121 ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), 03122 ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); 03123 if (k.empty()) 03124 return false; 03125 03126 Mat mspace_weight(1, d * d, CV_32FC1, space_weight); 03127 Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); 03128 UMat ucolor_weight, uspace_weight, uspace_ofs; 03129 03130 mspace_weight.copyTo(uspace_weight); 03131 mspace_ofs.copyTo(uspace_ofs); 03132 03133 k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), 03134 ocl::KernelArg::PtrReadOnly(uspace_weight), 03135 ocl::KernelArg::PtrReadOnly(uspace_ofs)); 03136 03137 size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; 03138 return k.run(2, globalsize, NULL, false); 03139 } 03140 03141 #endif 03142 static void 03143 bilateralFilter_8u( const Mat& src, Mat& dst, int d, 03144 double sigma_color, double sigma_space, 03145 int borderType ) 03146 { 03147 int cn = src.channels(); 03148 int i, j, maxk, radius; 03149 Size size = src.size(); 03150 03151 CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); 03152 03153 if( sigma_color <= 0 ) 03154 sigma_color = 1; 03155 if( sigma_space <= 0 ) 03156 sigma_space = 1; 03157 03158 double gauss_color_coeff = -0.5/(sigma_color*sigma_color); 03159 double gauss_space_coeff = -0.5/(sigma_space*sigma_space); 03160 03161 if( d <= 0 ) 03162 radius = cvRound(sigma_space*1.5); 03163 else 03164 radius = d/2; 03165 radius = MAX(radius, 1); 03166 d = radius*2 + 1; 03167 03168 Mat temp; 03169 copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); 03170 03171 #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK 03172 CV_IPP_CHECK() 03173 { 03174 if( cn == 1 ) 03175 { 03176 bool ok; 03177 IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok ); 03178 parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); 03179 if( ok ) 03180 { 03181 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 03182 return; 03183 } 03184 setIppErrorStatus(); 03185 } 03186 } 03187 #endif 03188 03189 std::vector<float> _color_weight(cn*256); 03190 std::vector<float> _space_weight(d*d); 03191 std::vector<int> _space_ofs(d*d); 03192 float* color_weight = &_color_weight[0]; 03193 float* space_weight = &_space_weight[0]; 03194 int* space_ofs = &_space_ofs[0]; 03195 03196 // initialize color-related bilateral filter coefficients 03197 03198 for( i = 0; i < 256*cn; i++ ) 03199 color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); 03200 03201 // initialize space-related bilateral filter coefficients 03202 for( i = -radius, maxk = 0; i <= radius; i++ ) 03203 { 03204 j 
static void
bilateralFilter_8u( const Mat& src, Mat& dst, int d,
    double sigma_color, double sigma_space,
    int borderType )
{
    int cn = src.channels();
    int i, j, maxk, radius;
    Size size = src.size();

    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );

    if( sigma_color <= 0 )
        sigma_color = 1;
    if( sigma_space <= 0 )
        sigma_space = 1;

    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);

    if( d <= 0 )
        radius = cvRound(sigma_space*1.5);
    else
        radius = d/2;
    radius = MAX(radius, 1);
    d = radius*2 + 1;

    Mat temp;
    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );

#if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
    CV_IPP_CHECK()
    {
        if( cn == 1 )
        {
            bool ok;
            IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok );
            parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
            if( ok )
            {
                CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                return;
            }
            setIppErrorStatus();
        }
    }
#endif

    std::vector<float> _color_weight(cn*256);
    std::vector<float> _space_weight(d*d);
    std::vector<int> _space_ofs(d*d);
    float* color_weight = &_color_weight[0];
    float* space_weight = &_space_weight[0];
    int* space_ofs = &_space_ofs[0];

    // initialize color-related bilateral filter coefficients

    for( i = 0; i < 256*cn; i++ )
        color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);

    // initialize space-related bilateral filter coefficients
    for( i = -radius, maxk = 0; i <= radius; i++ )
    {
        j = -radius;

        for( ; j <= radius; j++ )
        {
            double r = std::sqrt((double)i*i + (double)j*j);
            if( r > radius )
                continue;
            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
            space_ofs[maxk++] = (int)(i*temp.step + j*cn);
        }
    }

    BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
}


class BilateralFilter_32f_Invoker :
    public ParallelLoopBody
{
public:

    BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
        const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
        cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
        temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
    {
    }

    virtual void operator() (const Range& range) const
    {
        int i, j, k;
        Size size = dest->size();
#if CV_SSE3
        int CV_DECL_ALIGNED(16) idxBuf[4];
        float CV_DECL_ALIGNED(16) bufSum32[4];
        static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
        bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
#endif

        for( i = range.start; i < range.end; i++ )
        {
            const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
            float* dptr = dest->ptr<float>(i);

            if( cn == 1 )
            {
                for( j = 0; j < size.width; j++ )
                {
                    float sum = 0, wsum = 0;
                    float val0 = sptr[j];
                    k = 0;
#if CV_SSE3
                    if( haveSSE3 )
                    {
                        __m128 psum = _mm_setzero_ps();
                        const __m128 _val0 = _mm_set1_ps(sptr[j]);
                        const __m128 _scale_index = _mm_set1_ps(scale_index);
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk - 4 ; k += 4 )
                        {
                            __m128 _sw = _mm_loadu_ps(space_weight + k);
                            __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
                                                     sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
                            __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);

                            __m128i _idx = _mm_cvtps_epi32(_alpha);
                            _mm_store_si128((__m128i*)idxBuf, _idx);
                            _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));

                            __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
                                                        expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
                            __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
                                                         expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);

                            __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
                            _val = _mm_mul_ps(_w, _val);

                            _sw = _mm_hadd_ps(_w, _val);
                            _sw = _mm_hadd_ps(_sw, _sw);
                            psum = _mm_add_ps(_sw, psum);
                        }
                        _mm_storel_pi((__m64*)bufSum32, psum);

                        sum = bufSum32[1];
                        wsum = bufSum32[0];
                    }
#endif

                    for( ; k < maxk; k++ )
                    {
                        float val = sptr[j + space_ofs[k]];
                        float alpha = (float)(std::abs(val - val0)*scale_index);
                        int idx = cvFloor(alpha);
                        alpha -= idx;
                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
                        sum += val*w;
                        wsum += w;
                    }
                    dptr[j] = (float)(sum/wsum);
                }
            }
            else
            {
                CV_Assert( cn == 3 );
                for( j = 0; j < size.width*3; j += 3 )
                {
                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
                    float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
                    k = 0;
#if CV_SSE3
                    if( haveSSE3 )
                    {
                        __m128 sum = _mm_setzero_ps();
                        const __m128 _b0 = _mm_set1_ps(b0);
                        const __m128 _g0 = _mm_set1_ps(g0);
                        const __m128 _r0 = _mm_set1_ps(r0);
                        const __m128 _scale_index = _mm_set1_ps(scale_index);
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk-4; k += 4 )
                        {
                            __m128 _sw = _mm_loadu_ps(space_weight + k);

                            const float* const sptr_k0 = sptr + j + space_ofs[k];
                            const float* const sptr_k1 = sptr + j + space_ofs[k+1];
                            const float* const sptr_k2 = sptr + j + space_ofs[k+2];
                            const float* const sptr_k3 = sptr + j + space_ofs[k+3];

                            __m128 _b = _mm_loadu_ps(sptr_k0);
                            __m128 _g = _mm_loadu_ps(sptr_k1);
                            __m128 _r = _mm_loadu_ps(sptr_k2);
                            __m128 _z = _mm_loadu_ps(sptr_k3);
                            _MM_TRANSPOSE4_PS(_b, _g, _r, _z);

                            __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
                            __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
                            __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));

                            __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));

                            __m128i _idx = _mm_cvtps_epi32(_alpha);
                            _mm_store_si128((__m128i*)idxBuf, _idx);
                            _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));

                            __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
                            __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);

                            __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));

                            _b = _mm_mul_ps(_b, _w);
                            _g = _mm_mul_ps(_g, _w);
                            _r = _mm_mul_ps(_r, _w);

                            _w = _mm_hadd_ps(_w, _b);
                            _g = _mm_hadd_ps(_g, _r);

                            _w = _mm_hadd_ps(_w, _g);
                            sum = _mm_add_ps(sum, _w);
                        }
                        _mm_store_ps(bufSum32, sum);
                        wsum = bufSum32[0];
                        sum_b = bufSum32[1];
                        sum_g = bufSum32[2];
                        sum_r = bufSum32[3];
                    }
#endif

                    for(; k < maxk; k++ )
                    {
                        const float* sptr_k = sptr + j + space_ofs[k];
                        float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
                        float alpha = (float)((std::abs(b - b0) +
                                               std::abs(g - g0) + std::abs(r - r0))*scale_index);
                        int idx = cvFloor(alpha);
                        alpha -= idx;
                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
                        wsum += w;
                    }
                    wsum = 1.f/wsum;
                    b0 = sum_b*wsum;
                    g0 = sum_g*wsum;
                    r0 = sum_r*wsum;
                    dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
                }
            }
        }
    }

private:
    int cn, radius, maxk, *space_ofs;
    const Mat* temp;
    Mat *dest;
    float scale_index, *space_weight, *expLUT;
};


static void
bilateralFilter_32f( const Mat& src, Mat& dst, int d,
                     double sigma_color, double sigma_space,
                     int borderType )
{
    int cn = src.channels();
    int i, j, maxk, radius;
    double minValSrc=-1, maxValSrc=1;
    const int kExpNumBinsPerChannel = 1 << 12;
    int kExpNumBins = 0;
    float lastExpVal = 1.f;
    float len, scale_index;
    Size size = src.size();

    CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );

    if( sigma_color <= 0 )
        sigma_color = 1;
    if( sigma_space <= 0 )
        sigma_space = 1;

    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);

    if( d <= 0 )
        radius = cvRound(sigma_space*1.5);
    else
        radius = d/2;
    radius = MAX(radius, 1);
    d = radius*2 + 1;
    // compute the min/max range for the input image (even if multichannel)

    minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
    if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
    {
        src.copyTo(dst);
        return;
    }

    // temporary copy of the image with borders for easy processing
    Mat temp;
    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
    const double insteadNaNValue = -5. * sigma_color;
    patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
                                        // TODO: make insteadNaNValue available in the outside function interface to control the cases breaking the assumption
    // allocate lookup tables
    std::vector<float> _space_weight(d*d);
    std::vector<int> _space_ofs(d*d);
    float* space_weight = &_space_weight[0];
    int* space_ofs = &_space_ofs[0];

    // assign a length which is slightly more than needed
    len = (float)(maxValSrc - minValSrc) * cn;
    kExpNumBins = kExpNumBinsPerChannel * cn;
    std::vector<float> _expLUT(kExpNumBins+2);
    float* expLUT = &_expLUT[0];

    scale_index = kExpNumBins/len;

    // initialize the exp LUT
    for( i = 0; i < kExpNumBins+2; i++ )
    {
        if( lastExpVal > 0.f )
        {
            double val = i / scale_index;
            expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
            lastExpVal = expLUT[i];
        }
        else
            expLUT[i] = 0.f;
    }

    // initialize space-related bilateral filter coefficients
    for( i = -radius, maxk = 0; i <= radius; i++ )
        for( j = -radius; j <= radius; j++ )
        {
            double r = std::sqrt((double)i*i + (double)j*j);
            if( r > radius )
                continue;
            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
            space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
        }

    // parallel_for usage

    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
}

}

void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
                          double sigmaColor, double sigmaSpace,
                          int borderType )
{
    _dst.create( _src.size(), _src.type() );

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
#endif

    Mat src = _src.getMat(), dst = _dst.getMat();

    if( src.depth() == CV_8U )
        bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
    else if( src.depth() == CV_32F )
        bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
    else
        CV_Error( CV_StsUnsupportedFormat,
                  "Bilateral filtering is only implemented for 8u and 32f images" );
}
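// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the library source): a typical call into
// cv::bilateralFilter() above. With d <= 0 the neighborhood diameter is
// derived from sigmaSpace, mirroring the radius logic in the 8u/32f paths.
// The file name and variable names are examples.
#if 0
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

static void demoBilateralFilter()
{
    cv::Mat src = cv::imread("portrait.png"), dst;
    // 9-pixel diameter, sigmaColor = sigmaSpace = 75: smooths texture while
    // keeping strong edges, since large intensity differences get tiny weights.
    cv::bilateralFilter(src, dst, 9, 75, 75);
}
#endif
// ---------------------------------------------------------------------------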

//////////////////////////////////////////////////////////////////////////////////////////

CV_IMPL void
cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
          int param1, int param2, double param3, double param4 )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;

    CV_Assert( dst.size() == src.size() &&
               (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) );

    if( param2 <= 0 )
        param2 = param1;

    if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
        cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1),
                       smooth_type == CV_BLUR, cv::BORDER_REPLICATE );
    else if( smooth_type == CV_GAUSSIAN )
        cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE );
    else if( smooth_type == CV_MEDIAN )
        cv::medianBlur( src, dst, param1 );
    else
        cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );

    if( dst.data != dst0.data )
        CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );
}

/* End of file. */