imgwarp.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{
#if IPP_VERSION_X100 >= 710
typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

template <int channels, typename Type>
bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
{
    Type values[channels];
    for( int i = 0; i < channels; i++ )
        values[i] = saturate_cast<Type>(value[i]);
    return func(values, dataPointer, step, size) >= 0;
}

static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
{
    if( channels == 1 )
    {
        switch( depth )
        {
        case CV_8U:
            return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
        case CV_16U:
            return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
        case CV_32F:
            return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
        }
    }
    else
    {
        if( channels == 3 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
            case CV_16U:
                return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
            case CV_32F:
                return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
            }
        }
        else if( channels == 4 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
            case CV_16U:
                return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
            case CV_32F:
                return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
            }
        }
    }
    return false;
}
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

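// Editorial note, not part of the upstream OpenCV source: the *_f tables above
// and below hold floating-point interpolation weights, while the *_i tables hold
// the same weights in fixed point, scaled by INTER_REMAP_COEF_SCALE (1 << 15);
// initInterTab2D() further nudges the integer weights so each kernel sums to
// exactly that scale. One entry is precomputed per quantized fractional offset
// (INTER_TAB_SIZE positions per axis, INTER_TAB_SIZE2 of their 2-D combinations),
// so per-pixel interpolation in the geometric transforms of this file reduces to
// a table lookup plus a short dot product.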
static float BicubicTab_f[INTER_TAB_SIZE2][4][4]; 00139 static short BicubicTab_i[INTER_TAB_SIZE2][4][4]; 00140 00141 static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8]; 00142 static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8]; 00143 00144 static inline void interpolateLinear( float x, float* coeffs ) 00145 { 00146 coeffs[0] = 1.f - x; 00147 coeffs[1] = x; 00148 } 00149 00150 static inline void interpolateCubic( float x, float* coeffs ) 00151 { 00152 const float A = -0.75f; 00153 00154 coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; 00155 coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; 00156 coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; 00157 coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; 00158 } 00159 00160 static inline void interpolateLanczos4( float x, float* coeffs ) 00161 { 00162 static const double s45 = 0.70710678118654752440084436210485; 00163 static const double cs[][2]= 00164 {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; 00165 00166 if( x < FLT_EPSILON ) 00167 { 00168 for( int i = 0; i < 8; i++ ) 00169 coeffs[i] = 0; 00170 coeffs[3] = 1; 00171 return; 00172 } 00173 00174 float sum = 0; 00175 double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); 00176 for(int i = 0; i < 8; i++ ) 00177 { 00178 double y = -(x+3-i)*CV_PI*0.25; 00179 coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); 00180 sum += coeffs[i]; 00181 } 00182 00183 sum = 1.f/sum; 00184 for(int i = 0; i < 8; i++ ) 00185 coeffs[i] *= sum; 00186 } 00187 00188 static void initInterTab1D(int method, float* tab, int tabsz) 00189 { 00190 float scale = 1.f/tabsz; 00191 if( method == INTER_LINEAR ) 00192 { 00193 for( int i = 0; i < tabsz; i++, tab += 2 ) 00194 interpolateLinear( i*scale, tab ); 00195 } 00196 else if( method == INTER_CUBIC ) 00197 { 00198 for( int i = 0; i < tabsz; i++, tab += 4 ) 00199 interpolateCubic( i*scale, tab ); 00200 } 00201 else if( method == INTER_LANCZOS4 ) 00202 { 00203 for( int i = 0; i < tabsz; i++, tab += 8 ) 00204 interpolateLanczos4( i*scale, tab ); 00205 } 00206 else 00207 CV_Error( CV_StsBadArg, "Unknown interpolation method" ); 00208 } 00209 00210 00211 static const void* initInterTab2D( int method, bool fixpt ) 00212 { 00213 static bool inittab[INTER_MAX+1] = {false}; 00214 float* tab = 0; 00215 short* itab = 0; 00216 int ksize = 0; 00217 if( method == INTER_LINEAR ) 00218 tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2; 00219 else if( method == INTER_CUBIC ) 00220 tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4; 00221 else if( method == INTER_LANCZOS4 ) 00222 tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8; 00223 else 00224 CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" ); 00225 00226 if( !inittab[method] ) 00227 { 00228 AutoBuffer<float> _tab(8*INTER_TAB_SIZE); 00229 int i, j, k1, k2; 00230 initInterTab1D(method, _tab, INTER_TAB_SIZE); 00231 for( i = 0; i < INTER_TAB_SIZE; i++ ) 00232 for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize ) 00233 { 00234 int isum = 0; 00235 NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2; 00236 NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2; 00237 00238 for( k1 = 0; k1 < ksize; k1++ ) 00239 { 00240 float vy = _tab[i*ksize + k1]; 00241 for( k2 = 0; k2 < ksize; k2++ ) 00242 { 00243 float v = vy*_tab[j*ksize + k2]; 00244 tab[k1*ksize + k2] = v; 00245 isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE); 00246 } 00247 } 00248 00249 if( isum != 
INTER_REMAP_COEF_SCALE ) 00250 { 00251 int diff = isum - INTER_REMAP_COEF_SCALE; 00252 int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; 00253 for( k1 = ksize2; k1 < ksize2+2; k1++ ) 00254 for( k2 = ksize2; k2 < ksize2+2; k2++ ) 00255 { 00256 if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) 00257 mk1 = k1, mk2 = k2; 00258 else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) 00259 Mk1 = k1, Mk2 = k2; 00260 } 00261 if( diff < 0 ) 00262 itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); 00263 else 00264 itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); 00265 } 00266 } 00267 tab -= INTER_TAB_SIZE2*ksize*ksize; 00268 itab -= INTER_TAB_SIZE2*ksize*ksize; 00269 #if CV_SSE2 || CV_NEON 00270 if( method == INTER_LINEAR ) 00271 { 00272 for( i = 0; i < INTER_TAB_SIZE2; i++ ) 00273 for( j = 0; j < 4; j++ ) 00274 { 00275 BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; 00276 BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; 00277 BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; 00278 BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; 00279 } 00280 } 00281 #endif 00282 inittab[method] = true; 00283 } 00284 return fixpt ? (const void*)itab : (const void*)tab; 00285 } 00286 00287 #ifndef __MINGW32__ 00288 static bool initAllInterTab2D() 00289 { 00290 return initInterTab2D( INTER_LINEAR, false ) && 00291 initInterTab2D( INTER_LINEAR, true ) && 00292 initInterTab2D( INTER_CUBIC, false ) && 00293 initInterTab2D( INTER_CUBIC, true ) && 00294 initInterTab2D( INTER_LANCZOS4, false ) && 00295 initInterTab2D( INTER_LANCZOS4, true ); 00296 } 00297 00298 static volatile bool doInitAllInterTab2D = initAllInterTab2D(); 00299 #endif 00300 00301 template<typename ST, typename DT> struct Cast 00302 { 00303 typedef ST type1; 00304 typedef DT rtype; 00305 00306 DT operator()(ST val) const { return saturate_cast<DT>(val); } 00307 }; 00308 00309 template<typename ST, typename DT, int bits> struct FixedPtCast 00310 { 00311 typedef ST type1; 00312 typedef DT rtype; 00313 enum { SHIFT = bits, DELTA = 1 << (bits-1) }; 00314 00315 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); } 00316 }; 00317 00318 /****************************************************************************************\ 00319 * Resize * 00320 \****************************************************************************************/ 00321 00322 class resizeNNInvoker : 00323 public ParallelLoopBody 00324 { 00325 public: 00326 resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : 00327 ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), 00328 ify(_ify) 00329 { 00330 } 00331 00332 virtual void operator() (const Range& range) const 00333 { 00334 Size ssize = src.size(), dsize = dst.size(); 00335 int y, x, pix_size = (int)src.elemSize(); 00336 00337 for( y = range.start; y < range.end; y++ ) 00338 { 00339 uchar* D = dst.data + dst.step*y; 00340 int sy = std::min(cvFloor(y*ify), ssize.height-1); 00341 const uchar* S = src.ptr(sy); 00342 00343 switch( pix_size ) 00344 { 00345 case 1: 00346 for( x = 0; x <= dsize.width - 2; x += 2 ) 00347 { 00348 uchar t0 = S[x_ofs[x]]; 00349 uchar t1 = S[x_ofs[x+1]]; 00350 D[x] = t0; 00351 D[x+1] = t1; 00352 } 00353 00354 for( ; x < dsize.width; x++ ) 00355 D[x] = S[x_ofs[x]]; 00356 break; 00357 case 2: 00358 for( x = 0; x < dsize.width; x++ ) 00359 *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]); 00360 break; 00361 case 3: 00362 for( x = 0; x < dsize.width; x++, D += 3 ) 00363 { 00364 
const uchar* _tS = S + x_ofs[x]; 00365 D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2]; 00366 } 00367 break; 00368 case 4: 00369 for( x = 0; x < dsize.width; x++ ) 00370 *(int*)(D + x*4) = *(int*)(S + x_ofs[x]); 00371 break; 00372 case 6: 00373 for( x = 0; x < dsize.width; x++, D += 6 ) 00374 { 00375 const ushort* _tS = (const ushort*)(S + x_ofs[x]); 00376 ushort* _tD = (ushort*)D; 00377 _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; 00378 } 00379 break; 00380 case 8: 00381 for( x = 0; x < dsize.width; x++, D += 8 ) 00382 { 00383 const int* _tS = (const int*)(S + x_ofs[x]); 00384 int* _tD = (int*)D; 00385 _tD[0] = _tS[0]; _tD[1] = _tS[1]; 00386 } 00387 break; 00388 case 12: 00389 for( x = 0; x < dsize.width; x++, D += 12 ) 00390 { 00391 const int* _tS = (const int*)(S + x_ofs[x]); 00392 int* _tD = (int*)D; 00393 _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; 00394 } 00395 break; 00396 default: 00397 for( x = 0; x < dsize.width; x++, D += pix_size ) 00398 { 00399 const int* _tS = (const int*)(S + x_ofs[x]); 00400 int* _tD = (int*)D; 00401 for( int k = 0; k < pix_size4; k++ ) 00402 _tD[k] = _tS[k]; 00403 } 00404 } 00405 } 00406 } 00407 00408 private: 00409 const Mat src; 00410 Mat dst; 00411 int* x_ofs, pix_size4; 00412 double ify; 00413 00414 resizeNNInvoker(const resizeNNInvoker&); 00415 resizeNNInvoker& operator=(const resizeNNInvoker&); 00416 }; 00417 00418 static void 00419 resizeNN( const Mat& src, Mat& dst, double fx, double fy ) 00420 { 00421 Size ssize = src.size(), dsize = dst.size(); 00422 AutoBuffer<int> _x_ofs(dsize.width); 00423 int* x_ofs = _x_ofs; 00424 int pix_size = (int)src.elemSize(); 00425 int pix_size4 = (int)(pix_size / sizeof(int)); 00426 double ifx = 1./fx, ify = 1./fy; 00427 int x; 00428 00429 for( x = 0; x < dsize.width; x++ ) 00430 { 00431 int sx = cvFloor(x*ifx); 00432 x_ofs[x] = std::min(sx, ssize.width-1)*pix_size; 00433 } 00434 00435 Range range(0, dsize.height); 00436 resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); 00437 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 00438 } 00439 00440 00441 struct VResizeNoVec 00442 { 00443 int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; } 00444 }; 00445 00446 struct HResizeNoVec 00447 { 00448 int operator()(const uchar**, uchar**, int, const int*, 00449 const uchar*, int, int, int, int, int) const { return 0; } 00450 }; 00451 00452 #if CV_SSE2 00453 00454 struct VResizeLinearVec_32s8u 00455 { 00456 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const 00457 { 00458 if( !checkHardwareSupport(CV_CPU_SSE2) ) 00459 return 0; 00460 00461 const int** src = (const int**)_src; 00462 const short* beta = (const short*)_beta; 00463 const int *S0 = src[0], *S1 = src[1]; 00464 int x = 0; 00465 __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); 00466 __m128i delta = _mm_set1_epi16(2); 00467 00468 if( (((size_t)S0|(size_t)S1)&15) == 0 ) 00469 for( ; x <= width - 16; x += 16 ) 00470 { 00471 __m128i x0, x1, x2, y0, y1, y2; 00472 x0 = _mm_load_si128((const __m128i*)(S0 + x)); 00473 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); 00474 y0 = _mm_load_si128((const __m128i*)(S1 + x)); 00475 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); 00476 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); 00477 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); 00478 00479 x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); 00480 x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); 00481 y1 = _mm_load_si128((const 
__m128i*)(S1 + x + 8)); 00482 y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); 00483 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); 00484 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); 00485 00486 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); 00487 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); 00488 00489 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); 00490 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); 00491 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); 00492 } 00493 else 00494 for( ; x <= width - 16; x += 16 ) 00495 { 00496 __m128i x0, x1, x2, y0, y1, y2; 00497 x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); 00498 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); 00499 y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); 00500 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); 00501 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); 00502 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); 00503 00504 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); 00505 x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); 00506 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); 00507 y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); 00508 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); 00509 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); 00510 00511 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); 00512 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); 00513 00514 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); 00515 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); 00516 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); 00517 } 00518 00519 for( ; x < width - 4; x += 4 ) 00520 { 00521 __m128i x0, y0; 00522 x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); 00523 y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); 00524 x0 = _mm_packs_epi32(x0, x0); 00525 y0 = _mm_packs_epi32(y0, y0); 00526 x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); 00527 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); 00528 x0 = _mm_packus_epi16(x0, x0); 00529 *(int*)(dst + x) = _mm_cvtsi128_si32(x0); 00530 } 00531 00532 return x; 00533 } 00534 }; 00535 00536 00537 template<int shiftval> struct VResizeLinearVec_32f16 00538 { 00539 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00540 { 00541 if( !checkHardwareSupport(CV_CPU_SSE2) ) 00542 return 0; 00543 00544 const float** src = (const float**)_src; 00545 const float* beta = (const float*)_beta; 00546 const float *S0 = src[0], *S1 = src[1]; 00547 ushort* dst = (ushort*)_dst; 00548 int x = 0; 00549 00550 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); 00551 __m128i preshift = _mm_set1_epi32(shiftval); 00552 __m128i postshift = _mm_set1_epi16((short)shiftval); 00553 00554 if( (((size_t)S0|(size_t)S1)&15) == 0 ) 00555 for( ; x <= width - 16; x += 16 ) 00556 { 00557 __m128 x0, x1, y0, y1; 00558 __m128i t0, t1, t2; 00559 x0 = _mm_load_ps(S0 + x); 00560 x1 = _mm_load_ps(S0 + x + 4); 00561 y0 = _mm_load_ps(S1 + x); 00562 y1 = _mm_load_ps(S1 + x + 4); 00563 00564 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00565 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 00566 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 00567 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 00568 t0 = 
_mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); 00569 00570 x0 = _mm_load_ps(S0 + x + 8); 00571 x1 = _mm_load_ps(S0 + x + 12); 00572 y0 = _mm_load_ps(S1 + x + 8); 00573 y1 = _mm_load_ps(S1 + x + 12); 00574 00575 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00576 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 00577 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 00578 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 00579 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); 00580 00581 _mm_storeu_si128( (__m128i*)(dst + x), t0); 00582 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); 00583 } 00584 else 00585 for( ; x <= width - 16; x += 16 ) 00586 { 00587 __m128 x0, x1, y0, y1; 00588 __m128i t0, t1, t2; 00589 x0 = _mm_loadu_ps(S0 + x); 00590 x1 = _mm_loadu_ps(S0 + x + 4); 00591 y0 = _mm_loadu_ps(S1 + x); 00592 y1 = _mm_loadu_ps(S1 + x + 4); 00593 00594 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00595 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 00596 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 00597 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 00598 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); 00599 00600 x0 = _mm_loadu_ps(S0 + x + 8); 00601 x1 = _mm_loadu_ps(S0 + x + 12); 00602 y0 = _mm_loadu_ps(S1 + x + 8); 00603 y1 = _mm_loadu_ps(S1 + x + 12); 00604 00605 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00606 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 00607 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 00608 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 00609 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); 00610 00611 _mm_storeu_si128( (__m128i*)(dst + x), t0); 00612 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); 00613 } 00614 00615 for( ; x < width - 4; x += 4 ) 00616 { 00617 __m128 x0, y0; 00618 __m128i t0; 00619 x0 = _mm_loadu_ps(S0 + x); 00620 y0 = _mm_loadu_ps(S1 + x); 00621 00622 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00623 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 00624 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); 00625 _mm_storel_epi64( (__m128i*)(dst + x), t0); 00626 } 00627 00628 return x; 00629 } 00630 }; 00631 00632 typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u; 00633 typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; 00634 00635 struct VResizeLinearVec_32f 00636 { 00637 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00638 { 00639 if( !checkHardwareSupport(CV_CPU_SSE) ) 00640 return 0; 00641 00642 const float** src = (const float**)_src; 00643 const float* beta = (const float*)_beta; 00644 const float *S0 = src[0], *S1 = src[1]; 00645 float* dst = (float*)_dst; 00646 int x = 0; 00647 00648 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); 00649 00650 if( (((size_t)S0|(size_t)S1)&15) == 0 ) 00651 for( ; x <= width - 8; x += 8 ) 00652 { 00653 __m128 x0, x1, y0, y1; 00654 x0 = _mm_load_ps(S0 + x); 00655 x1 = _mm_load_ps(S0 + x + 4); 00656 y0 = _mm_load_ps(S1 + x); 00657 y1 = _mm_load_ps(S1 + x + 4); 00658 00659 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00660 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 00661 00662 _mm_storeu_ps( dst + x, x0); 00663 _mm_storeu_ps( dst + x + 4, x1); 00664 } 00665 else 00666 for( ; x <= width - 8; x += 8 ) 00667 { 00668 __m128 x0, x1, y0, y1; 00669 x0 = _mm_loadu_ps(S0 + x); 00670 x1 = _mm_loadu_ps(S0 + x + 4); 00671 y0 = _mm_loadu_ps(S1 + x); 00672 y1 = _mm_loadu_ps(S1 + x + 4); 00673 
00674 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 00675 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 00676 00677 _mm_storeu_ps( dst + x, x0); 00678 _mm_storeu_ps( dst + x + 4, x1); 00679 } 00680 00681 return x; 00682 } 00683 }; 00684 00685 00686 struct VResizeCubicVec_32s8u 00687 { 00688 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const 00689 { 00690 if( !checkHardwareSupport(CV_CPU_SSE2) ) 00691 return 0; 00692 00693 const int** src = (const int**)_src; 00694 const short* beta = (const short*)_beta; 00695 const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 00696 int x = 0; 00697 float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); 00698 __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), 00699 b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); 00700 00701 if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) 00702 for( ; x <= width - 8; x += 8 ) 00703 { 00704 __m128i x0, x1, y0, y1; 00705 __m128 s0, s1, f0, f1; 00706 x0 = _mm_load_si128((const __m128i*)(S0 + x)); 00707 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); 00708 y0 = _mm_load_si128((const __m128i*)(S1 + x)); 00709 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); 00710 00711 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); 00712 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); 00713 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); 00714 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); 00715 s0 = _mm_add_ps(s0, f0); 00716 s1 = _mm_add_ps(s1, f1); 00717 00718 x0 = _mm_load_si128((const __m128i*)(S2 + x)); 00719 x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); 00720 y0 = _mm_load_si128((const __m128i*)(S3 + x)); 00721 y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); 00722 00723 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); 00724 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); 00725 s0 = _mm_add_ps(s0, f0); 00726 s1 = _mm_add_ps(s1, f1); 00727 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); 00728 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); 00729 s0 = _mm_add_ps(s0, f0); 00730 s1 = _mm_add_ps(s1, f1); 00731 00732 x0 = _mm_cvtps_epi32(s0); 00733 x1 = _mm_cvtps_epi32(s1); 00734 00735 x0 = _mm_packs_epi32(x0, x1); 00736 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); 00737 } 00738 else 00739 for( ; x <= width - 8; x += 8 ) 00740 { 00741 __m128i x0, x1, y0, y1; 00742 __m128 s0, s1, f0, f1; 00743 x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); 00744 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); 00745 y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); 00746 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); 00747 00748 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); 00749 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); 00750 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); 00751 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); 00752 s0 = _mm_add_ps(s0, f0); 00753 s1 = _mm_add_ps(s1, f1); 00754 00755 x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); 00756 x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); 00757 y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); 00758 y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); 00759 00760 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); 00761 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); 00762 s0 = _mm_add_ps(s0, f0); 00763 s1 = _mm_add_ps(s1, f1); 00764 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); 00765 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); 00766 s0 = _mm_add_ps(s0, f0); 00767 s1 = _mm_add_ps(s1, f1); 00768 00769 x0 = _mm_cvtps_epi32(s0); 00770 x1 = _mm_cvtps_epi32(s1); 00771 00772 x0 = _mm_packs_epi32(x0, x1); 00773 
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); 00774 } 00775 00776 return x; 00777 } 00778 }; 00779 00780 00781 template<int shiftval> struct VResizeCubicVec_32f16 00782 { 00783 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00784 { 00785 if( !checkHardwareSupport(CV_CPU_SSE2) ) 00786 return 0; 00787 00788 const float** src = (const float**)_src; 00789 const float* beta = (const float*)_beta; 00790 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 00791 ushort* dst = (ushort*)_dst; 00792 int x = 0; 00793 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), 00794 b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); 00795 __m128i preshift = _mm_set1_epi32(shiftval); 00796 __m128i postshift = _mm_set1_epi16((short)shiftval); 00797 00798 for( ; x <= width - 8; x += 8 ) 00799 { 00800 __m128 x0, x1, y0, y1, s0, s1; 00801 __m128i t0, t1; 00802 x0 = _mm_loadu_ps(S0 + x); 00803 x1 = _mm_loadu_ps(S0 + x + 4); 00804 y0 = _mm_loadu_ps(S1 + x); 00805 y1 = _mm_loadu_ps(S1 + x + 4); 00806 00807 s0 = _mm_mul_ps(x0, b0); 00808 s1 = _mm_mul_ps(x1, b0); 00809 y0 = _mm_mul_ps(y0, b1); 00810 y1 = _mm_mul_ps(y1, b1); 00811 s0 = _mm_add_ps(s0, y0); 00812 s1 = _mm_add_ps(s1, y1); 00813 00814 x0 = _mm_loadu_ps(S2 + x); 00815 x1 = _mm_loadu_ps(S2 + x + 4); 00816 y0 = _mm_loadu_ps(S3 + x); 00817 y1 = _mm_loadu_ps(S3 + x + 4); 00818 00819 x0 = _mm_mul_ps(x0, b2); 00820 x1 = _mm_mul_ps(x1, b2); 00821 y0 = _mm_mul_ps(y0, b3); 00822 y1 = _mm_mul_ps(y1, b3); 00823 s0 = _mm_add_ps(s0, x0); 00824 s1 = _mm_add_ps(s1, x1); 00825 s0 = _mm_add_ps(s0, y0); 00826 s1 = _mm_add_ps(s1, y1); 00827 00828 t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); 00829 t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); 00830 00831 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); 00832 _mm_storeu_si128( (__m128i*)(dst + x), t0); 00833 } 00834 00835 return x; 00836 } 00837 }; 00838 00839 typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u; 00840 typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; 00841 00842 struct VResizeCubicVec_32f 00843 { 00844 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00845 { 00846 if( !checkHardwareSupport(CV_CPU_SSE) ) 00847 return 0; 00848 00849 const float** src = (const float**)_src; 00850 const float* beta = (const float*)_beta; 00851 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 00852 float* dst = (float*)_dst; 00853 int x = 0; 00854 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), 00855 b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); 00856 00857 for( ; x <= width - 8; x += 8 ) 00858 { 00859 __m128 x0, x1, y0, y1, s0, s1; 00860 x0 = _mm_loadu_ps(S0 + x); 00861 x1 = _mm_loadu_ps(S0 + x + 4); 00862 y0 = _mm_loadu_ps(S1 + x); 00863 y1 = _mm_loadu_ps(S1 + x + 4); 00864 00865 s0 = _mm_mul_ps(x0, b0); 00866 s1 = _mm_mul_ps(x1, b0); 00867 y0 = _mm_mul_ps(y0, b1); 00868 y1 = _mm_mul_ps(y1, b1); 00869 s0 = _mm_add_ps(s0, y0); 00870 s1 = _mm_add_ps(s1, y1); 00871 00872 x0 = _mm_loadu_ps(S2 + x); 00873 x1 = _mm_loadu_ps(S2 + x + 4); 00874 y0 = _mm_loadu_ps(S3 + x); 00875 y1 = _mm_loadu_ps(S3 + x + 4); 00876 00877 x0 = _mm_mul_ps(x0, b2); 00878 x1 = _mm_mul_ps(x1, b2); 00879 y0 = _mm_mul_ps(y0, b3); 00880 y1 = _mm_mul_ps(y1, b3); 00881 s0 = _mm_add_ps(s0, x0); 00882 s1 = _mm_add_ps(s1, x1); 00883 s0 = _mm_add_ps(s0, y0); 00884 s1 = _mm_add_ps(s1, y1); 00885 00886 _mm_storeu_ps( dst + x, s0); 00887 _mm_storeu_ps( dst + x + 4, s1); 00888 } 
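// Editorial note, not part of the upstream OpenCV source: as with the other
// VResize*Vec functors, the value returned below is the number of output
// elements already written by the SIMD loop; the scalar tail loop in the
// calling VResize* template (see VResizeCubic further down) finishes the
// remaining width - x elements.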
00889 00890 return x; 00891 } 00892 }; 00893 00894 #if CV_SSE4_1 00895 00896 struct VResizeLanczos4Vec_32f16u 00897 { 00898 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00899 { 00900 const float** src = (const float**)_src; 00901 const float* beta = (const float*)_beta; 00902 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 00903 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 00904 short * dst = (short*)_dst; 00905 int x = 0; 00906 __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), 00907 v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), 00908 v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), 00909 v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); 00910 00911 for( ; x <= width - 8; x += 8 ) 00912 { 00913 __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); 00914 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); 00915 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); 00916 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); 00917 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); 00918 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); 00919 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); 00920 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); 00921 00922 __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); 00923 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); 00924 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); 00925 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); 00926 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); 00927 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); 00928 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); 00929 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); 00930 00931 __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); 00932 __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); 00933 00934 _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); 00935 } 00936 00937 return x; 00938 } 00939 }; 00940 00941 #else 00942 00943 typedef VResizeNoVec VResizeLanczos4Vec_32f16u; 00944 00945 #endif 00946 00947 struct VResizeLanczos4Vec_32f16s 00948 { 00949 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00950 { 00951 const float** src = (const float**)_src; 00952 const float* beta = (const float*)_beta; 00953 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 00954 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 00955 short * dst = (short*)_dst; 00956 int x = 0; 00957 __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), 00958 v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), 00959 v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), 00960 v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); 00961 00962 for( ; x <= width - 8; x += 8 ) 00963 { 00964 __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); 00965 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); 00966 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); 00967 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); 00968 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); 00969 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); 00970 
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); 00971 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); 00972 00973 __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); 00974 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); 00975 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); 00976 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); 00977 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); 00978 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); 00979 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); 00980 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); 00981 00982 __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); 00983 __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); 00984 00985 _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); 00986 } 00987 00988 return x; 00989 } 00990 }; 00991 00992 00993 struct VResizeLanczos4Vec_32f 00994 { 00995 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 00996 { 00997 const float** src = (const float**)_src; 00998 const float* beta = (const float*)_beta; 00999 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 01000 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 01001 float* dst = (float*)_dst; 01002 int x = 0; 01003 01004 __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), 01005 v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), 01006 v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), 01007 v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); 01008 01009 for( ; x <= width - 4; x += 4 ) 01010 { 01011 __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); 01012 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); 01013 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); 01014 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); 01015 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); 01016 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); 01017 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); 01018 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); 01019 01020 _mm_storeu_ps(dst + x, v_dst); 01021 } 01022 01023 return x; 01024 } 01025 }; 01026 01027 01028 #elif CV_NEON 01029 01030 struct VResizeLinearVec_32s8u 01031 { 01032 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const 01033 { 01034 const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; 01035 const short* beta = (const short*)_beta; 01036 int x = 0; 01037 int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); 01038 01039 for( ; x <= width - 16; x += 16) 01040 { 01041 int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); 01042 int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); 01043 01044 int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); 01045 int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); 01046 01047 int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), 01048 vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); 01049 v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); 01050 01051 v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); 01052 v_src10 = 
vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); 01053 v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); 01054 v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); 01055 01056 v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); 01057 v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); 01058 01059 int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), 01060 vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); 01061 v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); 01062 01063 vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); 01064 } 01065 01066 return x; 01067 } 01068 }; 01069 01070 struct VResizeLinearVec_32f16u 01071 { 01072 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01073 { 01074 const float** src = (const float**)_src; 01075 const float* beta = (const float*)_beta; 01076 const float *S0 = src[0], *S1 = src[1]; 01077 ushort* dst = (ushort*)_dst; 01078 int x = 0; 01079 01080 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); 01081 01082 for( ; x <= width - 8; x += 8 ) 01083 { 01084 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); 01085 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); 01086 01087 float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); 01088 float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); 01089 01090 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), 01091 vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); 01092 } 01093 01094 return x; 01095 } 01096 }; 01097 01098 struct VResizeLinearVec_32f16s 01099 { 01100 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01101 { 01102 const float** src = (const float**)_src; 01103 const float* beta = (const float*)_beta; 01104 const float *S0 = src[0], *S1 = src[1]; 01105 short* dst = (short*)_dst; 01106 int x = 0; 01107 01108 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); 01109 01110 for( ; x <= width - 8; x += 8 ) 01111 { 01112 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); 01113 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); 01114 01115 float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); 01116 float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); 01117 01118 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), 01119 vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); 01120 } 01121 01122 return x; 01123 } 01124 }; 01125 01126 struct VResizeLinearVec_32f 01127 { 01128 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01129 { 01130 const float** src = (const float**)_src; 01131 const float* beta = (const float*)_beta; 01132 const float *S0 = src[0], *S1 = src[1]; 01133 float* dst = (float*)_dst; 01134 int x = 0; 01135 01136 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); 01137 01138 for( ; x <= width - 8; x += 8 ) 01139 { 01140 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); 01141 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); 01142 01143 vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); 01144 vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); 01145 } 01146 01147 return x; 01148 } 01149 }; 01150 01151 typedef VResizeNoVec VResizeCubicVec_32s8u; 01152 01153 struct VResizeCubicVec_32f16u 01154 { 01155 int operator()(const 
uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01156 { 01157 const float** src = (const float**)_src; 01158 const float* beta = (const float*)_beta; 01159 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 01160 ushort* dst = (ushort*)_dst; 01161 int x = 0; 01162 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 01163 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); 01164 01165 for( ; x <= width - 8; x += 8 ) 01166 { 01167 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 01168 v_b1, vld1q_f32(S1 + x)), 01169 v_b2, vld1q_f32(S2 + x)), 01170 v_b3, vld1q_f32(S3 + x)); 01171 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 01172 v_b1, vld1q_f32(S1 + x + 4)), 01173 v_b2, vld1q_f32(S2 + x + 4)), 01174 v_b3, vld1q_f32(S3 + x + 4)); 01175 01176 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), 01177 vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); 01178 } 01179 01180 return x; 01181 } 01182 }; 01183 01184 struct VResizeCubicVec_32f16s 01185 { 01186 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01187 { 01188 const float** src = (const float**)_src; 01189 const float* beta = (const float*)_beta; 01190 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 01191 short* dst = (short*)_dst; 01192 int x = 0; 01193 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 01194 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); 01195 01196 for( ; x <= width - 8; x += 8 ) 01197 { 01198 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 01199 v_b1, vld1q_f32(S1 + x)), 01200 v_b2, vld1q_f32(S2 + x)), 01201 v_b3, vld1q_f32(S3 + x)); 01202 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 01203 v_b1, vld1q_f32(S1 + x + 4)), 01204 v_b2, vld1q_f32(S2 + x + 4)), 01205 v_b3, vld1q_f32(S3 + x + 4)); 01206 01207 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), 01208 vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); 01209 } 01210 01211 return x; 01212 } 01213 }; 01214 01215 struct VResizeCubicVec_32f 01216 { 01217 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01218 { 01219 const float** src = (const float**)_src; 01220 const float* beta = (const float*)_beta; 01221 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 01222 float* dst = (float*)_dst; 01223 int x = 0; 01224 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 01225 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); 01226 01227 for( ; x <= width - 8; x += 8 ) 01228 { 01229 vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 01230 v_b1, vld1q_f32(S1 + x)), 01231 v_b2, vld1q_f32(S2 + x)), 01232 v_b3, vld1q_f32(S3 + x))); 01233 vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 01234 v_b1, vld1q_f32(S1 + x + 4)), 01235 v_b2, vld1q_f32(S2 + x + 4)), 01236 v_b3, vld1q_f32(S3 + x + 4))); 01237 } 01238 01239 return x; 01240 } 01241 }; 01242 01243 struct VResizeLanczos4Vec_32f16u 01244 { 01245 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01246 { 01247 const float** src = (const float**)_src; 01248 const float* beta = (const float*)_beta; 01249 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 01250 *S4 = src[4], *S5 = src[5], *S6 = src[6], 
*S7 = src[7]; 01251 ushort * dst = (ushort*)_dst; 01252 int x = 0; 01253 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 01254 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), 01255 v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), 01256 v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); 01257 01258 for( ; x <= width - 8; x += 8 ) 01259 { 01260 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 01261 v_b1, vld1q_f32(S1 + x)), 01262 v_b2, vld1q_f32(S2 + x)), 01263 v_b3, vld1q_f32(S3 + x)); 01264 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), 01265 v_b5, vld1q_f32(S5 + x)), 01266 v_b6, vld1q_f32(S6 + x)), 01267 v_b7, vld1q_f32(S7 + x)); 01268 float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); 01269 01270 v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 01271 v_b1, vld1q_f32(S1 + x + 4)), 01272 v_b2, vld1q_f32(S2 + x + 4)), 01273 v_b3, vld1q_f32(S3 + x + 4)); 01274 v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), 01275 v_b5, vld1q_f32(S5 + x + 4)), 01276 v_b6, vld1q_f32(S6 + x + 4)), 01277 v_b7, vld1q_f32(S7 + x + 4)); 01278 v_dst1 = vaddq_f32(v_dst0, v_dst1); 01279 01280 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), 01281 vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); 01282 } 01283 01284 return x; 01285 } 01286 }; 01287 01288 struct VResizeLanczos4Vec_32f16s 01289 { 01290 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01291 { 01292 const float** src = (const float**)_src; 01293 const float* beta = (const float*)_beta; 01294 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 01295 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 01296 short * dst = (short*)_dst; 01297 int x = 0; 01298 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 01299 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), 01300 v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), 01301 v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); 01302 01303 for( ; x <= width - 8; x += 8 ) 01304 { 01305 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 01306 v_b1, vld1q_f32(S1 + x)), 01307 v_b2, vld1q_f32(S2 + x)), 01308 v_b3, vld1q_f32(S3 + x)); 01309 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), 01310 v_b5, vld1q_f32(S5 + x)), 01311 v_b6, vld1q_f32(S6 + x)), 01312 v_b7, vld1q_f32(S7 + x)); 01313 float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); 01314 01315 v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 01316 v_b1, vld1q_f32(S1 + x + 4)), 01317 v_b2, vld1q_f32(S2 + x + 4)), 01318 v_b3, vld1q_f32(S3 + x + 4)); 01319 v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), 01320 v_b5, vld1q_f32(S5 + x + 4)), 01321 v_b6, vld1q_f32(S6 + x + 4)), 01322 v_b7, vld1q_f32(S7 + x + 4)); 01323 v_dst1 = vaddq_f32(v_dst0, v_dst1); 01324 01325 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), 01326 vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); 01327 } 01328 01329 return x; 01330 } 01331 }; 01332 01333 struct VResizeLanczos4Vec_32f 01334 { 01335 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 01336 { 01337 const float** src = (const float**)_src; 01338 const float* beta = (const float*)_beta; 01339 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 01340 *S4 = 
src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 01341 float* dst = (float*)_dst; 01342 int x = 0; 01343 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 01344 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), 01345 v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), 01346 v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); 01347 01348 for( ; x <= width - 4; x += 4 ) 01349 { 01350 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 01351 v_b1, vld1q_f32(S1 + x)), 01352 v_b2, vld1q_f32(S2 + x)), 01353 v_b3, vld1q_f32(S3 + x)); 01354 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), 01355 v_b5, vld1q_f32(S5 + x)), 01356 v_b6, vld1q_f32(S6 + x)), 01357 v_b7, vld1q_f32(S7 + x)); 01358 vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); 01359 } 01360 01361 return x; 01362 } 01363 }; 01364 01365 #else 01366 01367 typedef VResizeNoVec VResizeLinearVec_32s8u; 01368 typedef VResizeNoVec VResizeLinearVec_32f16u; 01369 typedef VResizeNoVec VResizeLinearVec_32f16s; 01370 typedef VResizeNoVec VResizeLinearVec_32f; 01371 01372 typedef VResizeNoVec VResizeCubicVec_32s8u; 01373 typedef VResizeNoVec VResizeCubicVec_32f16u; 01374 typedef VResizeNoVec VResizeCubicVec_32f16s; 01375 typedef VResizeNoVec VResizeCubicVec_32f; 01376 01377 typedef VResizeNoVec VResizeLanczos4Vec_32f16u; 01378 typedef VResizeNoVec VResizeLanczos4Vec_32f16s; 01379 typedef VResizeNoVec VResizeLanczos4Vec_32f; 01380 01381 #endif 01382 01383 typedef HResizeNoVec HResizeLinearVec_8u32s; 01384 typedef HResizeNoVec HResizeLinearVec_16u32f; 01385 typedef HResizeNoVec HResizeLinearVec_16s32f; 01386 typedef HResizeNoVec HResizeLinearVec_32f; 01387 typedef HResizeNoVec HResizeLinearVec_64f; 01388 01389 01390 template<typename T, typename WT, typename AT, int ONE, class VecOp> 01391 struct HResizeLinear 01392 { 01393 typedef T value_type; 01394 typedef WT buf_type; 01395 typedef AT alpha_type; 01396 01397 void operator()(const T** src, WT** dst, int count, 01398 const int* xofs, const AT* alpha, 01399 int swidth, int dwidth, int cn, int xmin, int xmax ) const 01400 { 01401 int dx, k; 01402 VecOp vecOp; 01403 01404 int dx0 = vecOp((const uchar**)src, (uchar**)dst, count, 01405 xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax ); 01406 01407 for( k = 0; k <= count - 2; k++ ) 01408 { 01409 const T *S0 = src[k], *S1 = src[k+1]; 01410 WT *D0 = dst[k], *D1 = dst[k+1]; 01411 for( dx = dx0; dx < xmax; dx++ ) 01412 { 01413 int sx = xofs[dx]; 01414 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1]; 01415 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1; 01416 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1; 01417 D0[dx] = t0; D1[dx] = t1; 01418 } 01419 01420 for( ; dx < dwidth; dx++ ) 01421 { 01422 int sx = xofs[dx]; 01423 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE); 01424 } 01425 } 01426 01427 for( ; k < count; k++ ) 01428 { 01429 const T *S = src[k]; 01430 WT *D = dst[k]; 01431 for( dx = 0; dx < xmax; dx++ ) 01432 { 01433 int sx = xofs[dx]; 01434 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1]; 01435 } 01436 01437 for( ; dx < dwidth; dx++ ) 01438 D[dx] = WT(S[xofs[dx]]*ONE); 01439 } 01440 } 01441 }; 01442 01443 01444 template<typename T, typename WT, typename AT, class CastOp, class VecOp> 01445 struct VResizeLinear 01446 { 01447 typedef T value_type; 01448 typedef WT buf_type; 01449 typedef AT alpha_type; 01450 01451 void operator()(const WT** src, T* dst, const AT* beta, int width ) const 01452 { 01453 WT b0 = beta[0], b1 = beta[1]; 01454 const WT *S0 = src[0], 
*S1 = src[1]; 01455 CastOp castOp; 01456 VecOp vecOp; 01457 01458 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 01459 #if CV_ENABLE_UNROLLED 01460 for( ; x <= width - 4; x += 4 ) 01461 { 01462 WT t0, t1; 01463 t0 = S0[x]*b0 + S1[x]*b1; 01464 t1 = S0[x+1]*b0 + S1[x+1]*b1; 01465 dst[x] = castOp(t0); dst[x+1] = castOp(t1); 01466 t0 = S0[x+2]*b0 + S1[x+2]*b1; 01467 t1 = S0[x+3]*b0 + S1[x+3]*b1; 01468 dst[x+2] = castOp(t0); dst[x+3] = castOp(t1); 01469 } 01470 #endif 01471 for( ; x < width; x++ ) 01472 dst[x] = castOp(S0[x]*b0 + S1[x]*b1); 01473 } 01474 }; 01475 01476 template<> 01477 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u> 01478 { 01479 typedef uchar value_type; 01480 typedef int buf_type; 01481 typedef short alpha_type; 01482 01483 void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const 01484 { 01485 alpha_type b0 = beta[0], b1 = beta[1]; 01486 const buf_type *S0 = src[0], *S1 = src[1]; 01487 VResizeLinearVec_32s8u vecOp; 01488 01489 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 01490 #if CV_ENABLE_UNROLLED 01491 for( ; x <= width - 4; x += 4 ) 01492 { 01493 dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2); 01494 dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2); 01495 dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2); 01496 dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2); 01497 } 01498 #endif 01499 for( ; x < width; x++ ) 01500 dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2); 01501 } 01502 }; 01503 01504 01505 template<typename T, typename WT, typename AT> 01506 struct HResizeCubic 01507 { 01508 typedef T value_type; 01509 typedef WT buf_type; 01510 typedef AT alpha_type; 01511 01512 void operator()(const T** src, WT** dst, int count, 01513 const int* xofs, const AT* alpha, 01514 int swidth, int dwidth, int cn, int xmin, int xmax ) const 01515 { 01516 for( int k = 0; k < count; k++ ) 01517 { 01518 const T *S = src[k]; 01519 WT *D = dst[k]; 01520 int dx = 0, limit = xmin; 01521 for(;;) 01522 { 01523 for( ; dx < limit; dx++, alpha += 4 ) 01524 { 01525 int j, sx = xofs[dx] - cn; 01526 WT v = 0; 01527 for( j = 0; j < 4; j++ ) 01528 { 01529 int sxj = sx + j*cn; 01530 if( (unsigned)sxj >= (unsigned)swidth ) 01531 { 01532 while( sxj < 0 ) 01533 sxj += cn; 01534 while( sxj >= swidth ) 01535 sxj -= cn; 01536 } 01537 v += S[sxj]*alpha[j]; 01538 } 01539 D[dx] = v; 01540 } 01541 if( limit == dwidth ) 01542 break; 01543 for( ; dx < xmax; dx++, alpha += 4 ) 01544 { 01545 int sx = xofs[dx]; 01546 D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] + 01547 S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3]; 01548 } 01549 limit = dwidth; 01550 } 01551 alpha -= dwidth*4; 01552 } 01553 } 01554 }; 01555 01556 01557 template<typename T, typename WT, typename AT, class CastOp, class VecOp> 01558 struct VResizeCubic 01559 { 01560 typedef T value_type; 01561 typedef WT buf_type; 01562 typedef AT alpha_type; 01563 01564 void operator()(const WT** src, T* dst, const AT* beta, int width ) const 01565 { 01566 WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; 01567 const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 01568 CastOp castOp; 01569 VecOp vecOp; 01570 01571 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 01572 for( 
; x < width; x++ ) 01573 dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3); 01574 } 01575 }; 01576 01577 01578 template<typename T, typename WT, typename AT> 01579 struct HResizeLanczos4 01580 { 01581 typedef T value_type; 01582 typedef WT buf_type; 01583 typedef AT alpha_type; 01584 01585 void operator()(const T** src, WT** dst, int count, 01586 const int* xofs, const AT* alpha, 01587 int swidth, int dwidth, int cn, int xmin, int xmax ) const 01588 { 01589 for( int k = 0; k < count; k++ ) 01590 { 01591 const T *S = src[k]; 01592 WT *D = dst[k]; 01593 int dx = 0, limit = xmin; 01594 for(;;) 01595 { 01596 for( ; dx < limit; dx++, alpha += 8 ) 01597 { 01598 int j, sx = xofs[dx] - cn*3; 01599 WT v = 0; 01600 for( j = 0; j < 8; j++ ) 01601 { 01602 int sxj = sx + j*cn; 01603 if( (unsigned)sxj >= (unsigned)swidth ) 01604 { 01605 while( sxj < 0 ) 01606 sxj += cn; 01607 while( sxj >= swidth ) 01608 sxj -= cn; 01609 } 01610 v += S[sxj]*alpha[j]; 01611 } 01612 D[dx] = v; 01613 } 01614 if( limit == dwidth ) 01615 break; 01616 for( ; dx < xmax; dx++, alpha += 8 ) 01617 { 01618 int sx = xofs[dx]; 01619 D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] + 01620 S[sx-cn]*alpha[2] + S[sx]*alpha[3] + 01621 S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] + 01622 S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7]; 01623 } 01624 limit = dwidth; 01625 } 01626 alpha -= dwidth*8; 01627 } 01628 } 01629 }; 01630 01631 01632 template<typename T, typename WT, typename AT, class CastOp, class VecOp> 01633 struct VResizeLanczos4 01634 { 01635 typedef T value_type; 01636 typedef WT buf_type; 01637 typedef AT alpha_type; 01638 01639 void operator()(const WT** src, T* dst, const AT* beta, int width ) const 01640 { 01641 CastOp castOp; 01642 VecOp vecOp; 01643 int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 01644 #if CV_ENABLE_UNROLLED 01645 for( ; x <= width - 4; x += 4 ) 01646 { 01647 WT b = beta[0]; 01648 const WT* S = src[0]; 01649 WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b; 01650 01651 for( k = 1; k < 8; k++ ) 01652 { 01653 b = beta[k]; S = src[k]; 01654 s0 += S[x]*b; s1 += S[x+1]*b; 01655 s2 += S[x+2]*b; s3 += S[x+3]*b; 01656 } 01657 01658 dst[x] = castOp(s0); dst[x+1] = castOp(s1); 01659 dst[x+2] = castOp(s2); dst[x+3] = castOp(s3); 01660 } 01661 #endif 01662 for( ; x < width; x++ ) 01663 { 01664 dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] + 01665 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] + 01666 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]); 01667 } 01668 } 01669 }; 01670 01671 01672 static inline int clip(int x, int a, int b) 01673 { 01674 return x >= a ? (x < b ? 
x : b-1) : a; 01675 } 01676 01677 static const int MAX_ESIZE=16; 01678 01679 template <typename HResize, typename VResize> 01680 class resizeGeneric_Invoker : 01681 public ParallelLoopBody 01682 { 01683 public: 01684 typedef typename HResize::value_type T; 01685 typedef typename HResize::buf_type WT; 01686 typedef typename HResize::alpha_type AT; 01687 01688 resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs, 01689 const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize, 01690 int _ksize, int _xmin, int _xmax) : 01691 ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs), 01692 alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize), 01693 ksize(_ksize), xmin(_xmin), xmax(_xmax) 01694 { 01695 CV_Assert(ksize <= MAX_ESIZE); 01696 } 01697 01698 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) 01699 # pragma GCC diagnostic push 01700 # pragma GCC diagnostic ignored "-Warray-bounds" 01701 #endif 01702 virtual void operator() (const Range& range) const 01703 { 01704 int dy, cn = src.channels(); 01705 HResize hresize; 01706 VResize vresize; 01707 01708 int bufstep = (int)alignSize(dsize.width, 16); 01709 AutoBuffer<WT> _buffer(bufstep*ksize); 01710 const T* srows[MAX_ESIZE]={0}; 01711 WT* rows[MAX_ESIZE]={0}; 01712 int prev_sy[MAX_ESIZE]; 01713 01714 for(int k = 0; k < ksize; k++ ) 01715 { 01716 prev_sy[k] = -1; 01717 rows[k] = (WT*)_buffer + bufstep*k; 01718 } 01719 01720 const AT* beta = _beta + ksize * range.start; 01721 01722 for( dy = range.start; dy < range.end; dy++, beta += ksize ) 01723 { 01724 int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; 01725 01726 for(int k = 0; k < ksize; k++ ) 01727 { 01728 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height); 01729 for( k1 = std::max(k1, k); k1 < ksize; k1++ ) 01730 { 01731 if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it. 01732 { 01733 if( k1 > k ) 01734 memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) ); 01735 break; 01736 } 01737 } 01738 if( k1 == ksize ) 01739 k0 = std::min(k0, k); // remember the first row that needs to be computed 01740 srows[k] = src.template ptr<T>(sy); 01741 prev_sy[k] = sy; 01742 } 01743 01744 if( k0 < ksize ) 01745 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha), 01746 ssize.width, dsize.width, cn, xmin, xmax ); 01747 vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); 01748 } 01749 } 01750 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) 01751 # pragma GCC diagnostic pop 01752 #endif 01753 01754 private: 01755 Mat src; 01756 Mat dst; 01757 const int* xofs, *yofs; 01758 const AT* alpha, *_beta; 01759 Size ssize, dsize; 01760 const int ksize, xmin, xmax; 01761 01762 resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&); 01763 }; 01764 01765 template<class HResize, class VResize> 01766 static void resizeGeneric_( const Mat& src, Mat& dst, 01767 const int* xofs, const void* _alpha, 01768 const int* yofs, const void* _beta, 01769 int xmin, int xmax, int ksize ) 01770 { 01771 typedef typename HResize::alpha_type AT; 01772 01773 const AT* beta = (const AT*)_beta; 01774 Size ssize = src.size(), dsize = dst.size(); 01775 int cn = src.channels(); 01776 ssize.width *= cn; 01777 dsize.width *= cn; 01778 xmin *= cn; 01779 xmax *= cn; 01780 // image resize is a separable operation. 
In case of not too strong 01781 01782 Range range(0, dsize.height); 01783 resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, 01784 ssize, dsize, ksize, xmin, xmax); 01785 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 01786 } 01787 01788 template <typename T, typename WT> 01789 struct ResizeAreaFastNoVec 01790 { 01791 ResizeAreaFastNoVec(int, int) { } 01792 ResizeAreaFastNoVec(int, int, int, int) { } 01793 int operator() (const T*, T*, int) const 01794 { return 0; } 01795 }; 01796 01797 #if CV_NEON 01798 01799 class ResizeAreaFastVec_SIMD_8u 01800 { 01801 public: 01802 ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : 01803 cn(_cn), step(_step) 01804 { 01805 } 01806 01807 int operator() (const uchar* S, uchar* D, int w) const 01808 { 01809 int dx = 0; 01810 const uchar* S0 = S, * S1 = S0 + step; 01811 01812 uint16x8_t v_2 = vdupq_n_u16(2); 01813 01814 if (cn == 1) 01815 { 01816 for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) 01817 { 01818 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); 01819 01820 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); 01821 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); 01822 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); 01823 01824 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); 01825 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); 01826 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); 01827 01828 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); 01829 } 01830 } 01831 else if (cn == 4) 01832 { 01833 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 01834 { 01835 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); 01836 01837 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); 01838 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); 01839 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); 01840 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); 01841 01842 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), 01843 vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); 01844 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), 01845 vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); 01846 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); 01847 01848 vst1_u8(D, vmovn_u16(v_dst)); 01849 } 01850 } 01851 01852 return dx; 01853 } 01854 01855 private: 01856 int cn, step; 01857 }; 01858 01859 class ResizeAreaFastVec_SIMD_16u 01860 { 01861 public: 01862 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : 01863 cn(_cn), step(_step) 01864 { 01865 } 01866 01867 int operator() (const ushort * S, ushort * D, int w) const 01868 { 01869 int dx = 0; 01870 const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); 01871 01872 uint32x4_t v_2 = vdupq_n_u32(2); 01873 01874 if (cn == 1) 01875 { 01876 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 01877 { 01878 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); 01879 01880 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); 01881 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); 01882 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); 01883 01884 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); 01885 
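// (editorial note) the next two statements add the matching horizontal pair sums from the
// second source row, then average the four contributing pixels by adding the rounding
// constant 2 and shifting right by 2.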
v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); 01886 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); 01887 01888 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); 01889 } 01890 } 01891 else if (cn == 4) 01892 { 01893 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 01894 { 01895 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); 01896 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), 01897 vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); 01898 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); 01899 } 01900 } 01901 01902 return dx; 01903 } 01904 01905 private: 01906 int cn, step; 01907 }; 01908 01909 class ResizeAreaFastVec_SIMD_16s 01910 { 01911 public: 01912 ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : 01913 cn(_cn), step(_step) 01914 { 01915 } 01916 01917 int operator() (const short * S, short * D, int w) const 01918 { 01919 int dx = 0; 01920 const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); 01921 01922 int32x4_t v_2 = vdupq_n_s32(2); 01923 01924 if (cn == 1) 01925 { 01926 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 01927 { 01928 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); 01929 01930 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); 01931 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); 01932 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); 01933 01934 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); 01935 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); 01936 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); 01937 01938 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); 01939 } 01940 } 01941 else if (cn == 4) 01942 { 01943 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 01944 { 01945 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); 01946 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), 01947 vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); 01948 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); 01949 } 01950 } 01951 01952 return dx; 01953 } 01954 01955 private: 01956 int cn, step; 01957 }; 01958 01959 struct ResizeAreaFastVec_SIMD_32f 01960 { 01961 ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : 01962 cn(_cn), step(_step) 01963 { 01964 fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); 01965 } 01966 01967 int operator() (const float * S, float * D, int w) const 01968 { 01969 if (!fast_mode) 01970 return 0; 01971 01972 const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); 01973 int dx = 0; 01974 01975 float32x4_t v_025 = vdupq_n_f32(0.25f); 01976 01977 if (cn == 1) 01978 { 01979 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 01980 { 01981 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); 01982 01983 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); 01984 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); 01985 01986 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); 01987 } 01988 } 01989 else if (cn == 4) 01990 { 01991 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 01992 { 01993 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); 01994 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 
4)); 01995 01996 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); 01997 } 01998 } 01999 02000 return dx; 02001 } 02002 02003 private: 02004 int cn; 02005 bool fast_mode; 02006 int step; 02007 }; 02008 02009 #elif CV_SSE2 02010 02011 class ResizeAreaFastVec_SIMD_8u 02012 { 02013 public: 02014 ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : 02015 cn(_cn), step(_step) 02016 { 02017 use_simd = checkHardwareSupport(CV_CPU_SSE2); 02018 } 02019 02020 int operator() (const uchar* S, uchar* D, int w) const 02021 { 02022 if (!use_simd) 02023 return 0; 02024 02025 int dx = 0; 02026 const uchar* S0 = S; 02027 const uchar* S1 = S0 + step; 02028 __m128i zero = _mm_setzero_si128(); 02029 __m128i delta2 = _mm_set1_epi16(2); 02030 02031 if (cn == 1) 02032 { 02033 __m128i masklow = _mm_set1_epi16(0x00ff); 02034 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 02035 { 02036 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02037 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02038 02039 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); 02040 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); 02041 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); 02042 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); 02043 02044 _mm_storel_epi64((__m128i*)D, s0); 02045 } 02046 } 02047 else if (cn == 3) 02048 for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) 02049 { 02050 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02051 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02052 02053 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); 02054 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); 02055 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); 02056 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); 02057 02058 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); 02059 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); 02060 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 02061 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); 02062 _mm_storel_epi64((__m128i*)D, s0); 02063 02064 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); 02065 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); 02066 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 02067 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); 02068 _mm_storel_epi64((__m128i*)(D+3), s0); 02069 } 02070 else 02071 { 02072 CV_Assert(cn == 4); 02073 int v[] = { 0, 0, -1, -1 }; 02074 __m128i mask = _mm_loadu_si128((const __m128i*)v); 02075 02076 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 02077 { 02078 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02079 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02080 02081 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); 02082 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); 02083 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); 02084 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); 02085 02086 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); 02087 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); 02088 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 02089 __m128i res0 = _mm_srli_epi16(s0, 2); 02090 02091 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); 02092 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); 02093 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 02094 __m128i res1 = _mm_srli_epi16(s0, 2); 02095 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), 02096 _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); 02097 
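// (editorial note) after the pack above, the low 64 bits of s0 hold two averaged
// 4-channel output pixels (8 bytes), which the following store writes out.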
_mm_storel_epi64((__m128i*)(D), s0); 02098 } 02099 } 02100 02101 return dx; 02102 } 02103 02104 private: 02105 int cn; 02106 bool use_simd; 02107 int step; 02108 }; 02109 02110 class ResizeAreaFastVec_SIMD_16u 02111 { 02112 public: 02113 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : 02114 cn(_cn), step(_step) 02115 { 02116 use_simd = checkHardwareSupport(CV_CPU_SSE2); 02117 } 02118 02119 int operator() (const ushort* S, ushort* D, int w) const 02120 { 02121 if (!use_simd) 02122 return 0; 02123 02124 int dx = 0; 02125 const ushort* S0 = (const ushort*)S; 02126 const ushort* S1 = (const ushort*)((const uchar*)(S) + step); 02127 __m128i masklow = _mm_set1_epi32(0x0000ffff); 02128 __m128i zero = _mm_setzero_si128(); 02129 __m128i delta2 = _mm_set1_epi32(2); 02130 02131 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) 02132 02133 if (cn == 1) 02134 { 02135 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 02136 { 02137 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02138 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02139 02140 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); 02141 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); 02142 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); 02143 s0 = _mm_srli_epi32(s0, 2); 02144 s0 = _mm_packus_epi32(s0, zero); 02145 02146 _mm_storel_epi64((__m128i*)D, s0); 02147 } 02148 } 02149 else if (cn == 3) 02150 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) 02151 { 02152 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02153 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02154 02155 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); 02156 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); 02157 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); 02158 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); 02159 02160 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); 02161 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); 02162 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); 02163 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); 02164 _mm_storel_epi64((__m128i*)D, s0); 02165 } 02166 else 02167 { 02168 CV_Assert(cn == 4); 02169 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 02170 { 02171 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02172 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02173 02174 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); 02175 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); 02176 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); 02177 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); 02178 02179 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); 02180 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); 02181 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); 02182 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); 02183 _mm_storel_epi64((__m128i*)D, s0); 02184 } 02185 } 02186 02187 #undef _mm_packus_epi32 02188 02189 return dx; 02190 } 02191 02192 private: 02193 int cn; 02194 int step; 02195 bool use_simd; 02196 }; 02197 02198 class ResizeAreaFastVec_SIMD_16s 02199 { 02200 public: 02201 ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : 02202 cn(_cn), step(_step) 02203 { 02204 use_simd = checkHardwareSupport(CV_CPU_SSE2); 02205 } 02206 02207 int operator() (const short* S, short* D, int w) const 02208 { 02209 if (!use_simd) 02210 return 0; 02211 02212 int dx = 0; 02213 const short* S0 = (const short*)S; 02214 const short* S1 = (const short*)((const uchar*)(S) + step); 02215 __m128i masklow = 
_mm_set1_epi32(0x0000ffff); 02216 __m128i zero = _mm_setzero_si128(); 02217 __m128i delta2 = _mm_set1_epi32(2); 02218 02219 if (cn == 1) 02220 { 02221 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 02222 { 02223 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02224 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02225 02226 __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), 02227 _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); 02228 __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), 02229 _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); 02230 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); 02231 s0 = _mm_srai_epi32(s0, 2); 02232 s0 = _mm_packs_epi32(s0, zero); 02233 02234 _mm_storel_epi64((__m128i*)D, s0); 02235 } 02236 } 02237 else if (cn == 3) 02238 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) 02239 { 02240 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02241 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02242 02243 __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); 02244 __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); 02245 __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); 02246 __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); 02247 02248 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); 02249 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); 02250 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); 02251 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); 02252 _mm_storel_epi64((__m128i*)D, s0); 02253 } 02254 else 02255 { 02256 CV_Assert(cn == 4); 02257 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 02258 { 02259 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 02260 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 02261 02262 __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); 02263 __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); 02264 __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); 02265 __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); 02266 02267 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); 02268 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); 02269 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); 02270 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); 02271 _mm_storel_epi64((__m128i*)D, s0); 02272 } 02273 } 02274 02275 return dx; 02276 } 02277 02278 private: 02279 int cn; 02280 int step; 02281 bool use_simd; 02282 }; 02283 02284 struct ResizeAreaFastVec_SIMD_32f 02285 { 02286 ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : 02287 cn(_cn), step(_step) 02288 { 02289 fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); 02290 fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); 02291 } 02292 02293 int operator() (const float * S, float * D, int w) const 02294 { 02295 if (!fast_mode) 02296 return 0; 02297 02298 const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); 02299 int dx = 0; 02300 02301 __m128 v_025 = _mm_set1_ps(0.25f); 02302 02303 if (cn == 1) 02304 { 02305 const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); 02306 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 02307 { 02308 __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), 02309 v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); 02310 02311 __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), 02312 _mm_shuffle_ps(v_row00, 
v_row01, shuffle_hi)); 02313 __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), 02314 _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); 02315 02316 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); 02317 } 02318 } 02319 else if (cn == 4) 02320 { 02321 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 02322 { 02323 __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); 02324 __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); 02325 02326 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); 02327 } 02328 } 02329 02330 return dx; 02331 } 02332 02333 private: 02334 int cn; 02335 bool fast_mode; 02336 int step; 02337 }; 02338 02339 #else 02340 02341 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u; 02342 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u; 02343 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s; 02344 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f; 02345 02346 #endif 02347 02348 template<typename T, typename SIMDVecOp> 02349 struct ResizeAreaFastVec 02350 { 02351 ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : 02352 scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) 02353 { 02354 fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); 02355 } 02356 02357 int operator() (const T* S, T* D, int w) const 02358 { 02359 if (!fast_mode) 02360 return 0; 02361 02362 const T* nextS = (const T*)((const uchar*)S + step); 02363 int dx = vecOp(S, D, w); 02364 02365 if (cn == 1) 02366 for( ; dx < w; ++dx ) 02367 { 02368 int index = dx*2; 02369 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2); 02370 } 02371 else if (cn == 3) 02372 for( ; dx < w; dx += 3 ) 02373 { 02374 int index = dx*2; 02375 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2); 02376 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2); 02377 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2); 02378 } 02379 else 02380 { 02381 CV_Assert(cn == 4); 02382 for( ; dx < w; dx += 4 ) 02383 { 02384 int index = dx*2; 02385 D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2); 02386 D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2); 02387 D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2); 02388 D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2); 02389 } 02390 } 02391 02392 return dx; 02393 } 02394 02395 private: 02396 int scale_x, scale_y; 02397 int cn; 02398 bool fast_mode; 02399 int step; 02400 SIMDVecOp vecOp; 02401 }; 02402 02403 template <typename T, typename WT, typename VecOp> 02404 class resizeAreaFast_Invoker : 02405 public ParallelLoopBody 02406 { 02407 public: 02408 resizeAreaFast_Invoker(const Mat &_src, Mat &_dst, 02409 int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) : 02410 ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x), 02411 scale_y(_scale_y), ofs(_ofs), xofs(_xofs) 02412 { 02413 } 02414 02415 virtual void operator() (const Range& range) const 02416 { 02417 Size ssize = src.size(), dsize = dst.size(); 02418 int cn = src.channels(); 02419 int area = scale_x*scale_y; 02420 float scale = 1.f/(area); 02421 int dwidth1 = (ssize.width/scale_x)*cn; 02422 dsize.width *= cn; 02423 ssize.width *= cn; 02424 int dy, dx, k = 0; 02425 02426 
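// (editorial note) fast INTER_AREA path: the scale factors are integers, so each output
// pixel is the plain average of a scale_x x scale_y block of source pixels. The SIMD
// functor below handles the leading part of every row; the scalar loops that follow cover
// the tail and the partially covered pixels near the right and bottom borders.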
VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/); 02427 02428 for( dy = range.start; dy < range.end; dy++ ) 02429 { 02430 T* D = (T*)(dst.data + dst.step*dy); 02431 int sy0 = dy*scale_y; 02432 int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0; 02433 02434 if( sy0 >= ssize.height ) 02435 { 02436 for( dx = 0; dx < dsize.width; dx++ ) 02437 D[dx] = 0; 02438 continue; 02439 } 02440 02441 dx = vop(src.template ptr<T>(sy0), D, w); 02442 for( ; dx < w; dx++ ) 02443 { 02444 const T* S = src.template ptr<T>(sy0) + xofs[dx]; 02445 WT sum = 0; 02446 k = 0; 02447 #if CV_ENABLE_UNROLLED 02448 for( ; k <= area - 4; k += 4 ) 02449 sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; 02450 #endif 02451 for( ; k < area; k++ ) 02452 sum += S[ofs[k]]; 02453 02454 D[dx] = saturate_cast<T>(sum * scale); 02455 } 02456 02457 for( ; dx < dsize.width; dx++ ) 02458 { 02459 WT sum = 0; 02460 int count = 0, sx0 = xofs[dx]; 02461 if( sx0 >= ssize.width ) 02462 D[dx] = 0; 02463 02464 for( int sy = 0; sy < scale_y; sy++ ) 02465 { 02466 if( sy0 + sy >= ssize.height ) 02467 break; 02468 const T* S = src.template ptr<T>(sy0 + sy) + sx0; 02469 for( int sx = 0; sx < scale_x*cn; sx += cn ) 02470 { 02471 if( sx0 + sx >= ssize.width ) 02472 break; 02473 sum += S[sx]; 02474 count++; 02475 } 02476 } 02477 02478 D[dx] = saturate_cast<T>((float)sum/count); 02479 } 02480 } 02481 } 02482 02483 private: 02484 Mat src; 02485 Mat dst; 02486 int scale_x, scale_y; 02487 const int *ofs, *xofs; 02488 }; 02489 02490 template<typename T, typename WT, typename VecOp> 02491 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs, 02492 int scale_x, int scale_y ) 02493 { 02494 Range range(0, dst.rows); 02495 resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x, 02496 scale_y, ofs, xofs); 02497 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 02498 } 02499 02500 struct DecimateAlpha 02501 { 02502 int si, di; 02503 float alpha; 02504 }; 02505 02506 02507 template<typename T, typename WT> class ResizeArea_Invoker : 02508 public ParallelLoopBody 02509 { 02510 public: 02511 ResizeArea_Invoker( const Mat& _src, Mat& _dst, 02512 const DecimateAlpha* _xtab, int _xtab_size, 02513 const DecimateAlpha* _ytab, int _ytab_size, 02514 const int* _tabofs ) 02515 { 02516 src = &_src; 02517 dst = &_dst; 02518 xtab0 = _xtab; 02519 xtab_size0 = _xtab_size; 02520 ytab = _ytab; 02521 ytab_size = _ytab_size; 02522 tabofs = _tabofs; 02523 } 02524 02525 virtual void operator() (const Range& range) const 02526 { 02527 Size dsize = dst->size(); 02528 int cn = dst->channels(); 02529 dsize.width *= cn; 02530 AutoBuffer<WT> _buffer(dsize.width*2); 02531 const DecimateAlpha* xtab = xtab0; 02532 int xtab_size = xtab_size0; 02533 WT *buf = _buffer, *sum = buf + dsize.width; 02534 int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di; 02535 02536 for( dx = 0; dx < dsize.width; dx++ ) 02537 sum[dx] = (WT)0; 02538 02539 for( j = j_start; j < j_end; j++ ) 02540 { 02541 WT beta = ytab[j].alpha; 02542 int dy = ytab[j].di; 02543 int sy = ytab[j].si; 02544 02545 { 02546 const T* S = src->template ptr<T>(sy); 02547 for( dx = 0; dx < dsize.width; dx++ ) 02548 buf[dx] = (WT)0; 02549 02550 if( cn == 1 ) 02551 for( k = 0; k < xtab_size; k++ ) 02552 { 02553 int dxn = xtab[k].di; 02554 WT alpha = xtab[k].alpha; 02555 buf[dxn] += S[xtab[k].si]*alpha; 02556 } 02557 else if( cn == 2 ) 02558 for( k = 0; k < xtab_size; k++ ) 02559 { 02560 int sxn = xtab[k].si; 02561 int 
dxn = xtab[k].di; 02562 WT alpha = xtab[k].alpha; 02563 WT t0 = buf[dxn] + S[sxn]*alpha; 02564 WT t1 = buf[dxn+1] + S[sxn+1]*alpha; 02565 buf[dxn] = t0; buf[dxn+1] = t1; 02566 } 02567 else if( cn == 3 ) 02568 for( k = 0; k < xtab_size; k++ ) 02569 { 02570 int sxn = xtab[k].si; 02571 int dxn = xtab[k].di; 02572 WT alpha = xtab[k].alpha; 02573 WT t0 = buf[dxn] + S[sxn]*alpha; 02574 WT t1 = buf[dxn+1] + S[sxn+1]*alpha; 02575 WT t2 = buf[dxn+2] + S[sxn+2]*alpha; 02576 buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; 02577 } 02578 else if( cn == 4 ) 02579 { 02580 for( k = 0; k < xtab_size; k++ ) 02581 { 02582 int sxn = xtab[k].si; 02583 int dxn = xtab[k].di; 02584 WT alpha = xtab[k].alpha; 02585 WT t0 = buf[dxn] + S[sxn]*alpha; 02586 WT t1 = buf[dxn+1] + S[sxn+1]*alpha; 02587 buf[dxn] = t0; buf[dxn+1] = t1; 02588 t0 = buf[dxn+2] + S[sxn+2]*alpha; 02589 t1 = buf[dxn+3] + S[sxn+3]*alpha; 02590 buf[dxn+2] = t0; buf[dxn+3] = t1; 02591 } 02592 } 02593 else 02594 { 02595 for( k = 0; k < xtab_size; k++ ) 02596 { 02597 int sxn = xtab[k].si; 02598 int dxn = xtab[k].di; 02599 WT alpha = xtab[k].alpha; 02600 for( int c = 0; c < cn; c++ ) 02601 buf[dxn + c] += S[sxn + c]*alpha; 02602 } 02603 } 02604 } 02605 02606 if( dy != prev_dy ) 02607 { 02608 T* D = dst->template ptr<T>(prev_dy); 02609 02610 for( dx = 0; dx < dsize.width; dx++ ) 02611 { 02612 D[dx] = saturate_cast<T>(sum[dx]); 02613 sum[dx] = beta*buf[dx]; 02614 } 02615 prev_dy = dy; 02616 } 02617 else 02618 { 02619 for( dx = 0; dx < dsize.width; dx++ ) 02620 sum[dx] += beta*buf[dx]; 02621 } 02622 } 02623 02624 { 02625 T* D = dst->template ptr<T>(prev_dy); 02626 for( dx = 0; dx < dsize.width; dx++ ) 02627 D[dx] = saturate_cast<T>(sum[dx]); 02628 } 02629 } 02630 02631 private: 02632 const Mat* src; 02633 Mat* dst; 02634 const DecimateAlpha* xtab0; 02635 const DecimateAlpha* ytab; 02636 int xtab_size0, ytab_size; 02637 const int* tabofs; 02638 }; 02639 02640 02641 template <typename T, typename WT> 02642 static void resizeArea_( const Mat& src, Mat& dst, 02643 const DecimateAlpha* xtab, int xtab_size, 02644 const DecimateAlpha* ytab, int ytab_size, 02645 const int* tabofs ) 02646 { 02647 parallel_for_(Range(0, dst.rows), 02648 ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs), 02649 dst.total()/((double)(1 << 16))); 02650 } 02651 02652 02653 typedef void (*ResizeFunc)( const Mat& src, Mat& dst, 02654 const int* xofs, const void* alpha, 02655 const int* yofs, const void* beta, 02656 int xmin, int xmax, int ksize ); 02657 02658 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, 02659 const int* ofs, const int *xofs, 02660 int scale_x, int scale_y ); 02661 02662 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, 02663 const DecimateAlpha* xtab, int xtab_size, 02664 const DecimateAlpha* ytab, int ytab_size, 02665 const int* yofs); 02666 02667 02668 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab ) 02669 { 02670 int k = 0; 02671 for(int dx = 0; dx < dsize; dx++ ) 02672 { 02673 double fsx1 = dx * scale; 02674 double fsx2 = fsx1 + scale; 02675 double cellWidth = std::min(scale, ssize - fsx1); 02676 02677 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); 02678 02679 sx2 = std::min(sx2, ssize - 1); 02680 sx1 = std::min(sx1, sx2); 02681 02682 if( sx1 - fsx1 > 1e-3 ) 02683 { 02684 assert( k < ssize*2 ); 02685 tab[k].di = dx * cn; 02686 tab[k].si = (sx1 - 1) * cn; 02687 tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth); 02688 } 02689 02690 for(int sx = sx1; sx < sx2; sx++ ) 
02691 { 02692 assert( k < ssize*2 ); 02693 tab[k].di = dx * cn; 02694 tab[k].si = sx * cn; 02695 tab[k++].alpha = float(1.0 / cellWidth); 02696 } 02697 02698 if( fsx2 - sx2 > 1e-3 ) 02699 { 02700 assert( k < ssize*2 ); 02701 tab[k].di = dx * cn; 02702 tab[k].si = sx2 * cn; 02703 tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); 02704 } 02705 } 02706 return k; 02707 } 02708 02709 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; } 02710 02711 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \ 02712 func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \ 02713 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\ 02714 specBuf.allocate(specSize);\ 02715 pSpec = (uchar*)specBuf;\ 02716 CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec)); 02717 02718 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \ 02719 if (mode == (int)ippCubic) { *ok = false; return; } \ 02720 func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \ 02721 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\ 02722 specBuf.allocate(specSize);\ 02723 pSpec = (uchar*)specBuf;\ 02724 CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\ 02725 getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\ 02726 getSrcOffsetFunc = (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE; 02727 02728 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \ 02729 func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \ 02730 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\ 02731 specBuf.allocate(specSize);\ 02732 pSpec = (uchar*)specBuf;\ 02733 AutoBuffer<uchar> buf(initSize);\ 02734 uchar* pInit = (uchar*)buf;\ 02735 CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit)); 02736 02737 #define SET_IPP_RESIZE_PTR(TYPE, CN) \ 02738 if (mode == (int)ippLinear) { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \ 02739 else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \ 02740 else { *ok = false; return; } \ 02741 getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \ 02742 getSrcOffsetFunc = (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE; 02743 02744 #if IPP_VERSION_X100 >= 710 02745 class IPPresizeInvoker : 02746 public ParallelLoopBody 02747 { 02748 public: 02749 IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) : 02750 ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), 02751 inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode), 02752 func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok) 02753 { 02754 *ok = true; 02755 IppiSize srcSize, dstSize; 02756 int type = src.type(), specSize = 0, initSize = 0; 02757 srcSize.width = src.cols; 02758 srcSize.height = src.rows; 02759 dstSize.width = dst.cols; 02760 dstSize.height = dst.rows; 02761 02762 switch (type) 02763 { 02764 #if IPP_DISABLE_BLOCK // disabled since it breaks tests for CascadeClassifier 02765 case CV_8UC1: SET_IPP_RESIZE_PTR(8u,C1); break; 02766 case CV_8UC3: SET_IPP_RESIZE_PTR(8u,C3); break; 02767 case CV_8UC4: SET_IPP_RESIZE_PTR(8u,C4); break; 02768 #endif 02769 case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break; 02770 case CV_16UC3: 
SET_IPP_RESIZE_PTR(16u,C3); break; 02771 case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break; 02772 case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break; 02773 case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break; 02774 case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break; 02775 case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break; 02776 case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break; 02777 case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break; 02778 case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break; 02779 case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break; 02780 case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break; 02781 default: { *ok = false; return; } break; 02782 } 02783 } 02784 02785 ~IPPresizeInvoker() 02786 { 02787 } 02788 02789 virtual void operator() (const Range& range) const 02790 { 02791 if (*ok == false) 02792 return; 02793 02794 int cn = src.channels(); 02795 int dsty = min(cvRound(range.start * inv_scale_y), dst.rows); 02796 int dstwidth = min(cvRound(src.cols * inv_scale_x), dst.cols); 02797 int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows); 02798 02799 IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0}; 02800 IppiSize dstSize = { dstwidth, dstheight - dsty }; 02801 int bufsize = 0, itemSize = (int)src.elemSize1(); 02802 02803 CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize)); 02804 CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset)); 02805 02806 const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize; 02807 Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize; 02808 02809 AutoBuffer<uchar> buf(bufsize + 64); 02810 uchar* bufptr = alignPtr((uchar*)buf, 32); 02811 02812 if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 ) 02813 *ok = false; 02814 else 02815 { 02816 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 02817 } 02818 } 02819 private: 02820 const Mat & src; 02821 Mat & dst; 02822 double inv_scale_x; 02823 double inv_scale_y; 02824 void *pSpec; 02825 AutoBuffer<uchar> specBuf; 02826 int mode; 02827 ippiResizeFunc func; 02828 ippiResizeGetBufferSize getBufferSizeFunc; 02829 ippiResizeGetSrcOffset getSrcOffsetFunc; 02830 bool *ok; 02831 const IPPresizeInvoker& operator= (const IPPresizeInvoker&); 02832 }; 02833 02834 #endif 02835 02836 #ifdef HAVE_OPENCL 02837 02838 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, 02839 float * const alpha_tab, int * const ofs_tab) 02840 { 02841 int k = 0, dx = 0; 02842 for ( ; dx < dsize; dx++) 02843 { 02844 ofs_tab[dx] = k; 02845 02846 double fsx1 = dx * scale; 02847 double fsx2 = fsx1 + scale; 02848 double cellWidth = std::min(scale, ssize - fsx1); 02849 02850 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); 02851 02852 sx2 = std::min(sx2, ssize - 1); 02853 sx1 = std::min(sx1, sx2); 02854 02855 if (sx1 - fsx1 > 1e-3) 02856 { 02857 map_tab[k] = sx1 - 1; 02858 alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth); 02859 } 02860 02861 for (int sx = sx1; sx < sx2; sx++) 02862 { 02863 map_tab[k] = sx; 02864 alpha_tab[k++] = float(1.0 / cellWidth); 02865 } 02866 02867 if (fsx2 - sx2 > 1e-3) 02868 { 02869 map_tab[k] = sx2; 02870 alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); 02871 } 02872 } 02873 ofs_tab[dx] = k; 02874 } 02875 02876 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, 02877 double fx, double fy, int interpolation) 02878 { 02879 int type = _src.type(), depth = 
CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 02880 02881 double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy; 02882 float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; 02883 int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fx); 02884 bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && 02885 std::abs(inv_fy - iscale_y) < DBL_EPSILON; 02886 02887 // in case of scale_x && scale_y is equal to 2 02888 // INTER_AREA (fast) also is equal to INTER_LINEAR 02889 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) 02890 /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower 02891 02892 if( !(cn <= 4 && 02893 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || 02894 (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) 02895 return false; 02896 02897 UMat src = _src.getUMat(); 02898 _dst.create(dsize, type); 02899 UMat dst = _dst.getUMat(); 02900 02901 Size ssize = src.size(); 02902 ocl::Kernel k; 02903 size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows }; 02904 02905 ocl::Image2D srcImage; 02906 02907 // See if this could be done with a sampler. We stick with integer 02908 // datatypes because the observed error is low. 02909 bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && 02910 ocl::Image2D::canCreateAlias(src) && depth <= 4 && 02911 ocl::Image2D::isFormatSupported(depth, cn, true) && 02912 src.offset==0); 02913 if (useSampler) 02914 { 02915 int wdepth = std::max(depth, CV_32S); 02916 char buf[2][32]; 02917 cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " 02918 "-D convertToDT=%s -D cn=%d", 02919 depth, ocl::typeToStr(type), ocl::typeToStr(depth), 02920 ocl::convertTypeStr(wdepth, depth, cn, buf[1]), 02921 cn); 02922 k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); 02923 02924 if (k.empty()) 02925 useSampler = false; 02926 else 02927 { 02928 // Convert the input into an OpenCL image type, using normalized channel data types 02929 // and aliasing the UMat. 
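// (editorial note) in the USE_SAMPLER case the 'resizeSampler' kernel relies on the OpenCL
// image sampler to perform the linear interpolation; the host side only has to pass the
// inverse scale factors, as set up below.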
02930 srcImage = ocl::Image2D(src, true, true); 02931 k.args(srcImage, ocl::KernelArg::WriteOnly(dst), 02932 (float)inv_fx, (float)inv_fy); 02933 } 02934 } 02935 02936 if (interpolation == INTER_LINEAR && !useSampler) 02937 { 02938 char buf[2][32]; 02939 02940 // integer path is slower because of CPU part, so it's disabled 02941 if (depth == CV_8U && ((void)0, 0)) 02942 { 02943 AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); 02944 int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width; 02945 short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2; 02946 float fxx, fyy; 02947 int sx, sy; 02948 02949 for (int dx = 0; dx < dsize.width; dx++) 02950 { 02951 fxx = (float)((dx+0.5)*inv_fx - 0.5); 02952 sx = cvFloor(fxx); 02953 fxx -= sx; 02954 02955 if (sx < 0) 02956 fxx = 0, sx = 0; 02957 02958 if (sx >= ssize.width-1) 02959 fxx = 0, sx = ssize.width-1; 02960 02961 xofs[dx] = sx; 02962 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE); 02963 ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE); 02964 } 02965 02966 for (int dy = 0; dy < dsize.height; dy++) 02967 { 02968 fyy = (float)((dy+0.5)*inv_fy - 0.5); 02969 sy = cvFloor(fyy); 02970 fyy -= sy; 02971 02972 yofs[dy] = sy; 02973 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE); 02974 ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE); 02975 } 02976 02977 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); 02978 UMat coeffs; 02979 Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs); 02980 02981 k.create("resizeLN", ocl::imgproc::resize_oclsrc, 02982 format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s " 02983 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " 02984 "-D INTER_RESIZE_COEF_BITS=%d", 02985 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), 02986 ocl::convertTypeStr(depth, wdepth, cn, buf[0]), 02987 ocl::convertTypeStr(wdepth, depth, cn, buf[1]), 02988 cn, INTER_RESIZE_COEF_BITS)); 02989 if (k.empty()) 02990 return false; 02991 02992 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), 02993 ocl::KernelArg::PtrReadOnly(coeffs)); 02994 } 02995 else 02996 { 02997 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); 02998 k.create("resizeLN", ocl::imgproc::resize_oclsrc, 02999 format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s " 03000 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " 03001 "-D INTER_RESIZE_COEF_BITS=%d", 03002 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), 03003 ocl::convertTypeStr(depth, wdepth, cn, buf[0]), 03004 ocl::convertTypeStr(wdepth, depth, cn, buf[1]), 03005 cn, INTER_RESIZE_COEF_BITS)); 03006 if (k.empty()) 03007 return false; 03008 03009 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), 03010 (float)inv_fx, (float)inv_fy); 03011 } 03012 } 03013 else if (interpolation == INTER_NEAREST) 03014 { 03015 k.create("resizeNN", ocl::imgproc::resize_oclsrc, 03016 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d", 03017 ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn)); 03018 if (k.empty()) 03019 return false; 03020 03021 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), 03022 (float)inv_fx, (float)inv_fy); 03023 } 03024 else if (interpolation == INTER_AREA) 03025 { 03026 int wdepth = std::max(depth, is_area_fast ? 
CV_32S : CV_32F); 03027 int wtype = CV_MAKE_TYPE(wdepth, cn); 03028 03029 char cvt[2][40]; 03030 String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", 03031 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), 03032 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); 03033 03034 UMat alphaOcl, tabofsOcl, mapOcl; 03035 UMat dmap, smap; 03036 03037 if (is_area_fast) 03038 { 03039 int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); 03040 buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" 03041 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", 03042 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), 03043 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), 03044 iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); 03045 03046 k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); 03047 if (k.empty()) 03048 return false; 03049 } 03050 else 03051 { 03052 buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); 03053 k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); 03054 if (k.empty()) 03055 return false; 03056 03057 int xytab_size = (ssize.width + ssize.height) << 1; 03058 int tabofs_size = dsize.height + dsize.width + 2; 03059 03060 AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); 03061 AutoBuffer<float> _xyalpha_tab(xytab_size); 03062 int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); 03063 float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); 03064 int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; 03065 03066 ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab); 03067 ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab); 03068 03069 // loading precomputed arrays to GPU 03070 Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl); 03071 Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl); 03072 Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl); 03073 } 03074 03075 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); 03076 03077 if (is_area_fast) 03078 k.args(srcarg, dstarg); 03079 else 03080 k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), 03081 ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); 03082 03083 return k.run(2, globalsize, NULL, false); 03084 } 03085 03086 return k.run(2, globalsize, 0, false); 03087 } 03088 03089 #endif 03090 03091 #if IPP_VERSION_X100 >= 710 03092 static bool ipp_resize_mt( Mat src, Mat dst, 03093 double inv_scale_x, double inv_scale_y, int interpolation) 03094 { 03095 int mode = -1; 03096 if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2) 03097 mode = ippLinear; 03098 else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4) 03099 mode = ippCubic; 03100 else 03101 return false; 03102 03103 bool ok = true; 03104 Range range(0, src.rows); 03105 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok); 03106 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 03107 if( ok ) 03108 return true; 03109 03110 return false; 03111 } 03112 #endif 03113 03114 } 03115 03116 03117 03118 ////////////////////////////////////////////////////////////////////////////////////////// 03119 
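//
// (editorial note) A minimal usage sketch of the public entry point defined below, assuming
// the usual OpenCV headers are included; the file name is hypothetical:
//
//     cv::Mat src = cv::imread("input.png");
//     cv::Mat dst;
//     // explicit destination size: the scale factors are recomputed from dsize
//     cv::resize(src, dst, cv::Size(640, 480), 0, 0, cv::INTER_LINEAR);
//     // scale factors only: dsize is derived as saturate_cast<int>(src.size() * factor)
//     cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
//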
03120 void cv::resize( InputArray _src, OutputArray _dst, Size dsize, 03121 double inv_scale_x, double inv_scale_y, int interpolation ) 03122 { 03123 static ResizeFunc linear_tab[] = 03124 { 03125 resizeGeneric_< 03126 HResizeLinear<uchar, int, short, 03127 INTER_RESIZE_COEF_SCALE, 03128 HResizeLinearVec_8u32s>, 03129 VResizeLinear<uchar, int, short, 03130 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, 03131 VResizeLinearVec_32s8u> >, 03132 0, 03133 resizeGeneric_< 03134 HResizeLinear<ushort, float, float, 1, 03135 HResizeLinearVec_16u32f>, 03136 VResizeLinear<ushort, float, float, Cast<float, ushort>, 03137 VResizeLinearVec_32f16u> >, 03138 resizeGeneric_< 03139 HResizeLinear<short, float, float, 1, 03140 HResizeLinearVec_16s32f>, 03141 VResizeLinear<short, float, float, Cast<float, short>, 03142 VResizeLinearVec_32f16s> >, 03143 0, 03144 resizeGeneric_< 03145 HResizeLinear<float, float, float, 1, 03146 HResizeLinearVec_32f>, 03147 VResizeLinear<float, float, float, Cast<float, float>, 03148 VResizeLinearVec_32f> >, 03149 resizeGeneric_< 03150 HResizeLinear<double, double, float, 1, 03151 HResizeNoVec>, 03152 VResizeLinear<double, double, float, Cast<double, double>, 03153 VResizeNoVec> >, 03154 0 03155 }; 03156 03157 static ResizeFunc cubic_tab[] = 03158 { 03159 resizeGeneric_< 03160 HResizeCubic<uchar, int, short>, 03161 VResizeCubic<uchar, int, short, 03162 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, 03163 VResizeCubicVec_32s8u> >, 03164 0, 03165 resizeGeneric_< 03166 HResizeCubic<ushort, float, float>, 03167 VResizeCubic<ushort, float, float, Cast<float, ushort>, 03168 VResizeCubicVec_32f16u> >, 03169 resizeGeneric_< 03170 HResizeCubic<short, float, float>, 03171 VResizeCubic<short, float, float, Cast<float, short>, 03172 VResizeCubicVec_32f16s> >, 03173 0, 03174 resizeGeneric_< 03175 HResizeCubic<float, float, float>, 03176 VResizeCubic<float, float, float, Cast<float, float>, 03177 VResizeCubicVec_32f> >, 03178 resizeGeneric_< 03179 HResizeCubic<double, double, float>, 03180 VResizeCubic<double, double, float, Cast<double, double>, 03181 VResizeNoVec> >, 03182 0 03183 }; 03184 03185 static ResizeFunc lanczos4_tab[] = 03186 { 03187 resizeGeneric_<HResizeLanczos4<uchar, int, short>, 03188 VResizeLanczos4<uchar, int, short, 03189 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, 03190 VResizeNoVec> >, 03191 0, 03192 resizeGeneric_<HResizeLanczos4<ushort, float, float>, 03193 VResizeLanczos4<ushort, float, float, Cast<float, ushort>, 03194 VResizeLanczos4Vec_32f16u> >, 03195 resizeGeneric_<HResizeLanczos4<short, float, float>, 03196 VResizeLanczos4<short, float, float, Cast<float, short>, 03197 VResizeLanczos4Vec_32f16s> >, 03198 0, 03199 resizeGeneric_<HResizeLanczos4<float, float, float>, 03200 VResizeLanczos4<float, float, float, Cast<float, float>, 03201 VResizeLanczos4Vec_32f> >, 03202 resizeGeneric_<HResizeLanczos4<double, double, float>, 03203 VResizeLanczos4<double, double, float, Cast<double, double>, 03204 VResizeNoVec> >, 03205 0 03206 }; 03207 03208 static ResizeAreaFastFunc areafast_tab[] = 03209 { 03210 resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >, 03211 0, 03212 resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >, 03213 resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >, 03214 0, 03215 resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>, 03216 resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >, 03217 0 03218 }; 03219 03220 static 
ResizeAreaFunc area_tab[] = 03221 { 03222 resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>, 03223 resizeArea_<short, float>, 0, resizeArea_<float, float>, 03224 resizeArea_<double, double>, 0 03225 }; 03226 03227 Size ssize = _src.size(); 03228 03229 CV_Assert( ssize.area() > 0 ); 03230 CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) ); 03231 if( dsize.area() == 0 ) 03232 { 03233 dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x), 03234 saturate_cast<int>(ssize.height*inv_scale_y)); 03235 CV_Assert( dsize.area() > 0 ); 03236 } 03237 else 03238 { 03239 inv_scale_x = (double)dsize.width/ssize.width; 03240 inv_scale_y = (double)dsize.height/ssize.height; 03241 } 03242 03243 03244 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 03245 double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y; 03246 03247 int iscale_x = saturate_cast<int>(scale_x); 03248 int iscale_y = saturate_cast<int>(scale_y); 03249 03250 bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON && 03251 std::abs(scale_y - iscale_y) < DBL_EPSILON; 03252 03253 #ifdef HAVE_OPENCL 03254 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10, 03255 ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation)) 03256 #endif 03257 03258 Mat src = _src.getMat(); 03259 _dst.create(dsize, src.type()); 03260 Mat dst = _dst.getMat(); 03261 03262 if (dsize == ssize) { 03263 // Source and destination are of same size. Use simple copy. 03264 src.copyTo(dst); 03265 return; 03266 } 03267 03268 #ifdef HAVE_TEGRA_OPTIMIZATION 03269 if (tegra::useTegra() && tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation)) 03270 return; 03271 #endif 03272 03273 #ifdef HAVE_IPP 03274 int mode = -1; 03275 if (interpolation == INTER_LINEAR && _src.rows() >= 2 && _src.cols() >= 2) 03276 mode = INTER_LINEAR; 03277 else if (interpolation == INTER_CUBIC && _src.rows() >= 4 && _src.cols() >= 4) 03278 mode = INTER_CUBIC; 03279 03280 const double IPP_RESIZE_EPS = 1e-10; 03281 double ex = fabs((double)dsize.width / _src.cols() - inv_scale_x) / inv_scale_x; 03282 double ey = fabs((double)dsize.height / _src.rows() - inv_scale_y) / inv_scale_y; 03283 #endif 03284 CV_IPP_RUN(IPP_VERSION_X100 >= 710 && ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) && 03285 (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) && 03286 !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U) && 03287 mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && (depth == CV_16U || depth == CV_16S || depth == CV_32F || 03288 (depth == CV_64F && mode == INTER_LINEAR)), ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation)) 03289 03290 03291 if( interpolation == INTER_NEAREST ) 03292 { 03293 resizeNN( src, dst, inv_scale_x, inv_scale_y ); 03294 return; 03295 } 03296 03297 int k, sx, sy, dx, dy; 03298 03299 03300 { 03301 // in case of scale_x && scale_y is equal to 2 03302 // INTER_AREA (fast) also is equal to INTER_LINEAR 03303 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) 03304 interpolation = INTER_AREA; 03305 03306 // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). 
03307 // In other cases it is emulated using some variant of bilinear interpolation 03308 if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 ) 03309 { 03310 if( is_area_fast ) 03311 { 03312 int area = iscale_x*iscale_y; 03313 size_t srcstep = src.step / src.elemSize1(); 03314 AutoBuffer<int> _ofs(area + dsize.width*cn); 03315 int* ofs = _ofs; 03316 int* xofs = ofs + area; 03317 ResizeAreaFastFunc func = areafast_tab[depth]; 03318 CV_Assert( func != 0 ); 03319 03320 for( sy = 0, k = 0; sy < iscale_y; sy++ ) 03321 for( sx = 0; sx < iscale_x; sx++ ) 03322 ofs[k++] = (int)(sy*srcstep + sx*cn); 03323 03324 for( dx = 0; dx < dsize.width; dx++ ) 03325 { 03326 int j = dx * cn; 03327 sx = iscale_x * j; 03328 for( k = 0; k < cn; k++ ) 03329 xofs[j + k] = sx + k; 03330 } 03331 03332 func( src, dst, ofs, xofs, iscale_x, iscale_y ); 03333 return; 03334 } 03335 03336 ResizeAreaFunc func = area_tab[depth]; 03337 CV_Assert( func != 0 && cn <= 4 ); 03338 03339 AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2); 03340 DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2; 03341 03342 int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab); 03343 int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab); 03344 03345 AutoBuffer<int> _tabofs(dsize.height + 1); 03346 int* tabofs = _tabofs; 03347 for( k = 0, dy = 0; k < ytab_size; k++ ) 03348 { 03349 if( k == 0 || ytab[k].di != ytab[k-1].di ) 03350 { 03351 assert( ytab[k].di == dy ); 03352 tabofs[dy++] = k; 03353 } 03354 } 03355 tabofs[dy] = ytab_size; 03356 03357 func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs ); 03358 return; 03359 } 03360 } 03361 03362 int xmin = 0, xmax = dsize.width, width = dsize.width*cn; 03363 bool area_mode = interpolation == INTER_AREA; 03364 bool fixpt = depth == CV_8U; 03365 float fx, fy; 03366 ResizeFunc func=0; 03367 int ksize=0, ksize2; 03368 if( interpolation == INTER_CUBIC ) 03369 ksize = 4, func = cubic_tab[depth]; 03370 else if( interpolation == INTER_LANCZOS4 ) 03371 ksize = 8, func = lanczos4_tab[depth]; 03372 else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA ) 03373 ksize = 2, func = linear_tab[depth]; 03374 else 03375 CV_Error( CV_StsBadArg, "Unknown interpolation method" ); 03376 ksize2 = ksize/2; 03377 03378 CV_Assert( func != 0 ); 03379 03380 AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize)); 03381 int* xofs = (int*)(uchar*)_buffer; 03382 int* yofs = xofs + width; 03383 float* alpha = (float*)(yofs + dsize.height); 03384 short* ialpha = (short*)alpha; 03385 float* beta = alpha + width*ksize; 03386 short* ibeta = ialpha + width*ksize; 03387 float cbuf[MAX_ESIZE]; 03388 03389 for( dx = 0; dx < dsize.width; dx++ ) 03390 { 03391 if( !area_mode ) 03392 { 03393 fx = (float)((dx+0.5)*scale_x - 0.5); 03394 sx = cvFloor(fx); 03395 fx -= sx; 03396 } 03397 else 03398 { 03399 sx = cvFloor(dx*scale_x); 03400 fx = (float)((dx+1) - (sx+1)*inv_scale_x); 03401 fx = fx <= 0 ? 
0.f : fx - cvFloor(fx); 03402 } 03403 03404 if( sx < ksize2-1 ) 03405 { 03406 xmin = dx+1; 03407 if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) 03408 fx = 0, sx = 0; 03409 } 03410 03411 if( sx + ksize2 >= ssize.width ) 03412 { 03413 xmax = std::min( xmax, dx ); 03414 if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) 03415 fx = 0, sx = ssize.width-1; 03416 } 03417 03418 for( k = 0, sx *= cn; k < cn; k++ ) 03419 xofs[dx*cn + k] = sx + k; 03420 03421 if( interpolation == INTER_CUBIC ) 03422 interpolateCubic( fx, cbuf ); 03423 else if( interpolation == INTER_LANCZOS4 ) 03424 interpolateLanczos4( fx, cbuf ); 03425 else 03426 { 03427 cbuf[0] = 1.f - fx; 03428 cbuf[1] = fx; 03429 } 03430 if( fixpt ) 03431 { 03432 for( k = 0; k < ksize; k++ ) 03433 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE); 03434 for( ; k < cn*ksize; k++ ) 03435 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize]; 03436 } 03437 else 03438 { 03439 for( k = 0; k < ksize; k++ ) 03440 alpha[dx*cn*ksize + k] = cbuf[k]; 03441 for( ; k < cn*ksize; k++ ) 03442 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize]; 03443 } 03444 } 03445 03446 for( dy = 0; dy < dsize.height; dy++ ) 03447 { 03448 if( !area_mode ) 03449 { 03450 fy = (float)((dy+0.5)*scale_y - 0.5); 03451 sy = cvFloor(fy); 03452 fy -= sy; 03453 } 03454 else 03455 { 03456 sy = cvFloor(dy*scale_y); 03457 fy = (float)((dy+1) - (sy+1)*inv_scale_y); 03458 fy = fy <= 0 ? 0.f : fy - cvFloor(fy); 03459 } 03460 03461 yofs[dy] = sy; 03462 if( interpolation == INTER_CUBIC ) 03463 interpolateCubic( fy, cbuf ); 03464 else if( interpolation == INTER_LANCZOS4 ) 03465 interpolateLanczos4( fy, cbuf ); 03466 else 03467 { 03468 cbuf[0] = 1.f - fy; 03469 cbuf[1] = fy; 03470 } 03471 03472 if( fixpt ) 03473 { 03474 for( k = 0; k < ksize; k++ ) 03475 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE); 03476 } 03477 else 03478 { 03479 for( k = 0; k < ksize; k++ ) 03480 beta[dy*ksize + k] = cbuf[k]; 03481 } 03482 } 03483 03484 func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs, 03485 fixpt ? 
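/*
 * At this point xofs/yofs hold the leftmost source tap for every destination
 * column and row, and alpha/beta (ialpha/ibeta for 8-bit images) hold the ksize
 * interpolation weights per column/row: 2 taps for linear/area, 4 for cubic,
 * 8 for Lanczos. For CV_8U the weights are stored as shorts scaled by
 * INTER_RESIZE_COEF_SCALE (1 << 11, defined near the top of this file), and
 * xmin/xmax bound the columns whose full kernel stays inside the row. The call
 * being assembled here hands everything to the depth-specific separable resize
 * worker selected above. A minimal usage sketch from the caller's side (img is
 * a hypothetical input Mat, not something defined in this file):
 *
 *     cv::Mat half, big;
 *     cv::resize(img, half, cv::Size(), 0.5, 0.5);                  // INTER_LINEAR; the exact 2x2 case is promoted to the fast INTER_AREA path above
 *     cv::resize(img, big,  cv::Size(), 2.0, 2.0, cv::INTER_CUBIC); // uses the 4-tap tables built here
 */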
(void*)ibeta : (void*)beta, xmin, xmax, ksize ); 03486 } 03487 03488 03489 /****************************************************************************************\ 03490 * General warping (affine, perspective, remap) * 03491 \****************************************************************************************/ 03492 03493 namespace cv 03494 { 03495 03496 template<typename T> 03497 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy, 03498 int borderType, const Scalar& _borderValue ) 03499 { 03500 Size ssize = _src.size(), dsize = _dst.size(); 03501 int cn = _src.channels(); 03502 const T* S0 = _src.ptr<T>(); 03503 size_t sstep = _src.step/sizeof(S0[0]); 03504 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]), 03505 saturate_cast<T>(_borderValue[1]), 03506 saturate_cast<T>(_borderValue[2]), 03507 saturate_cast<T>(_borderValue[3])); 03508 int dx, dy; 03509 03510 unsigned width1 = ssize.width, height1 = ssize.height; 03511 03512 if( _dst.isContinuous() && _xy.isContinuous() ) 03513 { 03514 dsize.width *= dsize.height; 03515 dsize.height = 1; 03516 } 03517 03518 for( dy = 0; dy < dsize.height; dy++ ) 03519 { 03520 T* D = _dst.ptr<T>(dy); 03521 const short* XY = _xy.ptr<short>(dy); 03522 03523 if( cn == 1 ) 03524 { 03525 for( dx = 0; dx < dsize.width; dx++ ) 03526 { 03527 int sx = XY[dx*2], sy = XY[dx*2+1]; 03528 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 03529 D[dx] = S0[sy*sstep + sx]; 03530 else 03531 { 03532 if( borderType == BORDER_REPLICATE ) 03533 { 03534 sx = clip(sx, 0, ssize.width); 03535 sy = clip(sy, 0, ssize.height); 03536 D[dx] = S0[sy*sstep + sx]; 03537 } 03538 else if( borderType == BORDER_CONSTANT ) 03539 D[dx] = cval[0]; 03540 else if( borderType != BORDER_TRANSPARENT ) 03541 { 03542 sx = borderInterpolate(sx, ssize.width, borderType); 03543 sy = borderInterpolate(sy, ssize.height, borderType); 03544 D[dx] = S0[sy*sstep + sx]; 03545 } 03546 } 03547 } 03548 } 03549 else 03550 { 03551 for( dx = 0; dx < dsize.width; dx++, D += cn ) 03552 { 03553 int sx = XY[dx*2], sy = XY[dx*2+1], k; 03554 const T *S; 03555 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 03556 { 03557 if( cn == 3 ) 03558 { 03559 S = S0 + sy*sstep + sx*3; 03560 D[0] = S[0], D[1] = S[1], D[2] = S[2]; 03561 } 03562 else if( cn == 4 ) 03563 { 03564 S = S0 + sy*sstep + sx*4; 03565 D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3]; 03566 } 03567 else 03568 { 03569 S = S0 + sy*sstep + sx*cn; 03570 for( k = 0; k < cn; k++ ) 03571 D[k] = S[k]; 03572 } 03573 } 03574 else if( borderType != BORDER_TRANSPARENT ) 03575 { 03576 if( borderType == BORDER_REPLICATE ) 03577 { 03578 sx = clip(sx, 0, ssize.width); 03579 sy = clip(sy, 0, ssize.height); 03580 S = S0 + sy*sstep + sx*cn; 03581 } 03582 else if( borderType == BORDER_CONSTANT ) 03583 S = &cval[0]; 03584 else 03585 { 03586 sx = borderInterpolate(sx, ssize.width, borderType); 03587 sy = borderInterpolate(sy, ssize.height, borderType); 03588 S = S0 + sy*sstep + sx*cn; 03589 } 03590 for( k = 0; k < cn; k++ ) 03591 D[k] = S[k]; 03592 } 03593 } 03594 } 03595 } 03596 } 03597 03598 03599 struct RemapNoVec 03600 { 03601 int operator()( const Mat&, void*, const short*, const ushort*, 03602 const void*, int ) const { return 0; } 03603 }; 03604 03605 #if CV_SSE2 03606 03607 struct RemapVec_8u 03608 { 03609 int operator()( const Mat& _src, void* _dst, const short* XY, 03610 const ushort* FXY, const void* _wtab, int width ) const 03611 { 03612 int cn = _src.channels(), x = 0, sstep = (int)_src.step; 03613 03614 if( (cn != 1 && cn != 3 && cn != 4) 
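/*
 * remapNearest above handles INTER_NEAREST: each destination pixel copies the
 * source pixel addressed by the integer map _xy, with out-of-range coordinates
 * resolved by the border mode (BORDER_CONSTANT writes the border value,
 * BORDER_REPLICATE clamps, BORDER_TRANSPARENT leaves the destination untouched,
 * everything else goes through borderInterpolate). RemapVec_8u is the SSE2
 * helper used by the bilinear path below; like RemapNoVec it returns the number
 * of pixels it actually processed, and it bails out (returns 0) for channel
 * counts other than 1/3/4, when SSE2 is unavailable, or when the source step
 * exceeds 0x8000 bytes, as the condition being built here shows.
 */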
|| !checkHardwareSupport(CV_CPU_SSE2) || 03615 sstep > 0x8000 ) 03616 return 0; 03617 03618 const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1); 03619 const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0]; 03620 uchar* D = (uchar*)_dst; 03621 __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2); 03622 __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16)); 03623 __m128i z = _mm_setzero_si128(); 03624 int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; 03625 03626 if( cn == 1 ) 03627 { 03628 for( ; x <= width - 8; x += 8 ) 03629 { 03630 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); 03631 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8)); 03632 __m128i v0, v1, v2, v3, a0, a1, b0, b1; 03633 unsigned i0, i1; 03634 03635 xy0 = _mm_madd_epi16( xy0, xy2ofs ); 03636 xy1 = _mm_madd_epi16( xy1, xy2ofs ); 03637 _mm_store_si128( (__m128i*)iofs0, xy0 ); 03638 _mm_store_si128( (__m128i*)iofs1, xy1 ); 03639 03640 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16); 03641 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16); 03642 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 03643 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16); 03644 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16); 03645 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 03646 v0 = _mm_unpacklo_epi8(v0, z); 03647 v1 = _mm_unpacklo_epi8(v1, z); 03648 03649 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)), 03650 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4))); 03651 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)), 03652 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4))); 03653 b0 = _mm_unpacklo_epi64(a0, a1); 03654 b1 = _mm_unpackhi_epi64(a0, a1); 03655 v0 = _mm_madd_epi16(v0, b0); 03656 v1 = _mm_madd_epi16(v1, b1); 03657 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta); 03658 03659 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16); 03660 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16); 03661 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 03662 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16); 03663 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16); 03664 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 03665 v2 = _mm_unpacklo_epi8(v2, z); 03666 v3 = _mm_unpacklo_epi8(v3, z); 03667 03668 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)), 03669 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4))); 03670 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)), 03671 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4))); 03672 b0 = _mm_unpacklo_epi64(a0, a1); 03673 b1 = _mm_unpackhi_epi64(a0, a1); 03674 v2 = _mm_madd_epi16(v2, b0); 03675 v3 = _mm_madd_epi16(v3, b1); 03676 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta); 03677 03678 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS); 03679 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS); 03680 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z); 03681 _mm_storel_epi64( (__m128i*)(D + x), v0 ); 03682 } 03683 } 03684 else if( cn == 3 ) 03685 { 03686 for( ; x <= width - 5; x += 4, D += 12 ) 03687 { 03688 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); 03689 __m128i u0, v0, u1, v1; 03690 03691 xy0 = _mm_madd_epi16( xy0, xy2ofs ); 03692 _mm_store_si128( (__m128i*)iofs0, xy0 ); 03693 const __m128i *w0, *w1; 03694 w0 = (const __m128i*)(wtab + FXY[x]*16); 03695 w1 = (const 
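/*
 * The single-channel loop above computes, for eight destination pixels at a
 * time, the usual fixed-point bilinear sum
 *     dst = (v00*w0 + v01*w1 + v10*w2 + v11*w3 + INTER_REMAP_COEF_SCALE/2) >> INTER_REMAP_COEF_BITS
 * with the four weights fetched from the precomputed table via the fractional
 * index FXY[x]. xy2ofs packs (cn, sstep) so that a single _mm_madd_epi16 turns
 * each (x, y) pair into a byte offset into the 8-bit source. The 3- and
 * 4-channel branches that follow use the same scheme, four pixels per
 * iteration.
 */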
__m128i*)(wtab + FXY[x+1]*16); 03696 03697 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), 03698 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3))); 03699 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), 03700 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3))); 03701 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), 03702 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3))); 03703 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), 03704 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3))); 03705 u0 = _mm_unpacklo_epi8(u0, z); 03706 v0 = _mm_unpacklo_epi8(v0, z); 03707 u1 = _mm_unpacklo_epi8(u1, z); 03708 v1 = _mm_unpacklo_epi8(v1, z); 03709 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 03710 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 03711 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 03712 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 03713 u0 = _mm_slli_si128(u0, 4); 03714 u0 = _mm_packs_epi32(u0, u1); 03715 u0 = _mm_packus_epi16(u0, u0); 03716 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1)); 03717 03718 w0 = (const __m128i*)(wtab + FXY[x+2]*16); 03719 w1 = (const __m128i*)(wtab + FXY[x+3]*16); 03720 03721 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), 03722 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3))); 03723 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), 03724 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3))); 03725 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), 03726 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3))); 03727 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), 03728 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3))); 03729 u0 = _mm_unpacklo_epi8(u0, z); 03730 v0 = _mm_unpacklo_epi8(v0, z); 03731 u1 = _mm_unpacklo_epi8(u1, z); 03732 v1 = _mm_unpacklo_epi8(v1, z); 03733 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 03734 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 03735 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 03736 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 03737 u0 = _mm_slli_si128(u0, 4); 03738 u0 = _mm_packs_epi32(u0, u1); 03739 u0 = _mm_packus_epi16(u0, u0); 03740 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1)); 03741 } 03742 } 03743 else if( cn == 4 ) 03744 { 03745 for( ; x <= width - 4; x += 4, D += 16 ) 03746 { 03747 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); 03748 __m128i u0, v0, u1, v1; 03749 03750 xy0 = _mm_madd_epi16( xy0, xy2ofs ); 03751 _mm_store_si128( (__m128i*)iofs0, xy0 ); 03752 const __m128i *w0, *w1; 03753 w0 = (const __m128i*)(wtab + FXY[x]*16); 03754 w1 = (const __m128i*)(wtab + FXY[x+1]*16); 03755 03756 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), 03757 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4))); 03758 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), 03759 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4))); 03760 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), 03761 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4))); 03762 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), 03763 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4))); 03764 u0 = _mm_unpacklo_epi8(u0, z); 03765 v0 = _mm_unpacklo_epi8(v0, z); 03766 u1 = _mm_unpacklo_epi8(u1, z); 03767 v1 = _mm_unpacklo_epi8(v1, z); 03768 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), 
_mm_madd_epi16(v0, w0[1])); 03769 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 03770 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 03771 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 03772 u0 = _mm_packs_epi32(u0, u1); 03773 u0 = _mm_packus_epi16(u0, u0); 03774 _mm_storel_epi64((__m128i*)D, u0); 03775 03776 w0 = (const __m128i*)(wtab + FXY[x+2]*16); 03777 w1 = (const __m128i*)(wtab + FXY[x+3]*16); 03778 03779 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), 03780 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4))); 03781 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), 03782 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4))); 03783 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), 03784 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4))); 03785 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), 03786 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4))); 03787 u0 = _mm_unpacklo_epi8(u0, z); 03788 v0 = _mm_unpacklo_epi8(v0, z); 03789 u1 = _mm_unpacklo_epi8(u1, z); 03790 v1 = _mm_unpacklo_epi8(v1, z); 03791 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 03792 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 03793 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 03794 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 03795 u0 = _mm_packs_epi32(u0, u1); 03796 u0 = _mm_packus_epi16(u0, u0); 03797 _mm_storel_epi64((__m128i*)(D + 8), u0); 03798 } 03799 } 03800 03801 return x; 03802 } 03803 }; 03804 03805 #else 03806 03807 typedef RemapNoVec RemapVec_8u; 03808 03809 #endif 03810 03811 03812 template<class CastOp, class VecOp, typename AT> 03813 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy, 03814 const Mat& _fxy, const void* _wtab, 03815 int borderType, const Scalar& _borderValue ) 03816 { 03817 typedef typename CastOp::rtype T; 03818 typedef typename CastOp::type1 WT; 03819 Size ssize = _src.size(), dsize = _dst.size(); 03820 int k, cn = _src.channels(); 03821 const AT* wtab = (const AT*)_wtab; 03822 const T* S0 = _src.ptr<T>(); 03823 size_t sstep = _src.step/sizeof(S0[0]); 03824 T cval[CV_CN_MAX]; 03825 int dx, dy; 03826 CastOp castOp; 03827 VecOp vecOp; 03828 03829 for( k = 0; k < cn; k++ ) 03830 cval[k] = saturate_cast<T>(_borderValue[k & 3]); 03831 03832 unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0); 03833 CV_Assert( ssize.area() > 0 ); 03834 #if CV_SSE2 03835 if( _src.type() == CV_8UC3 ) 03836 width1 = std::max(ssize.width-2, 0); 03837 #endif 03838 03839 for( dy = 0; dy < dsize.height; dy++ ) 03840 { 03841 T* D = _dst.ptr<T>(dy); 03842 const short* XY = _xy.ptr<short>(dy); 03843 const ushort* FXY = _fxy.ptr<ushort>(dy); 03844 int X0 = 0; 03845 bool prevInlier = false; 03846 03847 for( dx = 0; dx <= dsize.width; dx++ ) 03848 { 03849 bool curInlier = dx < dsize.width ? 
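/*
 * remapBilinear scans each destination row and splits it into maximal runs of
 * "inlier" pixels, i.e. pixels whose whole 2x2 source neighbourhood lies inside
 * the image ((unsigned)sx < width-1 and (unsigned)sy < height-1; for CV_8UC3
 * with SSE2 the width margin is one pixel larger, so the 4-byte vector loads in
 * RemapVec_8u stay inside the row). Inlier runs first go through vecOp and any
 * remainder is finished with the scalar per-channel code; runs containing an
 * out-of-range coordinate fall through to the border-handling branch further
 * down.
 */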
03850 (unsigned)XY[dx*2] < width1 && 03851 (unsigned)XY[dx*2+1] < height1 : !prevInlier; 03852 if( curInlier == prevInlier ) 03853 continue; 03854 03855 int X1 = dx; 03856 dx = X0; 03857 X0 = X1; 03858 prevInlier = curInlier; 03859 03860 if( !curInlier ) 03861 { 03862 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx ); 03863 D += len*cn; 03864 dx += len; 03865 03866 if( cn == 1 ) 03867 { 03868 for( ; dx < X1; dx++, D++ ) 03869 { 03870 int sx = XY[dx*2], sy = XY[dx*2+1]; 03871 const AT* w = wtab + FXY[dx]*4; 03872 const T* S = S0 + sy*sstep + sx; 03873 *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3])); 03874 } 03875 } 03876 else if( cn == 2 ) 03877 for( ; dx < X1; dx++, D += 2 ) 03878 { 03879 int sx = XY[dx*2], sy = XY[dx*2+1]; 03880 const AT* w = wtab + FXY[dx]*4; 03881 const T* S = S0 + sy*sstep + sx*2; 03882 WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3]; 03883 WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3]; 03884 D[0] = castOp(t0); D[1] = castOp(t1); 03885 } 03886 else if( cn == 3 ) 03887 for( ; dx < X1; dx++, D += 3 ) 03888 { 03889 int sx = XY[dx*2], sy = XY[dx*2+1]; 03890 const AT* w = wtab + FXY[dx]*4; 03891 const T* S = S0 + sy*sstep + sx*3; 03892 WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3]; 03893 WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3]; 03894 WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3]; 03895 D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2); 03896 } 03897 else if( cn == 4 ) 03898 for( ; dx < X1; dx++, D += 4 ) 03899 { 03900 int sx = XY[dx*2], sy = XY[dx*2+1]; 03901 const AT* w = wtab + FXY[dx]*4; 03902 const T* S = S0 + sy*sstep + sx*4; 03903 WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3]; 03904 WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3]; 03905 D[0] = castOp(t0); D[1] = castOp(t1); 03906 t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3]; 03907 t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3]; 03908 D[2] = castOp(t0); D[3] = castOp(t1); 03909 } 03910 else 03911 for( ; dx < X1; dx++, D += cn ) 03912 { 03913 int sx = XY[dx*2], sy = XY[dx*2+1]; 03914 const AT* w = wtab + FXY[dx]*4; 03915 const T* S = S0 + sy*sstep + sx*cn; 03916 for( k = 0; k < cn; k++ ) 03917 { 03918 WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3]; 03919 D[k] = castOp(t0); 03920 } 03921 } 03922 } 03923 else 03924 { 03925 if( borderType == BORDER_TRANSPARENT && cn != 3 ) 03926 { 03927 D += (X1 - dx)*cn; 03928 dx = X1; 03929 continue; 03930 } 03931 03932 if( cn == 1 ) 03933 for( ; dx < X1; dx++, D++ ) 03934 { 03935 int sx = XY[dx*2], sy = XY[dx*2+1]; 03936 if( borderType == BORDER_CONSTANT && 03937 (sx >= ssize.width || sx+1 < 0 || 03938 sy >= ssize.height || sy+1 < 0) ) 03939 { 03940 D[0] = cval[0]; 03941 } 03942 else 03943 { 03944 int sx0, sx1, sy0, sy1; 03945 T v0, v1, v2, v3; 03946 const AT* w = wtab + FXY[dx]*4; 03947 if( borderType == BORDER_REPLICATE ) 03948 { 03949 sx0 = clip(sx, 0, ssize.width); 03950 sx1 = clip(sx+1, 0, ssize.width); 03951 sy0 = clip(sy, 0, ssize.height); 03952 sy1 = clip(sy+1, 0, ssize.height); 03953 v0 = S0[sy0*sstep + sx0]; 03954 v1 = S0[sy0*sstep + sx1]; 03955 v2 = S0[sy1*sstep + sx0]; 03956 v3 = S0[sy1*sstep + sx1]; 03957 } 03958 else 03959 { 03960 sx0 = borderInterpolate(sx, ssize.width, borderType); 03961 sx1 = borderInterpolate(sx+1, ssize.width, borderType); 03962 sy0 = borderInterpolate(sy, ssize.height, borderType); 03963 sy1 = 
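/*
 * Border handling for the non-inlier runs: BORDER_TRANSPARENT leaves the
 * destination untouched (the whole span is skipped at once, except for
 * 3-channel images, which are checked per pixel); BORDER_CONSTANT fills a pixel
 * with cval when its 2x2 window lies completely outside the source;
 * BORDER_REPLICATE clamps each of the four sample coordinates; any other mode
 * remaps them with borderInterpolate, which returns a negative index for
 * BORDER_CONSTANT samples that fall outside, in which case cval is substituted
 * for that tap. The pixel value is then the same weighted sum
 * v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3] used on the fast path.
 */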
borderInterpolate(sy+1, ssize.height, borderType); 03964 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0]; 03965 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0]; 03966 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0]; 03967 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0]; 03968 } 03969 D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3])); 03970 } 03971 } 03972 else 03973 for( ; dx < X1; dx++, D += cn ) 03974 { 03975 int sx = XY[dx*2], sy = XY[dx*2+1]; 03976 if( borderType == BORDER_CONSTANT && 03977 (sx >= ssize.width || sx+1 < 0 || 03978 sy >= ssize.height || sy+1 < 0) ) 03979 { 03980 for( k = 0; k < cn; k++ ) 03981 D[k] = cval[k]; 03982 } 03983 else 03984 { 03985 int sx0, sx1, sy0, sy1; 03986 const T *v0, *v1, *v2, *v3; 03987 const AT* w = wtab + FXY[dx]*4; 03988 if( borderType == BORDER_REPLICATE ) 03989 { 03990 sx0 = clip(sx, 0, ssize.width); 03991 sx1 = clip(sx+1, 0, ssize.width); 03992 sy0 = clip(sy, 0, ssize.height); 03993 sy1 = clip(sy+1, 0, ssize.height); 03994 v0 = S0 + sy0*sstep + sx0*cn; 03995 v1 = S0 + sy0*sstep + sx1*cn; 03996 v2 = S0 + sy1*sstep + sx0*cn; 03997 v3 = S0 + sy1*sstep + sx1*cn; 03998 } 03999 else if( borderType == BORDER_TRANSPARENT && 04000 ((unsigned)sx >= (unsigned)(ssize.width-1) || 04001 (unsigned)sy >= (unsigned)(ssize.height-1))) 04002 continue; 04003 else 04004 { 04005 sx0 = borderInterpolate(sx, ssize.width, borderType); 04006 sx1 = borderInterpolate(sx+1, ssize.width, borderType); 04007 sy0 = borderInterpolate(sy, ssize.height, borderType); 04008 sy1 = borderInterpolate(sy+1, ssize.height, borderType); 04009 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0]; 04010 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0]; 04011 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0]; 04012 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0]; 04013 } 04014 for( k = 0; k < cn; k++ ) 04015 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3])); 04016 } 04017 } 04018 } 04019 } 04020 } 04021 } 04022 04023 04024 template<class CastOp, typename AT, int ONE> 04025 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy, 04026 const Mat& _fxy, const void* _wtab, 04027 int borderType, const Scalar& _borderValue ) 04028 { 04029 typedef typename CastOp::rtype T; 04030 typedef typename CastOp::type1 WT; 04031 Size ssize = _src.size(), dsize = _dst.size(); 04032 int cn = _src.channels(); 04033 const AT* wtab = (const AT*)_wtab; 04034 const T* S0 = _src.ptr<T>(); 04035 size_t sstep = _src.step/sizeof(S0[0]); 04036 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]), 04037 saturate_cast<T>(_borderValue[1]), 04038 saturate_cast<T>(_borderValue[2]), 04039 saturate_cast<T>(_borderValue[3])); 04040 int dx, dy; 04041 CastOp castOp; 04042 int borderType1 = borderType != BORDER_TRANSPARENT ? 
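/*
 * remapBicubic works on a 4x4 neighbourhood: sx/sy point at the top-left tap
 * (the mapped coordinate minus 1) and each fractional index FXY selects 16
 * precomputed weights. When the window is only partially outside, the taps are
 * remapped with borderType1 (BORDER_TRANSPARENT is replaced by
 * BORDER_REFLECT_101 here so partially visible windows still get usable
 * coordinates). The "sum = cv*ONE" trick below makes missing BORDER_CONSTANT
 * taps implicitly contribute the border value: only in-range taps add their
 * difference from cval, and the weights sum to ONE.
 */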
borderType : BORDER_REFLECT_101; 04043 04044 unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0); 04045 04046 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() ) 04047 { 04048 dsize.width *= dsize.height; 04049 dsize.height = 1; 04050 } 04051 04052 for( dy = 0; dy < dsize.height; dy++ ) 04053 { 04054 T* D = _dst.ptr<T>(dy); 04055 const short* XY = _xy.ptr<short>(dy); 04056 const ushort* FXY = _fxy.ptr<ushort>(dy); 04057 04058 for( dx = 0; dx < dsize.width; dx++, D += cn ) 04059 { 04060 int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1; 04061 const AT* w = wtab + FXY[dx]*16; 04062 int i, k; 04063 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 04064 { 04065 const T* S = S0 + sy*sstep + sx*cn; 04066 for( k = 0; k < cn; k++ ) 04067 { 04068 WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3]; 04069 S += sstep; 04070 sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7]; 04071 S += sstep; 04072 sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11]; 04073 S += sstep; 04074 sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15]; 04075 S += 1 - sstep*3; 04076 D[k] = castOp(sum); 04077 } 04078 } 04079 else 04080 { 04081 int x[4], y[4]; 04082 if( borderType == BORDER_TRANSPARENT && 04083 ((unsigned)(sx+1) >= (unsigned)ssize.width || 04084 (unsigned)(sy+1) >= (unsigned)ssize.height) ) 04085 continue; 04086 04087 if( borderType1 == BORDER_CONSTANT && 04088 (sx >= ssize.width || sx+4 <= 0 || 04089 sy >= ssize.height || sy+4 <= 0)) 04090 { 04091 for( k = 0; k < cn; k++ ) 04092 D[k] = cval[k]; 04093 continue; 04094 } 04095 04096 for( i = 0; i < 4; i++ ) 04097 { 04098 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn; 04099 y[i] = borderInterpolate(sy + i, ssize.height, borderType1); 04100 } 04101 04102 for( k = 0; k < cn; k++, S0++, w -= 16 ) 04103 { 04104 WT cv = cval[k], sum = cv*ONE; 04105 for( i = 0; i < 4; i++, w += 4 ) 04106 { 04107 int yi = y[i]; 04108 const T* S = S0 + yi*sstep; 04109 if( yi < 0 ) 04110 continue; 04111 if( x[0] >= 0 ) 04112 sum += (S[x[0]] - cv)*w[0]; 04113 if( x[1] >= 0 ) 04114 sum += (S[x[1]] - cv)*w[1]; 04115 if( x[2] >= 0 ) 04116 sum += (S[x[2]] - cv)*w[2]; 04117 if( x[3] >= 0 ) 04118 sum += (S[x[3]] - cv)*w[3]; 04119 } 04120 D[k] = castOp(sum); 04121 } 04122 S0 -= cn; 04123 } 04124 } 04125 } 04126 } 04127 04128 04129 template<class CastOp, typename AT, int ONE> 04130 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy, 04131 const Mat& _fxy, const void* _wtab, 04132 int borderType, const Scalar& _borderValue ) 04133 { 04134 typedef typename CastOp::rtype T; 04135 typedef typename CastOp::type1 WT; 04136 Size ssize = _src.size(), dsize = _dst.size(); 04137 int cn = _src.channels(); 04138 const AT* wtab = (const AT*)_wtab; 04139 const T* S0 = _src.ptr<T>(); 04140 size_t sstep = _src.step/sizeof(S0[0]); 04141 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]), 04142 saturate_cast<T>(_borderValue[1]), 04143 saturate_cast<T>(_borderValue[2]), 04144 saturate_cast<T>(_borderValue[3])); 04145 int dx, dy; 04146 CastOp castOp; 04147 int borderType1 = borderType != BORDER_TRANSPARENT ? 
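/*
 * remapLanczos4 is the same construction scaled up: an 8x8 neighbourhood whose
 * top-left tap is the mapped coordinate minus 3, 64 precomputed weights per
 * fractional index, and the identical border strategy (REFLECT_101 substituted
 * for TRANSPARENT when resolving partially visible windows).
 */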
borderType : BORDER_REFLECT_101; 04148 04149 unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0); 04150 04151 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() ) 04152 { 04153 dsize.width *= dsize.height; 04154 dsize.height = 1; 04155 } 04156 04157 for( dy = 0; dy < dsize.height; dy++ ) 04158 { 04159 T* D = _dst.ptr<T>(dy); 04160 const short* XY = _xy.ptr<short>(dy); 04161 const ushort* FXY = _fxy.ptr<ushort>(dy); 04162 04163 for( dx = 0; dx < dsize.width; dx++, D += cn ) 04164 { 04165 int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3; 04166 const AT* w = wtab + FXY[dx]*64; 04167 const T* S = S0 + sy*sstep + sx*cn; 04168 int i, k; 04169 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 04170 { 04171 for( k = 0; k < cn; k++ ) 04172 { 04173 WT sum = 0; 04174 for( int r = 0; r < 8; r++, S += sstep, w += 8 ) 04175 sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] + 04176 S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7]; 04177 w -= 64; 04178 S -= sstep*8 - 1; 04179 D[k] = castOp(sum); 04180 } 04181 } 04182 else 04183 { 04184 int x[8], y[8]; 04185 if( borderType == BORDER_TRANSPARENT && 04186 ((unsigned)(sx+3) >= (unsigned)ssize.width || 04187 (unsigned)(sy+3) >= (unsigned)ssize.height) ) 04188 continue; 04189 04190 if( borderType1 == BORDER_CONSTANT && 04191 (sx >= ssize.width || sx+8 <= 0 || 04192 sy >= ssize.height || sy+8 <= 0)) 04193 { 04194 for( k = 0; k < cn; k++ ) 04195 D[k] = cval[k]; 04196 continue; 04197 } 04198 04199 for( i = 0; i < 8; i++ ) 04200 { 04201 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn; 04202 y[i] = borderInterpolate(sy + i, ssize.height, borderType1); 04203 } 04204 04205 for( k = 0; k < cn; k++, S0++, w -= 64 ) 04206 { 04207 WT cv = cval[k], sum = cv*ONE; 04208 for( i = 0; i < 8; i++, w += 8 ) 04209 { 04210 int yi = y[i]; 04211 const T* S1 = S0 + yi*sstep; 04212 if( yi < 0 ) 04213 continue; 04214 if( x[0] >= 0 ) 04215 sum += (S1[x[0]] - cv)*w[0]; 04216 if( x[1] >= 0 ) 04217 sum += (S1[x[1]] - cv)*w[1]; 04218 if( x[2] >= 0 ) 04219 sum += (S1[x[2]] - cv)*w[2]; 04220 if( x[3] >= 0 ) 04221 sum += (S1[x[3]] - cv)*w[3]; 04222 if( x[4] >= 0 ) 04223 sum += (S1[x[4]] - cv)*w[4]; 04224 if( x[5] >= 0 ) 04225 sum += (S1[x[5]] - cv)*w[5]; 04226 if( x[6] >= 0 ) 04227 sum += (S1[x[6]] - cv)*w[6]; 04228 if( x[7] >= 0 ) 04229 sum += (S1[x[7]] - cv)*w[7]; 04230 } 04231 D[k] = castOp(sum); 04232 } 04233 S0 -= cn; 04234 } 04235 } 04236 } 04237 } 04238 04239 04240 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy, 04241 int borderType, const Scalar& _borderValue ); 04242 04243 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy, 04244 const Mat& _fxy, const void* _wtab, 04245 int borderType, const Scalar& _borderValue); 04246 04247 class RemapInvoker : 04248 public ParallelLoopBody 04249 { 04250 public: 04251 RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1, 04252 const Mat *_m2, int _borderType, const Scalar &_borderValue, 04253 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) : 04254 ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2), 04255 borderType(_borderType), borderValue(_borderValue), 04256 planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab) 04257 { 04258 } 04259 04260 virtual void operator() (const Range& range) const 04261 { 04262 int x, y, x1, y1; 04263 const int buf_size = 1 << 14; 04264 int brows0 = std::min(128, dst->rows), map_depth = m1->depth(); 04265 int bcols0 = std::min(buf_size/brows0, 
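/*
 * RemapInvoker processes its row range in tiles so the intermediate integer map
 * (CV_16SC2) and the fractional-index buffer (CV_16UC1) stay small: roughly
 * 1 << 14 map entries per tile; the height is first capped at 128 rows and then
 * re-balanced against the resulting width in the statement being computed
 * here. For instance, on an image with at least 128 rows and 128 columns this
 * works out to 128 x 128 tiles (16384/128 = 128). Within each tile the incoming
 * maps, whatever their format, are converted once into that fixed-point form
 * and then handed to the nearest-neighbour or interpolating worker.
 */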
dst->cols); 04266 brows0 = std::min(buf_size/bcols0, dst->rows); 04267 #if CV_SSE2 04268 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); 04269 #endif 04270 04271 Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa; 04272 if( !nnfunc ) 04273 _bufa.create(brows0, bcols0, CV_16UC1); 04274 04275 for( y = range.start; y < range.end; y += brows0 ) 04276 { 04277 for( x = 0; x < dst->cols; x += bcols0 ) 04278 { 04279 int brows = std::min(brows0, range.end - y); 04280 int bcols = std::min(bcols0, dst->cols - x); 04281 Mat dpart(*dst, Rect(x, y, bcols, brows)); 04282 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows)); 04283 04284 if( nnfunc ) 04285 { 04286 if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format 04287 bufxy = (*m1)(Rect(x, y, bcols, brows)); 04288 else if( map_depth != CV_32F ) 04289 { 04290 for( y1 = 0; y1 < brows; y1++ ) 04291 { 04292 short* XY = bufxy.ptr<short>(y1); 04293 const short* sXY = m1->ptr<short>(y+y1) + x*2; 04294 const ushort* sA = m2->ptr<ushort>(y+y1) + x; 04295 04296 for( x1 = 0; x1 < bcols; x1++ ) 04297 { 04298 int a = sA[x1] & (INTER_TAB_SIZE2-1); 04299 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0]; 04300 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1]; 04301 } 04302 } 04303 } 04304 else if( !planar_input ) 04305 (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth()); 04306 else 04307 { 04308 for( y1 = 0; y1 < brows; y1++ ) 04309 { 04310 short* XY = bufxy.ptr<short>(y1); 04311 const float* sX = m1->ptr<float>(y+y1) + x; 04312 const float* sY = m2->ptr<float>(y+y1) + x; 04313 x1 = 0; 04314 04315 #if CV_SSE2 04316 if( useSIMD ) 04317 { 04318 for( ; x1 <= bcols - 8; x1 += 8 ) 04319 { 04320 __m128 fx0 = _mm_loadu_ps(sX + x1); 04321 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); 04322 __m128 fy0 = _mm_loadu_ps(sY + x1); 04323 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); 04324 __m128i ix0 = _mm_cvtps_epi32(fx0); 04325 __m128i ix1 = _mm_cvtps_epi32(fx1); 04326 __m128i iy0 = _mm_cvtps_epi32(fy0); 04327 __m128i iy1 = _mm_cvtps_epi32(fy1); 04328 ix0 = _mm_packs_epi32(ix0, ix1); 04329 iy0 = _mm_packs_epi32(iy0, iy1); 04330 ix1 = _mm_unpacklo_epi16(ix0, iy0); 04331 iy1 = _mm_unpackhi_epi16(ix0, iy0); 04332 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); 04333 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); 04334 } 04335 } 04336 #endif 04337 04338 for( ; x1 < bcols; x1++ ) 04339 { 04340 XY[x1*2] = saturate_cast<short>(sX[x1]); 04341 XY[x1*2+1] = saturate_cast<short>(sY[x1]); 04342 } 04343 } 04344 } 04345 nnfunc( *src, dpart, bufxy, borderType, borderValue ); 04346 continue; 04347 } 04348 04349 Mat bufa(_bufa, Rect(0, 0, bcols, brows)); 04350 for( y1 = 0; y1 < brows; y1++ ) 04351 { 04352 short* XY = bufxy.ptr<short>(y1); 04353 ushort* A = bufa.ptr<ushort>(y1); 04354 04355 if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) 04356 { 04357 bufxy = (*m1)(Rect(x, y, bcols, brows)); 04358 04359 const ushort* sA = m2->ptr<ushort>(y+y1) + x; 04360 x1 = 0; 04361 04362 #if CV_NEON 04363 uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1); 04364 for ( ; x1 <= bcols - 8; x1 += 8) 04365 vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); 04366 #elif CV_SSE2 04367 __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1); 04368 for ( ; x1 <= bcols - 8; x1 += 8) 04369 _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale)); 04370 #endif 04371 04372 for( ; x1 < bcols; x1++ ) 04373 A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); 04374 } 04375 else if( planar_input ) 04376 { 04377 const float* sX = 
m1->ptr<float>(y+y1) + x; 04378 const float* sY = m2->ptr<float>(y+y1) + x; 04379 04380 x1 = 0; 04381 #if CV_SSE2 04382 if( useSIMD ) 04383 { 04384 __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE); 04385 __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1); 04386 for( ; x1 <= bcols - 8; x1 += 8 ) 04387 { 04388 __m128 fx0 = _mm_loadu_ps(sX + x1); 04389 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); 04390 __m128 fy0 = _mm_loadu_ps(sY + x1); 04391 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); 04392 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale)); 04393 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale)); 04394 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale)); 04395 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale)); 04396 __m128i mx0 = _mm_and_si128(ix0, mask); 04397 __m128i mx1 = _mm_and_si128(ix1, mask); 04398 __m128i my0 = _mm_and_si128(iy0, mask); 04399 __m128i my1 = _mm_and_si128(iy1, mask); 04400 mx0 = _mm_packs_epi32(mx0, mx1); 04401 my0 = _mm_packs_epi32(my0, my1); 04402 my0 = _mm_slli_epi16(my0, INTER_BITS); 04403 mx0 = _mm_or_si128(mx0, my0); 04404 _mm_storeu_si128((__m128i*)(A + x1), mx0); 04405 ix0 = _mm_srai_epi32(ix0, INTER_BITS); 04406 ix1 = _mm_srai_epi32(ix1, INTER_BITS); 04407 iy0 = _mm_srai_epi32(iy0, INTER_BITS); 04408 iy1 = _mm_srai_epi32(iy1, INTER_BITS); 04409 ix0 = _mm_packs_epi32(ix0, ix1); 04410 iy0 = _mm_packs_epi32(iy0, iy1); 04411 ix1 = _mm_unpacklo_epi16(ix0, iy0); 04412 iy1 = _mm_unpackhi_epi16(ix0, iy0); 04413 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); 04414 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); 04415 } 04416 } 04417 #elif CV_NEON 04418 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); 04419 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); 04420 04421 for( ; x1 <= bcols - 4; x1 += 4 ) 04422 { 04423 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)), 04424 v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale)); 04425 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, 04426 vandq_s32(v_sy, v_scale2)); 04427 vst1_u16(A + x1, vqmovun_s32(v_v)); 04428 04429 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), 04430 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); 04431 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); 04432 } 04433 #endif 04434 04435 for( ; x1 < bcols; x1++ ) 04436 { 04437 int sx = cvRound(sX[x1]*INTER_TAB_SIZE); 04438 int sy = cvRound(sY[x1]*INTER_TAB_SIZE); 04439 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); 04440 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS); 04441 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS); 04442 A[x1] = (ushort)v; 04443 } 04444 } 04445 else 04446 { 04447 const float* sXY = m1->ptr<float>(y+y1) + x*2; 04448 x1 = 0; 04449 04450 #if CV_NEON 04451 float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE); 04452 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); 04453 04454 for( ; x1 <= bcols - 4; x1 += 4 ) 04455 { 04456 float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1)); 04457 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale)); 04458 int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale)); 04459 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, 04460 vandq_s32(v_sy, v_scale2)); 04461 vst1_u16(A + x1, vqmovun_s32(v_v)); 04462 04463 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), 04464 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); 04465 vst1q_s16(XY + (x1 
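/*
 * The scalar tail above shows the encoding used for all interpolating modes:
 * each float coordinate is scaled by INTER_TAB_SIZE and rounded, the high bits
 * (>> INTER_BITS) become the integer source coordinate stored in the CV_16SC2
 * map, and the low bits of x and y are combined into one index
 * (sy_frac*INTER_TAB_SIZE + sx_frac) stored in the CV_16UC1 map, which later
 * selects a row of the precomputed weight table. Assuming the usual
 * INTER_BITS == 5 (INTER_TAB_SIZE == 32, defined elsewhere in imgproc), a map
 * value of 10.25 becomes round(10.25*32) = 328, i.e. integer part 10 and
 * fractional index 8 (= 0.25 * 32).
 */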
<< 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); 04466 } 04467 #endif 04468 04469 for( x1 = 0; x1 < bcols; x1++ ) 04470 { 04471 int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE); 04472 int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE); 04473 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); 04474 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS); 04475 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS); 04476 A[x1] = (ushort)v; 04477 } 04478 } 04479 } 04480 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue); 04481 } 04482 } 04483 } 04484 04485 private: 04486 const Mat* src; 04487 Mat* dst; 04488 const Mat *m1, *m2; 04489 int borderType; 04490 Scalar borderValue; 04491 int planar_input; 04492 RemapNNFunc nnfunc; 04493 RemapFunc ifunc; 04494 const void *ctab; 04495 }; 04496 04497 #ifdef HAVE_OPENCL 04498 04499 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2, 04500 int interpolation, int borderType, const Scalar& borderValue) 04501 { 04502 const ocl::Device & dev = ocl::Device::getDefault(); 04503 int cn = _src.channels(), type = _src.type(), depth = _src.depth(), 04504 rowsPerWI = dev.isIntel() ? 4 : 1; 04505 04506 if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) 04507 || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1) 04508 return false; 04509 04510 UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat(); 04511 04512 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) || 04513 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) ) 04514 { 04515 if (map1.type() != CV_16SC2) 04516 std::swap(map1, map2); 04517 } 04518 else 04519 CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) ); 04520 04521 _dst.create(map1.size(), type); 04522 UMat dst = _dst.getUMat(); 04523 04524 String kernelName = "remap"; 04525 if (map1.type() == CV_32FC2 && map2.empty()) 04526 kernelName += "_32FC2"; 04527 else if (map1.type() == CV_16SC2) 04528 { 04529 kernelName += "_16SC2"; 04530 if (!map2.empty()) 04531 kernelName += "_16UC1"; 04532 } 04533 else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) 04534 kernelName += "_2_32FC1"; 04535 else 04536 CV_Error(Error::StsBadArg, "Unsupported map types"); 04537 04538 static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" }; 04539 static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", 04540 "BORDER_REFLECT_101", "BORDER_TRANSPARENT" }; 04541 String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d", 04542 interMap[interpolation], borderMap[borderType], 04543 ocl::typeToStr(type), rowsPerWI); 04544 04545 if (interpolation != INTER_NEAREST) 04546 { 04547 char cvt[3][40]; 04548 int wdepth = std::max(CV_32F, depth); 04549 buildOptions = buildOptions 04550 + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s" 04551 " -D convertToWT2=%s -D WT2=%s", 04552 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), 04553 ocl::convertTypeStr(wdepth, depth, cn, cvt[0]), 04554 ocl::convertTypeStr(depth, wdepth, cn, cvt[1]), 04555 ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]), 04556 ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2))); 04557 } 04558 int scalarcn = cn == 3 ? 
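/*
 * ocl_remap picks a kernel specialisation from the map layout (plain CV_32FC2,
 * fixed-point CV_16SC2 with or without the CV_16UC1 fraction map, or a pair of
 * CV_32FC1 planes) and bakes the interpolation and border mode into the kernel
 * build options. The border value is passed as a small scalar matrix whose
 * channel count is widened from 3 to 4 (the ternary being built here),
 * presumably so it lines up with the 4-element vector types used on the device.
 */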
4 : cn; 04559 int sctype = CV_MAKETYPE(depth, scalarcn); 04560 buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d", 04561 ocl::typeToStr(type), ocl::typeToStr(depth), 04562 cn, ocl::typeToStr(sctype), depth); 04563 04564 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions); 04565 04566 Mat scalar(1, 1, sctype, borderValue); 04567 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst), 04568 map1arg = ocl::KernelArg::ReadOnlyNoSize(map1), 04569 scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize()); 04570 04571 if (map2.empty()) 04572 k.args(srcarg, dstarg, map1arg, scalararg); 04573 else 04574 k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg); 04575 04576 size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI }; 04577 return k.run(2, globalThreads, NULL, false); 04578 } 04579 04580 #endif 04581 04582 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && IPP_DISABLE_BLOCK 04583 04584 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi, 04585 const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep, 04586 void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation); 04587 04588 class IPPRemapInvoker : 04589 public ParallelLoopBody 04590 { 04591 public: 04592 IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc, 04593 int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) : 04594 ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc), 04595 ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok) 04596 { 04597 *ok = true; 04598 } 04599 04600 virtual void operator() (const Range & range) const 04601 { 04602 IppiRect srcRoiRect = { 0, 0, src.cols, src.rows }; 04603 Mat dstRoi = dst.rowRange(range); 04604 IppiSize dstRoiSize = ippiSize(dstRoi.size()); 04605 int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 04606 04607 if (borderType == BORDER_CONSTANT && 04608 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth)) 04609 { 04610 *ok = false; 04611 return; 04612 } 04613 04614 if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect, 04615 map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step, 04616 dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0) 04617 *ok = false; 04618 else 04619 { 04620 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 04621 } 04622 } 04623 04624 private: 04625 Mat & src, & dst, & map1, & map2; 04626 ippiRemap ippFunc; 04627 int ippInterpolation, borderType; 04628 Scalar borderValue; 04629 bool * ok; 04630 }; 04631 04632 #endif 04633 04634 } 04635 04636 void cv::remap( InputArray _src, OutputArray _dst, 04637 InputArray _map1, InputArray _map2, 04638 int interpolation, int borderType, const Scalar & borderValue ) 04639 { 04640 static RemapNNFunc nn_tab[] = 04641 { 04642 remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>, 04643 remapNearest<int>, remapNearest<float>, remapNearest<double>, 0 04644 }; 04645 04646 static RemapFunc linear_tab[] = 04647 { 04648 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0, 04649 remapBilinear<Cast<float, ushort>, RemapNoVec, float>, 04650 remapBilinear<Cast<float, short>, RemapNoVec, float>, 0, 04651 remapBilinear<Cast<float, 
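/*
 * These dispatch tables are indexed by the source depth (CV_MAT_DEPTH). nn_tab
 * covers every depth, while the interpolating tables (linear_tab, cubic_tab,
 * lanczos4_tab) leave CV_8S and CV_32S (and the unused last slot) as zero
 * entries; a zero entry trips the CV_Assert( ifunc != 0 ) later in this
 * function. Only the CV_8U entry uses the fixed-point FixedPtCast/short-weight
 * variant together with the RemapVec_8u SSE2 helper; all other depths run the
 * scalar RemapNoVec path with float weights.
 */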
float>, RemapNoVec, float>, 04652 remapBilinear<Cast<double, double>, RemapNoVec, float>, 0 04653 }; 04654 04655 static RemapFunc cubic_tab[] = 04656 { 04657 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0, 04658 remapBicubic<Cast<float, ushort>, float, 1>, 04659 remapBicubic<Cast<float, short>, float, 1>, 0, 04660 remapBicubic<Cast<float, float>, float, 1>, 04661 remapBicubic<Cast<double, double>, float, 1>, 0 04662 }; 04663 04664 static RemapFunc lanczos4_tab[] = 04665 { 04666 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0, 04667 remapLanczos4<Cast<float, ushort>, float, 1>, 04668 remapLanczos4<Cast<float, short>, float, 1>, 0, 04669 remapLanczos4<Cast<float, float>, float, 1>, 04670 remapLanczos4<Cast<double, double>, float, 1>, 0 04671 }; 04672 04673 CV_Assert( _map1.size().area() > 0 ); 04674 CV_Assert( _map2.empty() || (_map2.size() == _map1.size())); 04675 04676 #ifdef HAVE_OPENCL 04677 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 04678 ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue)) 04679 #endif 04680 04681 Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat(); 04682 _dst.create( map1.size(), src.type() ); 04683 Mat dst = _dst.getMat(); 04684 if( dst.data == src.data ) 04685 src = src.clone(); 04686 04687 if( interpolation == INTER_AREA ) 04688 interpolation = INTER_LINEAR; 04689 04690 int type = src.type(), depth = CV_MAT_DEPTH(type); 04691 04692 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && IPP_DISABLE_BLOCK 04693 CV_IPP_CHECK() 04694 { 04695 if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) && 04696 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && 04697 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT)) 04698 { 04699 int ippInterpolation = 04700 interpolation == INTER_NEAREST ? IPPI_INTER_NN : 04701 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC; 04702 04703 ippiRemap ippFunc = 04704 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R : 04705 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R : 04706 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R : 04707 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R : 04708 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R : 04709 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R : 04710 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R : 04711 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R : 04712 type == CV_32FC4 ? 
(ippiRemap)ippiRemap_32f_C4R : 0; 04713 04714 if (ippFunc) 04715 { 04716 bool ok; 04717 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation, 04718 borderType, borderValue, &ok); 04719 Range range(0, dst.rows); 04720 parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); 04721 04722 if (ok) 04723 { 04724 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 04725 return; 04726 } 04727 setIppErrorStatus(); 04728 } 04729 } 04730 } 04731 #endif 04732 04733 RemapNNFunc nnfunc = 0; 04734 RemapFunc ifunc = 0; 04735 const void* ctab = 0; 04736 bool fixpt = depth == CV_8U; 04737 bool planar_input = false; 04738 04739 if( interpolation == INTER_NEAREST ) 04740 { 04741 nnfunc = nn_tab[depth]; 04742 CV_Assert( nnfunc != 0 ); 04743 } 04744 else 04745 { 04746 if( interpolation == INTER_LINEAR ) 04747 ifunc = linear_tab[depth]; 04748 else if( interpolation == INTER_CUBIC ) 04749 ifunc = cubic_tab[depth]; 04750 else if( interpolation == INTER_LANCZOS4 ) 04751 ifunc = lanczos4_tab[depth]; 04752 else 04753 CV_Error( CV_StsBadArg, "Unknown interpolation method" ); 04754 CV_Assert( ifunc != 0 ); 04755 ctab = initInterTab2D( interpolation, fixpt ); 04756 } 04757 04758 const Mat *m1 = &map1, *m2 = &map2; 04759 04760 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) || 04761 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) ) 04762 { 04763 if( map1.type() != CV_16SC2 ) 04764 std::swap(m1, m2); 04765 } 04766 else 04767 { 04768 CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) || 04769 (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) ); 04770 planar_input = map1.channels() == 1; 04771 } 04772 04773 RemapInvoker invoker(src, dst, m1, m2, 04774 borderType, borderValue, planar_input, nnfunc, ifunc, 04775 ctab); 04776 parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16)); 04777 } 04778 04779 04780 void cv::convertMaps( InputArray _map1, InputArray _map2, 04781 OutputArray _dstmap1, OutputArray _dstmap2, 04782 int dstm1type, bool nninterpolate ) 04783 { 04784 Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2; 04785 Size size = map1.size(); 04786 const Mat *m1 = &map1, *m2 = &map2; 04787 int m1type = m1->type(), m2type = m2->type(); 04788 04789 CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) || 04790 (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) || 04791 (m1type == CV_32FC1 && m2type == CV_32FC1) || 04792 (m1type == CV_32FC2 && m2->empty()) ); 04793 04794 if( m2type == CV_16SC2 ) 04795 { 04796 std::swap( m1, m2 ); 04797 std::swap( m1type, m2type ); 04798 } 04799 04800 if( dstm1type <= 0 ) 04801 dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2; 04802 CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 ); 04803 _dstmap1.create( size, dstm1type ); 04804 dstmap1 = _dstmap1.getMat(); 04805 04806 if( !nninterpolate && dstm1type != CV_32FC2 ) 04807 { 04808 _dstmap2.create( size, dstm1type == CV_16SC2 ? 
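/*
 * convertMaps translates between the three map representations accepted by
 * remap: a pair of CV_32FC1 planes, a single CV_32FC2 map, and the fixed-point
 * CV_16SC2 (+ optional CV_16UC1 fraction) form. Converting float maps once and
 * reusing the fixed-point result makes repeated remap calls cheaper, because
 * the per-pixel rounding and table-index computation seen in RemapInvoker is
 * done only once. A minimal sketch, assuming the caller already has an image
 * img and two CV_32FC1 maps mapx/mapy (none of these are defined in this file):
 *
 *     cv::Mat fixedXY, fixedA, warped;
 *     cv::convertMaps(mapx, mapy, fixedXY, fixedA, CV_16SC2, false);
 *     cv::remap(img, warped, fixedXY, fixedA, cv::INTER_LINEAR,
 *               cv::BORDER_CONSTANT, cv::Scalar());
 */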
CV_16UC1 : CV_32FC1 ); 04809 dstmap2 = _dstmap2.getMat(); 04810 } 04811 else 04812 _dstmap2.release(); 04813 04814 if( m1type == dstm1type || (nninterpolate && 04815 ((m1type == CV_16SC2 && dstm1type == CV_32FC2) || 04816 (m1type == CV_32FC2 && dstm1type == CV_16SC2))) ) 04817 { 04818 m1->convertTo( dstmap1, dstmap1.type() ); 04819 if( !dstmap2.empty() && dstmap2.type() == m2->type() ) 04820 m2->copyTo( dstmap2 ); 04821 return; 04822 } 04823 04824 if( m1type == CV_32FC1 && dstm1type == CV_32FC2 ) 04825 { 04826 Mat vdata[] = { *m1, *m2 }; 04827 merge( vdata, 2, dstmap1 ); 04828 return; 04829 } 04830 04831 if( m1type == CV_32FC2 && dstm1type == CV_32FC1 ) 04832 { 04833 Mat mv[] = { dstmap1, dstmap2 }; 04834 split( *m1, mv ); 04835 return; 04836 } 04837 04838 if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) && 04839 dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) ) 04840 { 04841 size.width *= size.height; 04842 size.height = 1; 04843 } 04844 04845 #if CV_SSE2 04846 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); 04847 #endif 04848 #if CV_SSE4_1 04849 bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 04850 #endif 04851 04852 const float scale = 1.f/INTER_TAB_SIZE; 04853 int x, y; 04854 for( y = 0; y < size.height; y++ ) 04855 { 04856 const float* src1f = m1->ptr<float>(y); 04857 const float* src2f = m2->ptr<float>(y); 04858 const short* src1 = (const short*)src1f; 04859 const ushort* src2 = (const ushort*)src2f; 04860 04861 float* dst1f = dstmap1.ptr<float>(y); 04862 float* dst2f = dstmap2.ptr<float>(y); 04863 short* dst1 = (short*)dst1f; 04864 ushort* dst2 = (ushort*)dst2f; 04865 x = 0; 04866 04867 if( m1type == CV_32FC1 && dstm1type == CV_16SC2 ) 04868 { 04869 if( nninterpolate ) 04870 { 04871 #if CV_NEON 04872 for( ; x <= size.width - 8; x += 8 ) 04873 { 04874 int16x8x2_t v_dst; 04875 v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), 04876 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))); 04877 v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))), 04878 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4)))); 04879 04880 vst2q_s16(dst1 + (x << 1), v_dst); 04881 } 04882 #elif CV_SSE4_1 04883 if (useSSE4_1) 04884 { 04885 for( ; x <= size.width - 16; x += 16 ) 04886 { 04887 __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), 04888 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); 04889 __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), 04890 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); 04891 04892 __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), 04893 _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); 04894 __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), 04895 _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); 04896 04897 _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); 04898 04899 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); 04900 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); 04901 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); 04902 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); 04903 } 04904 } 04905 #endif 04906 for( ; x < size.width; x++ ) 04907 { 04908 dst1[x*2] = saturate_cast<short>(src1f[x]); 04909 dst1[x*2+1] = saturate_cast<short>(src2f[x]); 04910 } 04911 } 04912 else 04913 { 04914 #if CV_NEON 04915 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); 04916 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); 04917 04918 for( ; x <= 
size.width - 8; x += 8 ) 04919 { 04920 int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale)); 04921 int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale)); 04922 int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale)); 04923 int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale)); 04924 04925 int16x8x2_t v_dst; 04926 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), 04927 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); 04928 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), 04929 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); 04930 04931 vst2q_s16(dst1 + (x << 1), v_dst); 04932 04933 uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), 04934 vandq_s32(v_ix0, v_mask))); 04935 uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), 04936 vandq_s32(v_ix1, v_mask))); 04937 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); 04938 } 04939 #elif CV_SSE4_1 04940 if (useSSE4_1) 04941 { 04942 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); 04943 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); 04944 04945 for( ; x <= size.width - 16; x += 16 ) 04946 { 04947 __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); 04948 __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); 04949 __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); 04950 __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); 04951 04952 __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), 04953 _mm_srai_epi32(v_ix1, INTER_BITS)); 04954 __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), 04955 _mm_srai_epi32(v_iy1, INTER_BITS)); 04956 __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), 04957 _mm_and_si128(v_ix0, v_its1)); 04958 __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), 04959 _mm_and_si128(v_ix1, v_its1)); 04960 _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); 04961 04962 v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); 04963 v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); 04964 v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); 04965 v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); 04966 04967 __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), 04968 _mm_srai_epi32(v_ix1, INTER_BITS)); 04969 __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), 04970 _mm_srai_epi32(v_iy1, INTER_BITS)); 04971 v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), 04972 _mm_and_si128(v_ix0, v_its1)); 04973 v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), 04974 _mm_and_si128(v_ix1, v_its1)); 04975 _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); 04976 04977 _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); 04978 04979 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); 04980 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); 04981 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); 04982 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); 04983 } 04984 } 04985 #endif 04986 for( ; x < size.width; x++ ) 04987 { 04988 int ix = 
saturate_cast<int>(src1f[x]*INTER_TAB_SIZE); 04989 int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE); 04990 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); 04991 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); 04992 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); 04993 } 04994 } 04995 } 04996 else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 ) 04997 { 04998 if( nninterpolate ) 04999 { 05000 #if CV_NEON 05001 for( ; x <= (size.width << 1) - 8; x += 8 ) 05002 vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), 05003 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); 05004 #elif CV_SSE2 05005 for( ; x <= (size.width << 1) - 8; x += 8 ) 05006 { 05007 _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), 05008 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)))); 05009 } 05010 #endif 05011 for( ; x < size.width; x++ ) 05012 { 05013 dst1[x*2] = saturate_cast<short>(src1f[x*2]); 05014 dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]); 05015 } 05016 } 05017 else 05018 { 05019 #if CV_NEON 05020 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); 05021 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); 05022 05023 for( ; x <= size.width - 8; x += 8 ) 05024 { 05025 float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8); 05026 int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale)); 05027 int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale)); 05028 int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale)); 05029 int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale)); 05030 05031 int16x8x2_t v_dst; 05032 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), 05033 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); 05034 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), 05035 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); 05036 05037 vst2q_s16(dst1 + (x << 1), v_dst); 05038 05039 uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), 05040 vandq_s32(v_ix0, v_mask))); 05041 uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), 05042 vandq_s32(v_ix1, v_mask))); 05043 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); 05044 } 05045 #elif CV_SSE4_1 05046 if (useSSE4_1) 05047 { 05048 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); 05049 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); 05050 __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); 05051 05052 for( ; x <= size.width - 4; x += 4 ) 05053 { 05054 __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); 05055 __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); 05056 05057 __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), 05058 _mm_srai_epi32(v_src1, INTER_BITS)); 05059 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); 05060 05061 // x0 y0 x1 y1 . . . 05062 v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), 05063 _mm_and_si128(v_src1, v_its1)); 05064 __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . 05065 _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . 
05066 _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); 05067 } 05068 } 05069 #endif 05070 for( ; x < size.width; x++ ) 05071 { 05072 int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE); 05073 int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE); 05074 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); 05075 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); 05076 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); 05077 } 05078 } 05079 } 05080 else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 ) 05081 { 05082 #if CV_NEON 05083 uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1); 05084 uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1); 05085 float32x4_t v_scale = vdupq_n_f32(scale); 05086 05087 for( ; x <= size.width - 8; x += 8) 05088 { 05089 uint32x4_t v_fxy1, v_fxy2; 05090 if (src2) 05091 { 05092 uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2); 05093 v_fxy1 = vmovl_u16(vget_low_u16(v_src2)); 05094 v_fxy2 = vmovl_u16(vget_high_u16(v_src2)); 05095 } 05096 else 05097 v_fxy1 = v_fxy2 = v_zero; 05098 05099 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); 05100 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), 05101 v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask))); 05102 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), 05103 v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS))); 05104 vst1q_f32(dst1f + x, v_dst1); 05105 vst1q_f32(dst2f + x, v_dst2); 05106 05107 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), 05108 v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask))); 05109 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), 05110 v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS))); 05111 vst1q_f32(dst1f + x + 4, v_dst1); 05112 vst1q_f32(dst2f + x + 4, v_dst2); 05113 } 05114 #elif CV_SSE2 05115 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); 05116 __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); 05117 __m128 v_scale = _mm_set1_ps(scale); 05118 05119 for( ; x <= size.width - 16; x += 16) 05120 { 05121 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); 05122 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8)); 05123 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16)); 05124 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24)); 05125 05126 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21); 05127 05128 __m128i v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; 05129 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); 05130 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)), 05131 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 05132 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)), 05133 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 05134 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); 05135 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)), 05136 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 05137 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)), 05138 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 05139 05140 v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero; 05141 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); 05142 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)), 05143 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 05144 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)), 05145 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 05146 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); 05147 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)), 05148 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 05149 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)), 05150 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 05151 } 05152 #endif 05153 for( ; x < size.width; x++ ) 05154 { 05155 int fxy = src2 ? 
src2[x] & (INTER_TAB_SIZE2-1) : 0; 05156 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; 05157 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; 05158 } 05159 } 05160 else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 ) 05161 { 05162 #if CV_NEON 05163 int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1); 05164 int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1); 05165 float32x4_t v_scale = vdupq_n_f32(scale); 05166 05167 for( ; x <= size.width - 8; x += 8) 05168 { 05169 int32x4_t v_fxy1, v_fxy2; 05170 if (src2) 05171 { 05172 int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2); 05173 v_fxy1 = vmovl_s16(vget_low_s16(v_src2)); 05174 v_fxy2 = vmovl_s16(vget_high_s16(v_src2)); 05175 } 05176 else 05177 v_fxy1 = v_fxy2 = v_zero; 05178 05179 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); 05180 float32x4x2_t v_dst; 05181 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), 05182 v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask))); 05183 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), 05184 v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS))); 05185 vst2q_f32(dst1f + (x << 1), v_dst); 05186 05187 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), 05188 v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask))); 05189 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), 05190 v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); 05191 vst2q_f32(dst1f + (x << 1) + 8, v_dst); 05192 } 05193 #elif CV_SSE2 05194 if (useSSE2) 05195 { 05196 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); 05197 __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); 05198 __m128 v_scale = _mm_set1_ps(scale); 05199 05200 for ( ; x <= size.width - 8; x += 8) 05201 { 05202 __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); 05203 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; 05204 __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); 05205 __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); 05206 05207 __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); 05208 _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); 05209 05210 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); 05211 _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); 05212 } 05213 } 05214 #endif 05215 for( ; x < size.width; x++ ) 05216 { 05217 int fxy = src2 ? 
src2[x] & (INTER_TAB_SIZE2-1): 0; 05218 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; 05219 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; 05220 } 05221 } 05222 else 05223 CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" ); 05224 } 05225 } 05226 05227 05228 namespace cv 05229 { 05230 05231 class WarpAffineInvoker : 05232 public ParallelLoopBody 05233 { 05234 public: 05235 WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType, 05236 const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) : 05237 ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation), 05238 borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta), 05239 M(_M) 05240 { 05241 } 05242 05243 virtual void operator() (const Range& range) const 05244 { 05245 const int BLOCK_SZ = 64; 05246 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ]; 05247 const int AB_BITS = MAX(10, (int)INTER_BITS); 05248 const int AB_SCALE = 1 << AB_BITS; 05249 int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; 05250 #if CV_SSE2 05251 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); 05252 #endif 05253 #if CV_SSE4_1 05254 bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 05255 #endif 05256 05257 int bh0 = std::min(BLOCK_SZ/2, dst.rows); 05258 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols); 05259 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows); 05260 05261 for( y = range.start; y < range.end; y += bh0 ) 05262 { 05263 for( x = 0; x < dst.cols; x += bw0 ) 05264 { 05265 int bw = std::min( bw0, dst.cols - x); 05266 int bh = std::min( bh0, range.end - y); 05267 05268 Mat _XY(bh, bw, CV_16SC2, XY), matA; 05269 Mat dpart(dst, Rect(x, y, bw, bh)); 05270 05271 for( y1 = 0; y1 < bh; y1++ ) 05272 { 05273 short* xy = XY + y1*bw*2; 05274 int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta; 05275 int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta; 05276 05277 if( interpolation == INTER_NEAREST ) 05278 { 05279 x1 = 0; 05280 #if CV_NEON 05281 int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0); 05282 for( ; x1 <= bw - 8; x1 += 8 ) 05283 { 05284 int16x8x2_t v_dst; 05285 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)), 05286 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS))); 05287 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)), 05288 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS))); 05289 05290 vst2q_s16(xy + (x1 << 1), v_dst); 05291 } 05292 #elif CV_SSE4_1 05293 if (useSSE4_1) 05294 { 05295 __m128i v_X0 = _mm_set1_epi32(X0); 05296 __m128i v_Y0 = _mm_set1_epi32(Y0); 05297 for ( ; x1 <= bw - 16; x1 += 16) 05298 { 05299 __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), 05300 _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); 05301 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), 05302 _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); 05303 05304 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), 
05305 _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); 05306 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), 05307 _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); 05308 05309 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); 05310 05311 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); 05312 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); 05313 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); 05314 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); 05315 } 05316 } 05317 #endif 05318 for( ; x1 < bw; x1++ ) 05319 { 05320 int X = (X0 + adelta[x+x1]) >> AB_BITS; 05321 int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; 05322 xy[x1*2] = saturate_cast<short>(X); 05323 xy[x1*2+1] = saturate_cast<short>(Y); 05324 } 05325 } 05326 else 05327 { 05328 short* alpha = A + y1*bw; 05329 x1 = 0; 05330 #if CV_SSE2 05331 if( useSSE2 ) 05332 { 05333 __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); 05334 __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); 05335 for( ; x1 <= bw - 8; x1 += 8 ) 05336 { 05337 __m128i tx0, tx1, ty0, ty1; 05338 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX); 05339 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY); 05340 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX); 05341 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY); 05342 05343 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS); 05344 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS); 05345 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS); 05346 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS); 05347 05348 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask), 05349 _mm_and_si128(tx1, fxy_mask)); 05350 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask), 05351 _mm_and_si128(ty1, fxy_mask)); 05352 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS), 05353 _mm_srai_epi32(tx1, INTER_BITS)); 05354 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS), 05355 _mm_srai_epi32(ty1, INTER_BITS)); 05356 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS)); 05357 05358 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0)); 05359 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0)); 05360 _mm_storeu_si128((__m128i*)(alpha + x1), fx_); 05361 } 05362 } 05363 #elif CV_NEON 05364 int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); 05365 for( ; x1 <= bw - 8; x1 += 8 ) 05366 { 05367 int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS); 05368 int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS); 05369 int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS); 05370 int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS); 05371 05372 int16x8x2_t v_xy; 05373 v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS))); 05374 v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS))); 05375 05376 vst2q_s16(xy + (x1 << 1), v_xy); 05377 05378 int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS), 05379 vandq_s32(v_X0, 
v_mask))); 05380 int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS), 05381 vandq_s32(v_X1, v_mask))); 05382 vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1)); 05383 } 05384 #endif 05385 for( ; x1 < bw; x1++ ) 05386 { 05387 int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS); 05388 int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS); 05389 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS); 05390 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS); 05391 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + 05392 (X & (INTER_TAB_SIZE-1))); 05393 } 05394 } 05395 } 05396 05397 if( interpolation == INTER_NEAREST ) 05398 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue ); 05399 else 05400 { 05401 Mat _matA(bh, bw, CV_16U, A); 05402 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue ); 05403 } 05404 } 05405 } 05406 } 05407 05408 private: 05409 Mat src; 05410 Mat dst; 05411 int interpolation, borderType; 05412 Scalar borderValue; 05413 int *adelta, *bdelta; 05414 double *M; 05415 }; 05416 05417 05418 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK 05419 class IPPWarpAffineInvoker : 05420 public ParallelLoopBody 05421 { 05422 public: 05423 IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType, 05424 const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) : 05425 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs), 05426 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok) 05427 { 05428 *ok = true; 05429 } 05430 05431 virtual void operator() (const Range& range) const 05432 { 05433 IppiSize srcsize = { src.cols, src.rows }; 05434 IppiRect srcroi = { 0, 0, src.cols, src.rows }; 05435 IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start }; 05436 int cnn = src.channels(); 05437 if( borderType == BORDER_CONSTANT ) 05438 { 05439 IppiSize setSize = { dst.cols, range.end - range.start }; 05440 void *dataPointer = dst.ptr(range.start); 05441 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) ) 05442 { 05443 *ok = false; 05444 return; 05445 } 05446 } 05447 05448 // Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr 05449 IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), 05450 (int)dst.step[0], dstroi, coeffs, mode ); 05451 if( status < 0) 05452 *ok = false; 05453 else 05454 { 05455 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 05456 } 05457 } 05458 private: 05459 Mat &src; 05460 Mat &dst; 05461 int mode; 05462 double (&coeffs)[2][3]; 05463 int borderType; 05464 Scalar borderValue; 05465 ippiWarpAffineBackFunc func; 05466 bool *ok; 05467 const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&); 05468 }; 05469 #endif 05470 05471 #ifdef HAVE_OPENCL 05472 05473 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 }; 05474 05475 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, 05476 Size dsize, int flags, int borderType, const Scalar& borderValue, 05477 int op_type) 05478 { 05479 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE); 05480 const ocl::Device & dev = ocl::Device::getDefault(); 05481 05482 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 05483 const bool doubleSupport = dev.doubleFPConfig() > 0; 05484 05485 int interpolation = flags & INTER_MAX; 05486 if( interpolation == INTER_AREA ) 05487 
interpolation = INTER_LINEAR; 05488 int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1; 05489 05490 if ( !(borderType == cv::BORDER_CONSTANT && 05491 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) || 05492 (!doubleSupport && depth == CV_64F) || cn > 4) 05493 return false; 05494 05495 const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" }; 05496 ocl::ProgramSource program = op_type == OCL_OP_AFFINE ? 05497 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc; 05498 const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective"; 05499 05500 int scalarcn = cn == 3 ? 4 : cn; 05501 bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE; 05502 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth); 05503 int sctype = CV_MAKETYPE(wdepth, scalarcn); 05504 05505 ocl::Kernel k; 05506 String opts; 05507 if (interpolation == INTER_NEAREST) 05508 { 05509 opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d", 05510 ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "", 05511 ocl::typeToStr(CV_MAT_DEPTH(type)), 05512 ocl::typeToStr(sctype), cn, rowsPerWI); 05513 } 05514 else 05515 { 05516 char cvt[2][50]; 05517 opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d" 05518 " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d", 05519 interpolationMap[interpolation], ocl::typeToStr(type), 05520 ocl::typeToStr(CV_MAT_DEPTH(type)), 05521 ocl::typeToStr(sctype), 05522 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth, 05523 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), 05524 ocl::convertTypeStr(wdepth, depth, cn, cvt[1]), 05525 doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI); 05526 } 05527 05528 k.create(kernelName, program, opts); 05529 if (k.empty()) 05530 return false; 05531 05532 double borderBuf[] = { 0, 0, 0, 0 }; 05533 scalarToRawData(borderValue, borderBuf, sctype); 05534 05535 UMat src = _src.getUMat(), M0; 05536 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() ); 05537 UMat dst = _dst.getUMat(); 05538 05539 double M[9]; 05540 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3); 05541 Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat(); 05542 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && 05543 M1.rows == matRows && M1.cols == 3 ); 05544 M1.convertTo(matM, matM.type()); 05545 05546 if( !(flags & WARP_INVERSE_MAP) ) 05547 { 05548 if (op_type == OCL_OP_PERSPECTIVE) 05549 invert(matM, matM); 05550 else 05551 { 05552 double D = M[0]*M[4] - M[1]*M[3]; 05553 D = D != 0 ? 1./D : 0; 05554 double A11 = M[4]*D, A22=M[0]*D; 05555 M[0] = A11; M[1] *= -D; 05556 M[3] *= -D; M[4] = A22; 05557 double b1 = -M[0]*M[2] - M[1]*M[5]; 05558 double b2 = -M[3]*M[2] - M[4]*M[5]; 05559 M[2] = b1; M[5] = b2; 05560 } 05561 } 05562 matM.convertTo(M0, doubleSupport ? 
CV_64F : CV_32F); 05563 05564 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0), 05565 ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype))); 05566 05567 size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI }; 05568 return k.run(2, globalThreads, NULL, false); 05569 } 05570 05571 #endif 05572 05573 } 05574 05575 05576 void cv::warpAffine( InputArray _src, OutputArray _dst, 05577 InputArray _M0, Size dsize, 05578 int flags, int borderType, const Scalar & borderValue ) 05579 { 05580 #ifdef HAVE_OPENCL 05581 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 05582 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, 05583 borderValue, OCL_OP_AFFINE)) 05584 #endif 05585 05586 Mat src = _src.getMat(), M0 = _M0.getMat(); 05587 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() ); 05588 Mat dst = _dst.getMat(); 05589 CV_Assert( src.cols > 0 && src.rows > 0 ); 05590 if( dst.data == src.data ) 05591 src = src.clone(); 05592 05593 double M[6]; 05594 Mat matM(2, 3, CV_64F, M); 05595 int interpolation = flags & INTER_MAX; 05596 if( interpolation == INTER_AREA ) 05597 interpolation = INTER_LINEAR; 05598 05599 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 ); 05600 M0.convertTo(matM, matM.type()); 05601 05602 #ifdef HAVE_TEGRA_OPTIMIZATION 05603 if( tegra::useTegra() && tegra::warpAffine(src, dst, M, flags, borderType, borderValue) ) 05604 return; 05605 #endif 05606 05607 if( !(flags & WARP_INVERSE_MAP) ) 05608 { 05609 double D = M[0]*M[4] - M[1]*M[3]; 05610 D = D != 0 ? 1./D : 0; 05611 double A11 = M[4]*D, A22=M[0]*D; 05612 M[0] = A11; M[1] *= -D; 05613 M[3] *= -D; M[4] = A22; 05614 double b1 = -M[0]*M[2] - M[1]*M[5]; 05615 double b2 = -M[3]*M[2] - M[4]*M[5]; 05616 M[2] = b1; M[5] = b2; 05617 } 05618 05619 int x; 05620 AutoBuffer<int> _abdelta(dst.cols*2); 05621 int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols; 05622 const int AB_BITS = MAX(10, (int)INTER_BITS); 05623 const int AB_SCALE = 1 << AB_BITS; 05624 05625 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK 05626 CV_IPP_CHECK() 05627 { 05628 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 05629 if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) && 05630 ( cn == 1 || cn == 3 || cn == 4 ) && 05631 ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) && 05632 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) ) 05633 { 05634 ippiWarpAffineBackFunc ippFunc = 0; 05635 if ((flags & WARP_INVERSE_MAP) != 0) 05636 { 05637 ippFunc = 05638 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R : 05639 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R : 05640 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R : 05641 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R : 05642 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R : 05643 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R : 05644 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R : 05645 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R : 05646 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R : 05647 0; 05648 } 05649 else 05650 { 05651 ippFunc = 05652 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R : 05653 type == CV_8UC3 ? 
(ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R : 05654 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R : 05655 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R : 05656 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R : 05657 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R : 05658 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R : 05659 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R : 05660 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R : 05661 0; 05662 } 05663 int mode = 05664 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 05665 interpolation == INTER_NEAREST ? IPPI_INTER_NN : 05666 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 05667 0; 05668 CV_Assert(mode && ippFunc); 05669 05670 double coeffs[2][3]; 05671 for( int i = 0; i < 2; i++ ) 05672 for( int j = 0; j < 3; j++ ) 05673 coeffs[i][j] = matM.at<double>(i, j); 05674 05675 bool ok; 05676 Range range(0, dst.rows); 05677 IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok); 05678 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 05679 if( ok ) 05680 { 05681 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 05682 return; 05683 } 05684 setIppErrorStatus(); 05685 } 05686 } 05687 #endif 05688 05689 for( x = 0; x < dst.cols; x++ ) 05690 { 05691 adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE); 05692 bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE); 05693 } 05694 05695 Range range(0, dst.rows); 05696 WarpAffineInvoker invoker(src, dst, interpolation, borderType, 05697 borderValue, adelta, bdelta, M); 05698 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 05699 } 05700 05701 05702 namespace cv 05703 { 05704 05705 class WarpPerspectiveInvoker : 05706 public ParallelLoopBody 05707 { 05708 public: 05709 WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation, 05710 int _borderType, const Scalar &_borderValue) : 05711 ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation), 05712 borderType(_borderType), borderValue(_borderValue) 05713 { 05714 } 05715 05716 virtual void operator() (const Range& range) const 05717 { 05718 const int BLOCK_SZ = 32; 05719 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ]; 05720 int x, y, x1, y1, width = dst.cols, height = dst.rows; 05721 05722 int bh0 = std::min(BLOCK_SZ/2, height); 05723 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); 05724 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); 05725 05726 #if CV_SSE4_1 05727 bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 05728 __m128d v_M0 = _mm_set1_pd(M[0]); 05729 __m128d v_M3 = _mm_set1_pd(M[3]); 05730 __m128d v_M6 = _mm_set1_pd(M[6]); 05731 __m128d v_intmax = _mm_set1_pd((double)INT_MAX); 05732 __m128d v_intmin = _mm_set1_pd((double)INT_MIN); 05733 __m128d v_2 = _mm_set1_pd(2), 05734 v_zero = _mm_setzero_pd(), 05735 v_1 = _mm_set1_pd(1), 05736 v_its = _mm_set1_pd(INTER_TAB_SIZE); 05737 __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); 05738 #endif 05739 05740 for( y = range.start; y < range.end; y += bh0 ) 05741 { 05742 for( x = 0; x < width; x += bw0 ) 05743 { 05744 int bw = std::min( bw0, width - x); 05745 int bh = std::min( bh0, range.end - y); // height 05746 05747 Mat _XY(bh, bw, CV_16SC2, XY), matA; 05748 Mat dpart(dst, Rect(x, y, bw, bh)); 05749 05750 for( y1 = 0; y1 < bh; y1++ ) 05751 { 05752 short* xy = XY + y1*bw*2; 05753 double X0 = M[0]*x + M[1]*(y + y1) + M[2]; 05754 double Y0 = M[3]*x + M[4]*(y + y1) + M[5]; 05755 double 
W0 = M[6]*x + M[7]*(y + y1) + M[8]; 05756 05757 if( interpolation == INTER_NEAREST ) 05758 { 05759 x1 = 0; 05760 05761 #if CV_SSE4_1 05762 if (haveSSE4_1) 05763 { 05764 __m128d v_X0d = _mm_set1_pd(X0); 05765 __m128d v_Y0d = _mm_set1_pd(Y0); 05766 __m128d v_W0 = _mm_set1_pd(W0); 05767 __m128d v_x1 = _mm_set_pd(1, 0); 05768 05769 for( ; x1 <= bw - 16; x1 += 16 ) 05770 { 05771 // 0-3 05772 __m128i v_X0, v_Y0; 05773 { 05774 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05775 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05776 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05777 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05778 v_x1 = _mm_add_pd(v_x1, v_2); 05779 05780 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05781 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05782 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05783 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05784 v_x1 = _mm_add_pd(v_x1, v_2); 05785 05786 v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05787 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05788 v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05789 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05790 } 05791 05792 // 4-8 05793 __m128i v_X1, v_Y1; 05794 { 05795 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05796 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05797 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05798 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05799 v_x1 = _mm_add_pd(v_x1, v_2); 05800 05801 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05802 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05803 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05804 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05805 v_x1 = _mm_add_pd(v_x1, v_2); 05806 05807 v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05808 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05809 v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05810 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05811 } 05812 05813 // 8-11 05814 __m128i v_X2, v_Y2; 05815 { 05816 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05817 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05818 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05819 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05820 v_x1 = _mm_add_pd(v_x1, v_2); 05821 05822 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05823 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05824 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05825 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05826 v_x1 = _mm_add_pd(v_x1, v_2); 05827 05828 v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05829 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05830 v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05831 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05832 } 05833 05834 // 12-15 05835 __m128i v_X3, v_Y3; 05836 { 05837 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05838 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05839 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05840 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05841 v_x1 = _mm_add_pd(v_x1, v_2); 05842 05843 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05844 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 05845 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05846 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05847 v_x1 = _mm_add_pd(v_x1, v_2); 05848 05849 v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05850 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05851 v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05852 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05853 } 05854 05855 // convert to 16s 05856 v_X0 = _mm_packs_epi32(v_X0, v_X1); 05857 v_X1 = _mm_packs_epi32(v_X2, v_X3); 05858 v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); 05859 v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); 05860 05861 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); 05862 05863 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); 05864 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); 05865 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); 05866 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); 05867 } 05868 } 05869 #endif 05870 05871 for( ; x1 < bw; x1++ ) 05872 { 05873 double W = W0 + M[6]*x1; 05874 W = W ? 
1./W : 0; 05875 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W)); 05876 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W)); 05877 int X = saturate_cast<int>(fX); 05878 int Y = saturate_cast<int>(fY); 05879 05880 xy[x1*2] = saturate_cast<short>(X); 05881 xy[x1*2+1] = saturate_cast<short>(Y); 05882 } 05883 } 05884 else 05885 { 05886 short* alpha = A + y1*bw; 05887 x1 = 0; 05888 05889 #if CV_SSE4_1 05890 if (haveSSE4_1) 05891 { 05892 __m128d v_X0d = _mm_set1_pd(X0); 05893 __m128d v_Y0d = _mm_set1_pd(Y0); 05894 __m128d v_W0 = _mm_set1_pd(W0); 05895 __m128d v_x1 = _mm_set_pd(1, 0); 05896 05897 for( ; x1 <= bw - 16; x1 += 16 ) 05898 { 05899 // 0-3 05900 __m128i v_X0, v_Y0; 05901 { 05902 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05903 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05904 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05905 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05906 v_x1 = _mm_add_pd(v_x1, v_2); 05907 05908 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05909 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05910 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05911 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05912 v_x1 = _mm_add_pd(v_x1, v_2); 05913 05914 v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05915 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05916 v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05917 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05918 } 05919 05920 // 4-8 05921 __m128i v_X1, v_Y1; 05922 { 05923 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05924 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05925 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05926 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05927 v_x1 = _mm_add_pd(v_x1, v_2); 05928 05929 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05930 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05931 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05932 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05933 v_x1 = _mm_add_pd(v_x1, v_2); 05934 05935 v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05936 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05937 v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05938 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05939 } 05940 05941 // 8-11 05942 __m128i v_X2, v_Y2; 05943 { 05944 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05945 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05946 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05947 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05948 v_x1 = 
_mm_add_pd(v_x1, v_2); 05949 05950 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05951 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05952 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05953 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05954 v_x1 = _mm_add_pd(v_x1, v_2); 05955 05956 v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05957 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05958 v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05959 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05960 } 05961 05962 // 12-15 05963 __m128i v_X3, v_Y3; 05964 { 05965 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05966 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05967 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05968 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05969 v_x1 = _mm_add_pd(v_x1, v_2); 05970 05971 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 05972 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 05973 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 05974 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 05975 v_x1 = _mm_add_pd(v_x1, v_2); 05976 05977 v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 05978 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 05979 v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 05980 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 05981 } 05982 05983 // store alpha 05984 __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), 05985 _mm_and_si128(v_X0, v_itsi1)); 05986 __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), 05987 _mm_and_si128(v_X1, v_itsi1)); 05988 _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); 05989 05990 v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), 05991 _mm_and_si128(v_X2, v_itsi1)); 05992 v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), 05993 _mm_and_si128(v_X3, v_itsi1)); 05994 _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); 05995 05996 // convert to 16s 05997 v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); 05998 v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); 05999 v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); 06000 v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); 06001 06002 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); 06003 06004 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); 06005 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); 06006 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); 06007 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); 06008 } 06009 } 06010 #endif 06011 06012 for( ; x1 < bw; x1++ ) 06013 { 06014 double W = W0 + M[6]*x1; 06015 W = W ? 
INTER_TAB_SIZE/W : 0; 06016 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W)); 06017 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W)); 06018 int X = saturate_cast<int>(fX); 06019 int Y = saturate_cast<int>(fY); 06020 06021 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS); 06022 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS); 06023 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + 06024 (X & (INTER_TAB_SIZE-1))); 06025 } 06026 } 06027 } 06028 06029 if( interpolation == INTER_NEAREST ) 06030 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue ); 06031 else 06032 { 06033 Mat _matA(bh, bw, CV_16U, A); 06034 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue ); 06035 } 06036 } 06037 } 06038 } 06039 06040 private: 06041 Mat src; 06042 Mat dst; 06043 double* M; 06044 int interpolation, borderType; 06045 Scalar borderValue; 06046 }; 06047 06048 06049 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK 06050 class IPPWarpPerspectiveInvoker : 06051 public ParallelLoopBody 06052 { 06053 public: 06054 IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation, 06055 int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) : 06056 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs), 06057 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok) 06058 { 06059 *ok = true; 06060 } 06061 06062 virtual void operator() (const Range& range) const 06063 { 06064 IppiSize srcsize = {src.cols, src.rows}; 06065 IppiRect srcroi = {0, 0, src.cols, src.rows}; 06066 IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start}; 06067 int cnn = src.channels(); 06068 06069 if( borderType == BORDER_CONSTANT ) 06070 { 06071 IppiSize setSize = {dst.cols, range.end - range.start}; 06072 void *dataPointer = dst.ptr(range.start); 06073 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) ) 06074 { 06075 *ok = false; 06076 return; 06077 } 06078 } 06079 06080 IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode); 06081 if (status != ippStsNoErr) 06082 *ok = false; 06083 else 06084 { 06085 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 06086 } 06087 } 06088 private: 06089 Mat &src; 06090 Mat &dst; 06091 int mode; 06092 double (&coeffs)[3][3]; 06093 int borderType; 06094 const Scalar borderValue; 06095 ippiWarpPerspectiveFunc func; 06096 bool *ok; 06097 06098 const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&); 06099 }; 06100 #endif 06101 } 06102 06103 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0, 06104 Size dsize, int flags, int borderType, const Scalar & borderValue ) 06105 { 06106 CV_Assert( _src.total() > 0 ); 06107 06108 #ifdef HAVE_OPENCL 06109 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 06110 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue, 06111 OCL_OP_PERSPECTIVE)) 06112 #endif 06113 06114 Mat src = _src.getMat(), M0 = _M0.getMat(); 06115 _dst.create( dsize.area() == 0 ? 
src.size() : dsize, src.type() ); 06116 Mat dst = _dst.getMat(); 06117 06118 if( dst.data == src.data ) 06119 src = src.clone(); 06120 06121 double M[9]; 06122 Mat matM(3, 3, CV_64F, M); 06123 int interpolation = flags & INTER_MAX; 06124 if( interpolation == INTER_AREA ) 06125 interpolation = INTER_LINEAR; 06126 06127 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 ); 06128 M0.convertTo(matM, matM.type()); 06129 06130 #ifdef HAVE_TEGRA_OPTIMIZATION 06131 if( tegra::useTegra() && tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) ) 06132 return; 06133 #endif 06134 06135 06136 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK 06137 CV_IPP_CHECK() 06138 { 06139 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 06140 if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) && 06141 (cn == 1 || cn == 3 || cn == 4) && 06142 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) && 06143 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC)) 06144 { 06145 ippiWarpPerspectiveFunc ippFunc = 0; 06146 if ((flags & WARP_INVERSE_MAP) != 0) 06147 { 06148 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R : 06149 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R : 06150 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R : 06151 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R : 06152 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R : 06153 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R : 06154 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R : 06155 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R : 06156 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0; 06157 } 06158 else 06159 { 06160 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R : 06161 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R : 06162 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R : 06163 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R : 06164 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R : 06165 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R : 06166 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R : 06167 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R : 06168 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0; 06169 } 06170 int mode = 06171 interpolation == INTER_NEAREST ? IPPI_INTER_NN : 06172 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 06173 interpolation == INTER_CUBIC ? 
IPPI_INTER_CUBIC : 0; 06174 CV_Assert(mode && ippFunc); 06175 06176 double coeffs[3][3]; 06177 for( int i = 0; i < 3; i++ ) 06178 for( int j = 0; j < 3; j++ ) 06179 coeffs[i][j] = matM.at<double>(i, j); 06180 06181 bool ok; 06182 Range range(0, dst.rows); 06183 IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok); 06184 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 06185 if( ok ) 06186 { 06187 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 06188 return; 06189 } 06190 setIppErrorStatus(); 06191 } 06192 } 06193 #endif 06194 06195 if( !(flags & WARP_INVERSE_MAP) ) 06196 invert(matM, matM); 06197 06198 Range range(0, dst.rows); 06199 WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue); 06200 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 06201 } 06202 06203 06204 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale ) 06205 { 06206 angle *= CV_PI/180; 06207 double alpha = cos(angle)*scale; 06208 double beta = sin(angle)*scale; 06209 06210 Mat M(2, 3, CV_64F); 06211 double* m = M.ptr<double>(); 06212 06213 m[0] = alpha; 06214 m[1] = beta; 06215 m[2] = (1-alpha)*center.x - beta*center.y; 06216 m[3] = -beta; 06217 m[4] = alpha; 06218 m[5] = beta*center.x + (1-alpha)*center.y; 06219 06220 return M; 06221 } 06222 06223 /* Calculates coefficients of perspective transformation 06224 * which maps (xi,yi) to (ui,vi), (i=1,2,3,4): 06225 * 06226 * c00*xi + c01*yi + c02 06227 * ui = --------------------- 06228 * c20*xi + c21*yi + c22 06229 * 06230 * c10*xi + c11*yi + c12 06231 * vi = --------------------- 06232 * c20*xi + c21*yi + c22 06233 * 06234 * Coefficients are calculated by solving linear system: 06235 * / x0 y0 1 0 0 0 -x0*u0 -y0*u0 \ /c00\ /u0\ 06236 * | x1 y1 1 0 0 0 -x1*u1 -y1*u1 | |c01| |u1| 06237 * | x2 y2 1 0 0 0 -x2*u2 -y2*u2 | |c02| |u2| 06238 * | x3 y3 1 0 0 0 -x3*u3 -y3*u3 |.|c10|=|u3|, 06239 * | 0 0 0 x0 y0 1 -x0*v0 -y0*v0 | |c11| |v0| 06240 * | 0 0 0 x1 y1 1 -x1*v1 -y1*v1 | |c12| |v1| 06241 * | 0 0 0 x2 y2 1 -x2*v2 -y2*v2 | |c20| |v2| 06242 * \ 0 0 0 x3 y3 1 -x3*v3 -y3*v3 / \c21/ \v3/ 06243 * 06244 * where: 06245 * cij - matrix coefficients, c22 = 1 06246 */ 06247 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] ) 06248 { 06249 Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr()); 06250 double a[8][8], b[8]; 06251 Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b); 06252 06253 for( int i = 0; i < 4; ++i ) 06254 { 06255 a[i][0] = a[i+4][3] = src[i].x; 06256 a[i][1] = a[i+4][4] = src[i].y; 06257 a[i][2] = a[i+4][5] = 1; 06258 a[i][3] = a[i][4] = a[i][5] = 06259 a[i+4][0] = a[i+4][1] = a[i+4][2] = 0; 06260 a[i][6] = -src[i].x*dst[i].x; 06261 a[i][7] = -src[i].y*dst[i].x; 06262 a[i+4][6] = -src[i].x*dst[i].y; 06263 a[i+4][7] = -src[i].y*dst[i].y; 06264 b[i] = dst[i].x; 06265 b[i+4] = dst[i].y; 06266 } 06267 06268 solve( A, B, X, DECOMP_SVD ); 06269 M.ptr<double>()[8] = 1.; 06270 06271 return M; 06272 } 06273 06274 /* Calculates coefficients of affine transformation 06275 * which maps (xi,yi) to (ui,vi), (i=1,2,3): 06276 * 06277 * ui = c00*xi + c01*yi + c02 06278 * 06279 * vi = c10*xi + c11*yi + c12 06280 * 06281 * Coefficients are calculated by solving linear system: 06282 * / x0 y0 1 0 0 0 \ /c00\ /u0\ 06283 * | x1 y1 1 0 0 0 | |c01| |u1| 06284 * | x2 y2 1 0 0 0 | |c02| |u2| 06285 * | 0 0 0 x0 y0 1 | |c10| |v0| 06286 * | 0 0 0 x1 y1 1 | |c11| |v1| 06287 * \ 0 0 0 x2 y2 1 / |c12| |v2| 06288 * 06289 * where: 06290 * cij - matrix coefficients 06291 */ 06292 06293 
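/* Illustrative check of the system above (added note, not part of the original source):
 * mapping the unit triangle (x0,y0)=(0,0), (x1,y1)=(1,0), (x2,y2)=(0,1) to
 * (u0,v0), (u1,v1), (u2,v2) gives, by direct substitution into ui = c00*xi + c01*yi + c02
 * and vi = c10*xi + c11*yi + c12,
 *     c02 = u0,        c12 = v0,
 *     c00 = u1 - u0,   c10 = v1 - v0,
 *     c01 = u2 - u0,   c11 = v2 - v0,
 * which is exactly what the solve() call in getAffineTransform() below returns for that input.
 */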
cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] ) 06294 { 06295 Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr()); 06296 double a[6*6], b[6]; 06297 Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b); 06298 06299 for( int i = 0; i < 3; i++ ) 06300 { 06301 int j = i*12; 06302 int k = i*12+6; 06303 a[j] = a[k+3] = src[i].x; 06304 a[j+1] = a[k+4] = src[i].y; 06305 a[j+2] = a[k+5] = 1; 06306 a[j+3] = a[j+4] = a[j+5] = 0; 06307 a[k] = a[k+1] = a[k+2] = 0; 06308 b[i*2] = dst[i].x; 06309 b[i*2+1] = dst[i].y; 06310 } 06311 06312 solve( A, B, X ); 06313 return M; 06314 } 06315 06316 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM) 06317 { 06318 Mat matM = _matM.getMat(); 06319 CV_Assert(matM.rows == 2 && matM.cols == 3); 06320 __iM.create(2, 3, matM.type()); 06321 Mat _iM = __iM.getMat(); 06322 06323 if( matM.type() == CV_32F ) 06324 { 06325 const float* M = matM.ptr<float>(); 06326 float* iM = _iM.ptr<float>(); 06327 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0])); 06328 06329 double D = M[0]*M[step+1] - M[1]*M[step]; 06330 D = D != 0 ? 1./D : 0; 06331 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; 06332 double b1 = -A11*M[2] - A12*M[step+2]; 06333 double b2 = -A21*M[2] - A22*M[step+2]; 06334 06335 iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1; 06336 iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2; 06337 } 06338 else if( matM.type() == CV_64F ) 06339 { 06340 const double* M = matM.ptr<double>(); 06341 double* iM = _iM.ptr<double>(); 06342 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0])); 06343 06344 double D = M[0]*M[step+1] - M[1]*M[step]; 06345 D = D != 0 ? 1./D : 0; 06346 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; 06347 double b1 = -A11*M[2] - A12*M[step+2]; 06348 double b2 = -A21*M[2] - A22*M[step+2]; 06349 06350 iM[0] = A11; iM[1] = A12; iM[2] = b1; 06351 iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2; 06352 } 06353 else 06354 CV_Error( CV_StsUnsupportedFormat, "" ); 06355 } 06356 06357 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst) 06358 { 06359 Mat src = _src.getMat(), dst = _dst.getMat(); 06360 CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4); 06361 return getPerspectiveTransform((const Point2f *)src.data, (const Point2f *)dst.data); 06362 } 06363 06364 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst) 06365 { 06366 Mat src = _src.getMat(), dst = _dst.getMat(); 06367 CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3); 06368 return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data); 06369 } 06370 06371 CV_IMPL void 06372 cvResize( const CvArr* srcarr, CvArr* dstarr, int method ) 06373 { 06374 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 06375 CV_Assert( src.type() == dst.type() ); 06376 cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols, 06377 (double)dst.rows/src.rows, method ); 06378 } 06379 06380 06381 CV_IMPL void 06382 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr, 06383 int flags, CvScalar fillval ) 06384 { 06385 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 06386 cv::Mat matrix = cv::cvarrToMat(marr); 06387 CV_Assert( src.type() == dst.type() ); 06388 cv::warpAffine( src, dst, matrix, dst.size(), flags, 06389 (flags & CV_WARP_FILL_OUTLIERS) ? 
cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT, 06390 fillval ); 06391 } 06392 06393 CV_IMPL void 06394 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr, 06395 int flags, CvScalar fillval ) 06396 { 06397 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 06398 cv::Mat matrix = cv::cvarrToMat(marr); 06399 CV_Assert( src.type() == dst.type() ); 06400 cv::warpPerspective( src, dst, matrix, dst.size(), flags, 06401 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT, 06402 fillval ); 06403 } 06404 06405 CV_IMPL void 06406 cvRemap( const CvArr* srcarr, CvArr* dstarr, 06407 const CvArr* _mapx, const CvArr* _mapy, 06408 int flags, CvScalar fillval ) 06409 { 06410 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst; 06411 cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy); 06412 CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() ); 06413 cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX, 06414 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT, 06415 fillval ); 06416 CV_Assert( dst0.data == dst.data ); 06417 } 06418 06419 06420 CV_IMPL CvMat* 06421 cv2DRotationMatrix( CvPoint2D32f center, double angle, 06422 double scale, CvMat* matrix ) 06423 { 06424 cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale); 06425 CV_Assert( M.size() == M0.size() ); 06426 M.convertTo(M0, M0.type()); 06427 return matrix; 06428 } 06429 06430 06431 CV_IMPL CvMat* 06432 cvGetPerspectiveTransform( const CvPoint2D32f* src, 06433 const CvPoint2D32f* dst, 06434 CvMat* matrix ) 06435 { 06436 cv::Mat M0 = cv::cvarrToMat(matrix), 06437 M = cv::getPerspectiveTransform((const cv::Point2f *)src, (const cv::Point2f *)dst); 06438 CV_Assert( M.size() == M0.size() ); 06439 M.convertTo(M0, M0.type()); 06440 return matrix; 06441 } 06442 06443 06444 CV_IMPL CvMat* 06445 cvGetAffineTransform( const CvPoint2D32f* src, 06446 const CvPoint2D32f* dst, 06447 CvMat* matrix ) 06448 { 06449 cv::Mat M0 = cv::cvarrToMat(matrix), 06450 M = cv::getAffineTransform((const cv::Point2f *)src, (const cv::Point2f *)dst); 06451 CV_Assert( M.size() == M0.size() ); 06452 M.convertTo(M0, M0.type()); 06453 return matrix; 06454 } 06455 06456 06457 CV_IMPL void 06458 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 ) 06459 { 06460 cv::Mat map1 = cv::cvarrToMat(arr1), map2; 06461 cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2; 06462 06463 if( arr2 ) 06464 map2 = cv::cvarrToMat(arr2); 06465 if( dstarr2 ) 06466 { 06467 dstmap2 = cv::cvarrToMat(dstarr2); 06468 if( dstmap2.type() == CV_16SC1 ) 06469 dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step); 06470 } 06471 06472 cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false ); 06473 } 06474 06475 /****************************************************************************************\ 06476 * Log-Polar Transform * 06477 \****************************************************************************************/ 06478 06479 /* now it is done via Remap; more correct implementation should use 06480 some super-sampling technique outside of the "fovea" circle */ 06481 CV_IMPL void 06482 cvLogPolar( const CvArr* srcarr, CvArr* dstarr, 06483 CvPoint2D32f center, double M, int flags ) 06484 { 06485 cv::Ptr<CvMat> mapx, mapy; 06486 06487 CvMat srcstub, *src = cvGetMat(srcarr, &srcstub); 06488 CvMat dststub, *dst = cvGetMat(dstarr, &dststub); 06489 CvSize ssize, dsize; 
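    // Descriptive note (added): the forward branch fills mapx/mapy with
    // x = exp(rho/M)*cos(phi*2*pi/h) + cx, y = exp(rho/M)*sin(phi*2*pi/h) + cy;
    // the inverse branch uses rho = M*log(sqrt(dx^2+dy^2) + 1), phi = atan2(dy,dx)*h/(2*pi).
    // The actual resampling is delegated to cvRemap at the end of this function.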
/****************************************************************************************\
*                                  Log-Polar Transform                                  *
\****************************************************************************************/

/* now it is done via Remap; a more correct implementation should use
   some super-sampling technique outside of the "fovea" circle */
CV_IMPL void
cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
            CvPoint2D32f center, double M, int flags )
{
    cv::Ptr<CvMat> mapx, mapy;

    CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
    CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
    CvSize ssize, dsize;

    if( !CV_ARE_TYPES_EQ( src, dst ))
        CV_Error( CV_StsUnmatchedFormats, "" );

    if( M <= 0 )
        CV_Error( CV_StsOutOfRange, "M should be >0" );

    ssize = cvGetMatSize(src);
    dsize = cvGetMatSize(dst);

    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));

    if( !(flags & CV_WARP_INVERSE_MAP) )
    {
        int phi, rho;
        cv::AutoBuffer<double> _exp_tab(dsize.width);
        double* exp_tab = _exp_tab;

        for( rho = 0; rho < dst->width; rho++ )
            exp_tab[rho] = std::exp(rho/M);

        for( phi = 0; phi < dsize.height; phi++ )
        {
            double cp = cos(phi*2*CV_PI/dsize.height);
            double sp = sin(phi*2*CV_PI/dsize.height);
            float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
            float* my = (float*)(mapy->data.ptr + phi*mapy->step);

            for( rho = 0; rho < dsize.width; rho++ )
            {
                double r = exp_tab[rho];
                double x = r*cp + center.x;
                double y = r*sp + center.y;

                mx[rho] = (float)x;
                my[rho] = (float)y;
            }
        }
    }
    else
    {
        int x, y;
        CvMat bufx, bufy, bufp, bufa;
        double ascale = ssize.height/(2*CV_PI);
        cv::AutoBuffer<float> _buf(4*dsize.width);
        float* buf = _buf;

        bufx = cvMat( 1, dsize.width, CV_32F, buf );
        bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
        bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
        bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );

        for( x = 0; x < dsize.width; x++ )
            bufx.data.fl[x] = (float)x - center.x;

        for( y = 0; y < dsize.height; y++ )
        {
            float* mx = (float*)(mapx->data.ptr + y*mapx->step);
            float* my = (float*)(mapy->data.ptr + y*mapy->step);

            for( x = 0; x < dsize.width; x++ )
                bufy.data.fl[x] = (float)y - center.y;

#if 1
            cvCartToPolar( &bufx, &bufy, &bufp, &bufa );

            for( x = 0; x < dsize.width; x++ )
                bufp.data.fl[x] += 1.f;

            cvLog( &bufp, &bufp );

            for( x = 0; x < dsize.width; x++ )
            {
                double rho = bufp.data.fl[x]*M;
                double phi = bufa.data.fl[x]*ascale;

                mx[x] = (float)rho;
                my[x] = (float)phi;
            }
#else
            for( x = 0; x < dsize.width; x++ )
            {
                double xx = bufx.data.fl[x];
                double yy = bufy.data.fl[x];

                double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
                double a = atan2(yy,xx);
                if( a < 0 )
                    a = 2*CV_PI + a;
                a *= ascale;

                mx[x] = (float)p;
                my[x] = (float)a;
            }
#endif
        }
    }

    cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
}

void cv::logPolar( InputArray _src, OutputArray _dst,
                   Point2f center, double M, int flags )
{
    Mat src = _src.getMat();
    _dst.create( src.size(), src.type() );
    CvMat c_src = src, c_dst = _dst.getMat();
    cvLogPolar( &c_src, &c_dst, center, M, flags );
}
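/* A minimal sketch of calling cv::logPolar() as defined just above. The test image, its
   center and the magnitude scale M are illustrative assumptions; M is chosen so that the
   right-most destination column maps back to roughly maxRadius source pixels, following
   r = exp(rho/M) used by the forward map above. */
#include <cmath>
#include <algorithm>

static void log_polar_sketch()
{
    // Synthetic input: a filled circle on a black background.
    cv::Mat src(240, 320, CV_8UC1, cv::Scalar::all(0));
    cv::circle(src, cv::Point(160, 120), 60, cv::Scalar::all(255), -1);

    cv::Point2f center(src.cols * 0.5f, src.rows * 0.5f);
    double maxRadius = 0.5 * std::min(src.cols, src.rows);
    double M = src.cols / std::log(maxRadius);      // so exp(dst.cols/M) ~ maxRadius

    // Forward transform: rows sweep the angle, columns sweep log(radius).
    cv::Mat dst;
    cv::logPolar(src, dst, center, M, cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS);

    // Adding WARP_INVERSE_MAP maps back and reconstructs an approximation of the original.
    cv::Mat back;
    cv::logPolar(dst, back, center, M,
                 cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS + cv::WARP_INVERSE_MAP);
}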
/****************************************************************************************
                                 Linear-Polar Transform
  J.L. Blanco, Apr 2009
 ****************************************************************************************/
CV_IMPL
void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
                    CvPoint2D32f center, double maxRadius, int flags )
{
    cv::Ptr<CvMat> mapx, mapy;

    CvMat srcstub, *src = (CvMat*)srcarr;
    CvMat dststub, *dst = (CvMat*)dstarr;
    CvSize ssize, dsize;

    src = cvGetMat( srcarr, &srcstub,0,0 );
    dst = cvGetMat( dstarr, &dststub,0,0 );

    if( !CV_ARE_TYPES_EQ( src, dst ))
        CV_Error( CV_StsUnmatchedFormats, "" );

    ssize.width = src->cols;
    ssize.height = src->rows;
    dsize.width = dst->cols;
    dsize.height = dst->rows;

    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));

    if( !(flags & CV_WARP_INVERSE_MAP) )
    {
        int phi, rho;

        for( phi = 0; phi < dsize.height; phi++ )
        {
            double cp = cos(phi*2*CV_PI/dsize.height);
            double sp = sin(phi*2*CV_PI/dsize.height);
            float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
            float* my = (float*)(mapy->data.ptr + phi*mapy->step);

            for( rho = 0; rho < dsize.width; rho++ )
            {
                double r = maxRadius*(rho+1)/dsize.width;
                double x = r*cp + center.x;
                double y = r*sp + center.y;

                mx[rho] = (float)x;
                my[rho] = (float)y;
            }
        }
    }
    else
    {
        int x, y;
        CvMat bufx, bufy, bufp, bufa;
        const double ascale = ssize.height/(2*CV_PI);
        const double pscale = ssize.width/maxRadius;

        cv::AutoBuffer<float> _buf(4*dsize.width);
        float* buf = _buf;

        bufx = cvMat( 1, dsize.width, CV_32F, buf );
        bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
        bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
        bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );

        for( x = 0; x < dsize.width; x++ )
            bufx.data.fl[x] = (float)x - center.x;

        for( y = 0; y < dsize.height; y++ )
        {
            float* mx = (float*)(mapx->data.ptr + y*mapx->step);
            float* my = (float*)(mapy->data.ptr + y*mapy->step);

            for( x = 0; x < dsize.width; x++ )
                bufy.data.fl[x] = (float)y - center.y;

            cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );

            for( x = 0; x < dsize.width; x++ )
                bufp.data.fl[x] += 1.f;

            for( x = 0; x < dsize.width; x++ )
            {
                double rho = bufp.data.fl[x]*pscale;
                double phi = bufa.data.fl[x]*ascale;
                mx[x] = (float)rho;
                my[x] = (float)phi;
            }
        }
    }

    cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
}

void cv::linearPolar( InputArray _src, OutputArray _dst,
                      Point2f center, double maxRadius, int flags )
{
    Mat src = _src.getMat();
    _dst.create( src.size(), src.type() );
    CvMat c_src = src, c_dst = _dst.getMat();
    cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );
}
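/* A minimal sketch of calling cv::linearPolar() as defined just above, relying on the same
   headers as the sketches earlier in this file. The test image, center and maxRadius are
   illustrative assumptions. Unlike logPolar, destination columns sample the radius linearly
   (r = maxRadius*(rho+1)/width), as in the forward map above. */
static void linear_polar_sketch()
{
    // Synthetic BGR input with a red ring around the assumed center.
    cv::Mat src(240, 320, CV_8UC3, cv::Scalar(40, 40, 40));
    cv::circle(src, cv::Point(160, 120), 80, cv::Scalar(0, 0, 255), 3);

    cv::Point2f center(src.cols * 0.5f, src.rows * 0.5f);
    double maxRadius = 0.5 * std::min(src.cols, src.rows);

    // Rows of dst sweep the angle 0..2*pi, columns sweep the radius 0..maxRadius.
    cv::Mat dst;
    cv::linearPolar(src, dst, center, maxRadius,
                    cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS);

    // The inverse mapping restores an approximation of the original image.
    cv::Mat back;
    cv::linearPolar(dst, back, center, maxRadius,
                    cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS + cv::WARP_INVERSE_MAP);
}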
/* End of file. */
