Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update


imgwarp.cpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{
#if IPP_VERSION_X100 >= 710
    typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
    typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
    typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
    typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

    template <int channels, typename Type>
    bool IPPSetSimple(cv::Scalar  value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
    {
        Type values[channels];
        for( int i = 0; i < channels; i++ )
            values[i] = saturate_cast<Type>(value[i]);
        return func(values, dataPointer, step, size) >= 0;
    }

    static bool IPPSet(const cv::Scalar  &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
    {
        if( channels == 1 )
        {
            switch( depth )
            {
            case CV_8U:
                return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
            case CV_16U:
                return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
            case CV_32F:
                return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
            }
        }
        else
        {
            if( channels == 3 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
                case CV_16U:
                    return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
                case CV_32F:
                    return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
                }
            }
            else if( channels == 4 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
                case CV_16U:
                    return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
                case CV_32F:
                    return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
                }
            }
        }
        return false;
    }
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
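
// These constants define the fixed-point formats used throughout this file:
// resize weights carry INTER_RESIZE_COEF_BITS = 11 fraction bits and remap
// weights INTER_REMAP_COEF_BITS = 15, so weighted sums can be accumulated in
// integer registers and shifted back down instead of doing per-pixel
// floating-point work.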

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];

static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
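
// interpolateCubic is the standard cubic convolution kernel with A = -0.75
// (the variant OpenCV uses); for any fractional offset x in [0,1) the four
// taps sum to 1, which the coeffs[3] line enforces explicitly.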

static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}
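
// interpolateLanczos4 produces the 8-tap Lanczos (a = 4) windowed-sinc
// weights. Instead of evaluating sin() per tap, it computes sin/cos once at
// y0 and derives the remaining taps through the cs[][] table of 45-degree
// phase factors; the final loop renormalizes the taps so they sum to
// exactly 1.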

static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}


static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}
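
// initInterTab2D builds, once per method, the weights for every one of the
// INTER_TAB_SIZE x INTER_TAB_SIZE sub-pixel offsets as outer products of the
// 1-D kernels above. The short tables are then patched so each ksize*ksize
// block sums to exactly INTER_REMAP_COEF_SCALE: any rounding residue is
// folded into the largest (or smallest) central weight, which keeps the
// fixed-point remap path from introducing a small DC bias on flat regions.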

#ifndef __MINGW32__
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
#endif

template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};
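
// Cast and FixedPtCast are the output-conversion functors used by the resize
// kernels below. FixedPtCast adds DELTA = 2^(bits-1) before shifting right by
// SHIFT bits, so the fixed-point accumulator is rounded to nearest rather
// than simply truncated.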

/****************************************************************************************\
*                                         Resize                                         *
\****************************************************************************************/

class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    for( int k = 0; k < pix_size4; k++ )
                        _tD[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs;
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
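
// resizeNN computes the source byte offset for every destination column once
// (x_ofs), so the per-row work in resizeNNInvoker reduces to straight
// copying, specialized by pixel size. Rows are distributed across threads
// with parallel_for_; the dst.total()/(double)(1<<16) argument requests
// roughly one parallel stripe per 64K destination pixels.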


struct VResizeNoVec
{
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};

struct HResizeNoVec
{
    int operator()(const uchar**, uchar**, int, const int*,
        const uchar*, int, int, int, int, int) const { return 0; }
};

#if CV_SSE2

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }

        return x;
    }
};
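
// Fixed-point bookkeeping for the SSE2 kernel above: the intermediate rows
// carry INTER_RESIZE_COEF_BITS = 11 fraction bits and the 16-bit betas
// another 11, so each row*beta product holds 22. The >>4 before packing to
// 16-bit, the implicit >>16 of _mm_mulhi_epi16, and the final (+2)>>2 remove
// exactly those 22 bits, with the +2 rounding to nearest before the
// saturating pack to bytes.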


template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128 x0, y0;
            __m128i t0;
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }

        return x;
    }
};


struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
            b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }

        return x;
    }
};
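
// Unlike the linear path above, this cubic 32s->8u pass converts the row sums
// to float before applying the betas; cubic weights have negative lobes, so
// the mulhi-based all-integer scheme presumably does not carry over, and the
// betas are instead folded together with the 1/INTER_RESIZE_COEF_SCALE^2
// descaling factor ('scale').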


template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }

        return x;
    }
};

#if CV_SSE4_1

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;

#endif
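
// The fallback above exists because _mm_packus_epi32 (pack signed 32-bit to
// unsigned 16-bit with saturation) is an SSE4.1 instruction; without SSE4.1
// the unsigned 16-bit Lanczos variant drops back to the scalar VResizeNoVec
// path, while the signed variant below gets by with the SSE2 _mm_packs_epi32.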

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};


struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            _mm_storeu_ps(dst + x, v_dst);
        }

        return x;
    }
};


#elif CV_NEON

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int x = 0;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
        {
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        short* dst = (short*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
        }

        return x;
    }
};

typedef VResizeNoVec VResizeCubicVec_32s8u;

struct VResizeCubicVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                       v_b1, vld1q_f32(S1 + x)),
                                                                       v_b2, vld1q_f32(S2 + x)),
                                                                       v_b3, vld1q_f32(S3 + x)));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                          v_b1, vld1q_f32(S1 + x + 4)),
                                                                          v_b2, vld1q_f32(S2 + x + 4)),
                                                                          v_b3, vld1q_f32(S3 + x + 4)));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        ushort * dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
01304         {
01305             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
01306                                                                          v_b1, vld1q_f32(S1 + x)),
01307                                                                          v_b2, vld1q_f32(S2 + x)),
01308                                                                          v_b3, vld1q_f32(S3 + x));
01309             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
01310                                                                          v_b5, vld1q_f32(S5 + x)),
01311                                                                          v_b6, vld1q_f32(S6 + x)),
01312                                                                          v_b7, vld1q_f32(S7 + x));
01313             float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
01314 
01315             v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
01316                                                              v_b1, vld1q_f32(S1 + x + 4)),
01317                                                              v_b2, vld1q_f32(S2 + x + 4)),
01318                                                              v_b3, vld1q_f32(S3 + x + 4));
01319             v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
01320                                                              v_b5, vld1q_f32(S5 + x + 4)),
01321                                                              v_b6, vld1q_f32(S6 + x + 4)),
01322                                                              v_b7, vld1q_f32(S7 + x + 4));
01323             v_dst1 = vaddq_f32(v_dst0, v_dst1);
01324 
01325             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
01326                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
01327         }
01328 
01329         return x;
01330     }
01331 };
01332 
01333 struct VResizeLanczos4Vec_32f
01334 {
01335     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
01336     {
01337         const float** src = (const float**)_src;
01338         const float* beta = (const float*)_beta;
01339         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
01340                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
01341         float* dst = (float*)_dst;
01342         int x = 0;
01343         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
01344                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
01345                     v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
01346                     v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
01347 
01348         for( ; x <= width - 4; x += 4 )
01349         {
01350             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
01351                                                                          v_b1, vld1q_f32(S1 + x)),
01352                                                                          v_b2, vld1q_f32(S2 + x)),
01353                                                                          v_b3, vld1q_f32(S3 + x));
01354             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
01355                                                                          v_b5, vld1q_f32(S5 + x)),
01356                                                                          v_b6, vld1q_f32(S6 + x)),
01357                                                                          v_b7, vld1q_f32(S7 + x));
01358             vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
01359         }
01360 
01361         return x;
01362     }
01363 };
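
#include <climits>
// Scalar model of the final round-and-saturate step in the 16-bit paths above
// (cv_vrndq_s32_f32 followed by vqmovn_s16/vqmovn_s32): round each float lane
// to the nearest integer, then clamp into the 16-bit range. This is what
// cv::saturate_cast<short> does for floats; shown here as an illustrative
// sketch, not part of the original file.
static short round_saturate_to_s16(float v)
{
    int r = cvRound(v);                                // round to nearest
    return r < SHRT_MIN ? (short)SHRT_MIN :
           r > SHRT_MAX ? (short)SHRT_MAX : (short)r;  // saturate to int16
}
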
01364 
01365 #else
01366 
01367 typedef VResizeNoVec VResizeLinearVec_32s8u;
01368 typedef VResizeNoVec VResizeLinearVec_32f16u;
01369 typedef VResizeNoVec VResizeLinearVec_32f16s;
01370 typedef VResizeNoVec VResizeLinearVec_32f;
01371 
01372 typedef VResizeNoVec VResizeCubicVec_32s8u;
01373 typedef VResizeNoVec VResizeCubicVec_32f16u;
01374 typedef VResizeNoVec VResizeCubicVec_32f16s;
01375 typedef VResizeNoVec VResizeCubicVec_32f;
01376 
01377 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
01378 typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
01379 typedef VResizeNoVec VResizeLanczos4Vec_32f;
01380 
01381 #endif
01382 
01383 typedef HResizeNoVec HResizeLinearVec_8u32s;
01384 typedef HResizeNoVec HResizeLinearVec_16u32f;
01385 typedef HResizeNoVec HResizeLinearVec_16s32f;
01386 typedef HResizeNoVec HResizeLinearVec_32f;
01387 typedef HResizeNoVec HResizeLinearVec_64f;
01388 
01389 
01390 template<typename T, typename WT, typename AT, int ONE, class VecOp>
01391 struct HResizeLinear
01392 {
01393     typedef T value_type;
01394     typedef WT buf_type;
01395     typedef AT alpha_type;
01396 
01397     void operator()(const T** src, WT** dst, int count,
01398                     const int* xofs, const AT* alpha,
01399                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
01400     {
01401         int dx, k;
01402         VecOp vecOp;
01403 
01404         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
01405             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
01406 
01407         for( k = 0; k <= count - 2; k++ )
01408         {
01409             const T *S0 = src[k], *S1 = src[k+1];
01410             WT *D0 = dst[k], *D1 = dst[k+1];
01411             for( dx = dx0; dx < xmax; dx++ )
01412             {
01413                 int sx = xofs[dx];
01414                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
01415                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
01416                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
01417                 D0[dx] = t0; D1[dx] = t1;
01418             }
01419 
01420             for( ; dx < dwidth; dx++ )
01421             {
01422                 int sx = xofs[dx];
01423                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
01424             }
01425         }
01426 
01427         for( ; k < count; k++ )
01428         {
01429             const T *S = src[k];
01430             WT *D = dst[k];
01431             for( dx = 0; dx < xmax; dx++ )
01432             {
01433                 int sx = xofs[dx];
01434                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
01435             }
01436 
01437             for( ; dx < dwidth; dx++ )
01438                 D[dx] = WT(S[xofs[dx]]*ONE);
01439         }
01440     }
01441 };
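
// HResizeLinear above walks two source rows per iteration so each coefficient
// pair is fetched once for a pair of output rows. A one-row sketch of the
// horizontal pass (illustrative, hypothetical float buffers): xofs[dx] is the
// left source index, already premultiplied by the channel count, and alpha
// stores one coefficient pair per destination pixel.
static void hresize_linear_row_sketch(const float* S, float* D, const int* xofs,
                                      const float* alpha, int dwidth, int cn)
{
    for( int dx = 0; dx < dwidth; dx++ )
    {
        int sx = xofs[dx];
        D[dx] = S[sx]*alpha[dx*2] + S[sx + cn]*alpha[dx*2 + 1];
    }
}
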
01442 
01443 
01444 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
01445 struct VResizeLinear
01446 {
01447     typedef T value_type;
01448     typedef WT buf_type;
01449     typedef AT alpha_type;
01450 
01451     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
01452     {
01453         WT b0 = beta[0], b1 = beta[1];
01454         const WT *S0 = src[0], *S1 = src[1];
01455         CastOp castOp;
01456         VecOp vecOp;
01457 
01458         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01459         #if CV_ENABLE_UNROLLED
01460         for( ; x <= width - 4; x += 4 )
01461         {
01462             WT t0, t1;
01463             t0 = S0[x]*b0 + S1[x]*b1;
01464             t1 = S0[x+1]*b0 + S1[x+1]*b1;
01465             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
01466             t0 = S0[x+2]*b0 + S1[x+2]*b1;
01467             t1 = S0[x+3]*b0 + S1[x+3]*b1;
01468             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
01469         }
01470         #endif
01471         for( ; x < width; x++ )
01472             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
01473     }
01474 };
01475 
01476 template<>
01477 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
01478 {
01479     typedef uchar value_type;
01480     typedef int buf_type;
01481     typedef short alpha_type;
01482 
01483     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
01484     {
01485         alpha_type b0 = beta[0], b1 = beta[1];
01486         const buf_type *S0 = src[0], *S1 = src[1];
01487         VResizeLinearVec_32s8u vecOp;
01488 
01489         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01490         #if CV_ENABLE_UNROLLED
01491         for( ; x <= width - 4; x += 4 )
01492         {
01493             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
01494             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
01495             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
01496             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
01497         }
01498         #endif
01499         for( ; x < width; x++ )
01500             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
01501     }
01502 };
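
#include <cassert>
// Worked check of the fixed-point descale in the specialization above (an
// illustrative sketch): with INTER_RESIZE_COEF_BITS == 11, the horizontal and
// vertical coefficients each carry a 2^11 scale, i.e. 22 fractional bits in
// total. Splitting the shift as (>>4, >>16, then (+2)>>2) removes those same
// 22 bits with rounding while keeping every intermediate product in 32 bits.
static void check_fixedpt_descale()
{
    const int COEF = 1 << 11;                     // INTER_RESIZE_COEF_SCALE
    int S0 = 100 * COEF, S1 = 200 * COEF;         // horizontal-pass outputs
    short b0 = COEF / 4, b1 = COEF - COEF / 4;    // vertical weights 0.25/0.75
    int v = ((b0 * (S0 >> 4)) >> 16) + ((b1 * (S1 >> 4)) >> 16);
    assert( ((v + 2) >> 2) == 175 );              // 0.25*100 + 0.75*200
}
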
01503 
01504 
01505 template<typename T, typename WT, typename AT>
01506 struct HResizeCubic
01507 {
01508     typedef T value_type;
01509     typedef WT buf_type;
01510     typedef AT alpha_type;
01511 
01512     void operator()(const T** src, WT** dst, int count,
01513                     const int* xofs, const AT* alpha,
01514                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
01515     {
01516         for( int k = 0; k < count; k++ )
01517         {
01518             const T *S = src[k];
01519             WT *D = dst[k];
01520             int dx = 0, limit = xmin;
01521             for(;;)
01522             {
01523                 for( ; dx < limit; dx++, alpha += 4 )
01524                 {
01525                     int j, sx = xofs[dx] - cn;
01526                     WT v = 0;
01527                     for( j = 0; j < 4; j++ )
01528                     {
01529                         int sxj = sx + j*cn;
01530                         if( (unsigned)sxj >= (unsigned)swidth )
01531                         {
01532                             while( sxj < 0 )
01533                                 sxj += cn;
01534                             while( sxj >= swidth )
01535                                 sxj -= cn;
01536                         }
01537                         v += S[sxj]*alpha[j];
01538                     }
01539                     D[dx] = v;
01540                 }
01541                 if( limit == dwidth )
01542                     break;
01543                 for( ; dx < xmax; dx++, alpha += 4 )
01544                 {
01545                     int sx = xofs[dx];
01546                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
01547                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
01548                 }
01549                 limit = dwidth;
01550             }
01551             alpha -= dwidth*4;
01552         }
01553     }
01554 };
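
// Sketch of the per-channel border handling used by the cubic (and Lanczos)
// horizontal loops above: an out-of-range tap is stepped back into the row one
// channel stride at a time, which replicates the edge pixel of the same
// channel. Illustrative helper, not part of the original file:
static int wrap_tap_to_row(int sxj, int swidth, int cn)
{
    while( sxj < 0 )
        sxj += cn;
    while( sxj >= swidth )
        sxj -= cn;
    return sxj;
}
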
01555 
01556 
01557 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
01558 struct VResizeCubic
01559 {
01560     typedef T value_type;
01561     typedef WT buf_type;
01562     typedef AT alpha_type;
01563 
01564     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
01565     {
01566         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
01567         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
01568         CastOp castOp;
01569         VecOp vecOp;
01570 
01571         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01572         for( ; x < width; x++ )
01573             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
01574     }
01575 };
01576 
01577 
01578 template<typename T, typename WT, typename AT>
01579 struct HResizeLanczos4
01580 {
01581     typedef T value_type;
01582     typedef WT buf_type;
01583     typedef AT alpha_type;
01584 
01585     void operator()(const T** src, WT** dst, int count,
01586                     const int* xofs, const AT* alpha,
01587                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
01588     {
01589         for( int k = 0; k < count; k++ )
01590         {
01591             const T *S = src[k];
01592             WT *D = dst[k];
01593             int dx = 0, limit = xmin;
01594             for(;;)
01595             {
01596                 for( ; dx < limit; dx++, alpha += 8 )
01597                 {
01598                     int j, sx = xofs[dx] - cn*3;
01599                     WT v = 0;
01600                     for( j = 0; j < 8; j++ )
01601                     {
01602                         int sxj = sx + j*cn;
01603                         if( (unsigned)sxj >= (unsigned)swidth )
01604                         {
01605                             while( sxj < 0 )
01606                                 sxj += cn;
01607                             while( sxj >= swidth )
01608                                 sxj -= cn;
01609                         }
01610                         v += S[sxj]*alpha[j];
01611                     }
01612                     D[dx] = v;
01613                 }
01614                 if( limit == dwidth )
01615                     break;
01616                 for( ; dx < xmax; dx++, alpha += 8 )
01617                 {
01618                     int sx = xofs[dx];
01619                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
01620                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
01621                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
01622                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
01623                 }
01624                 limit = dwidth;
01625             }
01626             alpha -= dwidth*8;
01627         }
01628     }
01629 };
01630 
01631 
01632 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
01633 struct VResizeLanczos4
01634 {
01635     typedef T value_type;
01636     typedef WT buf_type;
01637     typedef AT alpha_type;
01638 
01639     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
01640     {
01641         CastOp castOp;
01642         VecOp vecOp;
01643         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01644         #if CV_ENABLE_UNROLLED
01645         for( ; x <= width - 4; x += 4 )
01646         {
01647             WT b = beta[0];
01648             const WT* S = src[0];
01649             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
01650 
01651             for( k = 1; k < 8; k++ )
01652             {
01653                 b = beta[k]; S = src[k];
01654                 s0 += S[x]*b; s1 += S[x+1]*b;
01655                 s2 += S[x+2]*b; s3 += S[x+3]*b;
01656             }
01657 
01658             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
01659             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
01660         }
01661         #endif
01662         for( ; x < width; x++ )
01663         {
01664             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
01665                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
01666                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
01667         }
01668     }
01669 };
01670 
01671 
01672 static inline int clip(int x, int a, int b)
01673 {
01674     return x >= a ? (x < b ? x : b-1) : a;
01675 }
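
// Note that clip() clamps to the half-open range [a, b): clip(-3, 0, 480)
// yields 0 and clip(480, 0, 480) yields 479, so a computed source row index
// can never step outside the image.
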
01676 
01677 static const int MAX_ESIZE=16;
01678 
01679 template <typename HResize, typename VResize>
01680 class resizeGeneric_Invoker :
01681     public ParallelLoopBody
01682 {
01683 public:
01684     typedef typename HResize::value_type T;
01685     typedef typename HResize::buf_type WT;
01686     typedef typename HResize::alpha_type AT;
01687 
01688     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
01689         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
01690         int _ksize, int _xmin, int _xmax) :
01691         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
01692         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
01693         ksize(_ksize), xmin(_xmin), xmax(_xmax)
01694     {
01695         CV_Assert(ksize <= MAX_ESIZE);
01696     }
01697 
01698 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
01699 # pragma GCC diagnostic push
01700 # pragma GCC diagnostic ignored "-Warray-bounds"
01701 #endif
01702     virtual void operator() (const Range& range) const
01703     {
01704         int dy, cn = src.channels();
01705         HResize hresize;
01706         VResize vresize;
01707 
01708         int bufstep = (int)alignSize(dsize.width, 16);
01709         AutoBuffer<WT> _buffer(bufstep*ksize);
01710         const T* srows[MAX_ESIZE]={0};
01711         WT* rows[MAX_ESIZE]={0};
01712         int prev_sy[MAX_ESIZE];
01713 
01714         for(int k = 0; k < ksize; k++ )
01715         {
01716             prev_sy[k] = -1;
01717             rows[k] = (WT*)_buffer + bufstep*k;
01718         }
01719 
01720         const AT* beta = _beta + ksize * range.start;
01721 
01722         for( dy = range.start; dy < range.end; dy++, beta += ksize )
01723         {
01724             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
01725 
01726             for(int k = 0; k < ksize; k++ )
01727             {
01728                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
01729                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
01730                 {
01731                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
01732                     {
01733                         if( k1 > k )
01734                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
01735                         break;
01736                     }
01737                 }
01738                 if( k1 == ksize )
01739                     k0 = std::min(k0, k); // remember the first row that needs to be computed
01740                 srows[k] = src.template ptr<T>(sy);
01741                 prev_sy[k] = sy;
01742             }
01743 
01744             if( k0 < ksize )
01745                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
01746                         ssize.width, dsize.width, cn, xmin, xmax );
01747             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
01748         }
01749     }
01750 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
01751 # pragma GCC diagnostic pop
01752 #endif
01753 
01754 private:
01755     Mat src;
01756     Mat dst;
01757     const int* xofs, *yofs;
01758     const AT* alpha, *_beta;
01759     Size ssize, dsize;
01760     const int ksize, xmin, xmax;
01761 
01762     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
01763 };
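
// Minimal sketch of the row-reuse scheme the invoker above implements
// (illustrative, fixed ksize): the source index of each cached intermediate
// row is kept in prev_sy[], and a row is recomputed only when no slot already
// holds it, so sliding the kernel window down by one row usually recomputes
// just the newly exposed row.
struct RowReuseSketch
{
    enum { KSIZE = 4 };        // e.g. bicubic: 4 source rows per output row
    int prev_sy[KSIZE];
    RowReuseSketch() { for( int k = 0; k < KSIZE; k++ ) prev_sy[k] = -1; }

    // returns the slot k1 >= k that already holds source row sy, or -1 if
    // the row must be computed from scratch
    int find(int k, int sy) const
    {
        for( int k1 = k; k1 < KSIZE; k1++ )
            if( prev_sy[k1] == sy )
                return k1;
        return -1;
    }
};
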
01764 
01765 template<class HResize, class VResize>
01766 static void resizeGeneric_( const Mat& src, Mat& dst,
01767                             const int* xofs, const void* _alpha,
01768                             const int* yofs, const void* _beta,
01769                             int xmin, int xmax, int ksize )
01770 {
01771     typedef typename HResize::alpha_type AT;
01772 
01773     const AT* beta = (const AT*)_beta;
01774     Size ssize = src.size(), dsize = dst.size();
01775     int cn = src.channels();
01776     ssize.width *= cn;
01777     dsize.width *= cn;
01778     xmin *= cn;
01779     xmax *= cn;
01780     // image resize is a separable operation: rows are first resized horizontally into a buffer, then the buffered rows are interpolated vertically.
01781 
01782     Range range(0, dsize.height);
01783     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
01784         ssize, dsize, ksize, xmin, xmax);
01785     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
01786 }
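
#include <vector>
#include <cmath>
// End-to-end sketch of the separable strategy driven above (illustrative:
// one channel, float intermediate, source at least 2x2). A horizontal pass
// fills a dwidth x sheight buffer, then a vertical pass blends pairs of
// buffered rows -- the same split the HResize*/VResize* functors implement.
static void resize_bilinear_sketch(const uchar* src, int sw, int sh,
                                   uchar* dst, int dw, int dh)
{
    std::vector<float> tmp((size_t)dw * sh);
    float scale_x = (float)sw/dw, scale_y = (float)sh/dh;

    for( int y = 0; y < sh; y++ )          // horizontal pass
        for( int x = 0; x < dw; x++ )
        {
            float fx = (x + 0.5f)*scale_x - 0.5f;
            int x0 = (int)std::floor(fx);
            float a = fx - x0;
            if( x0 < 0 ) { x0 = 0; a = 0.f; }
            if( x0 >= sw - 1 ) { x0 = sw - 2; a = 1.f; }
            tmp[(size_t)y*dw + x] = src[y*sw + x0]*(1.f - a) + src[y*sw + x0 + 1]*a;
        }

    for( int y = 0; y < dh; y++ )          // vertical pass
    {
        float fy = (y + 0.5f)*scale_y - 0.5f;
        int y0 = (int)std::floor(fy);
        float b = fy - y0;
        if( y0 < 0 ) { y0 = 0; b = 0.f; }
        if( y0 >= sh - 1 ) { y0 = sh - 2; b = 1.f; }
        for( int x = 0; x < dw; x++ )
            dst[y*dw + x] = (uchar)(tmp[(size_t)y0*dw + x]*(1.f - b) +
                                    tmp[(size_t)(y0 + 1)*dw + x]*b + 0.5f);
    }
}
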
01787 
01788 template <typename T, typename WT>
01789 struct ResizeAreaFastNoVec
01790 {
01791     ResizeAreaFastNoVec(int, int) { }
01792     ResizeAreaFastNoVec(int, int, int, int) { }
01793     int operator() (const T*, T*, int) const
01794     { return 0; }
01795 };
01796 
01797 #if CV_NEON
01798 
01799 class ResizeAreaFastVec_SIMD_8u
01800 {
01801 public:
01802     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
01803         cn(_cn), step(_step)
01804     {
01805     }
01806 
01807     int operator() (const uchar* S, uchar* D, int w) const
01808     {
01809         int dx = 0;
01810         const uchar* S0 = S, * S1 = S0 + step;
01811 
01812         uint16x8_t v_2 = vdupq_n_u16(2);
01813 
01814         if (cn == 1)
01815         {
01816             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
01817             {
01818                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
01819 
01820                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
01821                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
01822                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
01823 
01824                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
01825                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
01826                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
01827 
01828                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
01829             }
01830         }
01831         else if (cn == 4)
01832         {
01833             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
01834             {
01835                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
01836 
01837                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
01838                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
01839                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
01840                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
01841 
01842                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
01843                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
01844                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
01845                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
01846                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
01847 
01848                 vst1_u8(D, vmovn_u16(v_dst));
01849             }
01850         }
01851 
01852         return dx;
01853     }
01854 
01855 private:
01856     int cn, step;
01857 };
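
// Scalar equivalent of the 2x2 rounded average the NEON path above computes
// for cn == 1, after vld2q_u8 has de-interleaved even and odd columns so that
// each lane pair is a horizontal neighbour (illustrative sketch):
static uchar avg2x2_scalar(const uchar* row0, const uchar* row1, int dx)
{
    int s = row0[dx*2] + row0[dx*2 + 1] + row1[dx*2] + row1[dx*2 + 1];
    return (uchar)((s + 2) >> 2);   // +2 gives round-to-nearest before >> 2
}
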
01858 
01859 class ResizeAreaFastVec_SIMD_16u
01860 {
01861 public:
01862     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
01863         cn(_cn), step(_step)
01864     {
01865     }
01866 
01867     int operator() (const ushort * S, ushort * D, int w) const
01868     {
01869         int dx = 0;
01870         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
01871 
01872         uint32x4_t v_2 = vdupq_n_u32(2);
01873 
01874         if (cn == 1)
01875         {
01876             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
01877             {
01878                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
01879 
01880                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
01881                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
01882                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
01883 
01884                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
01885                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
01886                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
01887 
01888                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
01889             }
01890         }
01891         else if (cn == 4)
01892         {
01893             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01894             {
01895                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
01896                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
01897                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
01898                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
01899             }
01900         }
01901 
01902         return dx;
01903     }
01904 
01905 private:
01906     int cn, step;
01907 };
01908 
01909 class ResizeAreaFastVec_SIMD_16s
01910 {
01911 public:
01912     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
01913         cn(_cn), step(_step)
01914     {
01915     }
01916 
01917     int operator() (const short * S, short * D, int w) const
01918     {
01919         int dx = 0;
01920         const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
01921 
01922         int32x4_t v_2 = vdupq_n_s32(2);
01923 
01924         if (cn == 1)
01925         {
01926             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
01927             {
01928                 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
01929 
01930                 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
01931                 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
01932                 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
01933 
01934                 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
01935                 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
01936                 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
01937 
01938                 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
01939             }
01940         }
01941         else if (cn == 4)
01942         {
01943             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01944             {
01945                 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
01946                 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
01947                                             vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
01948                 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
01949             }
01950         }
01951 
01952         return dx;
01953     }
01954 
01955 private:
01956     int cn, step;
01957 };
01958 
01959 struct ResizeAreaFastVec_SIMD_32f
01960 {
01961     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
01962         cn(_cn), step(_step)
01963     {
01964         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
01965     }
01966 
01967     int operator() (const float * S, float * D, int w) const
01968     {
01969         if (!fast_mode)
01970             return 0;
01971 
01972         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
01973         int dx = 0;
01974 
01975         float32x4_t v_025 = vdupq_n_f32(0.25f);
01976 
01977         if (cn == 1)
01978         {
01979             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01980             {
01981                 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
01982 
01983                 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
01984                 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
01985 
01986                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
01987             }
01988         }
01989         else if (cn == 4)
01990         {
01991             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01992             {
01993                 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
01994                 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
01995 
01996                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
01997             }
01998         }
01999 
02000         return dx;
02001     }
02002 
02003 private:
02004     int cn;
02005     bool fast_mode;
02006     int step;
02007 };
02008 
02009 #elif CV_SSE2
02010 
02011 class ResizeAreaFastVec_SIMD_8u
02012 {
02013 public:
02014     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
02015         cn(_cn), step(_step)
02016     {
02017         use_simd = checkHardwareSupport(CV_CPU_SSE2);
02018     }
02019 
02020     int operator() (const uchar* S, uchar* D, int w) const
02021     {
02022         if (!use_simd)
02023             return 0;
02024 
02025         int dx = 0;
02026         const uchar* S0 = S;
02027         const uchar* S1 = S0 + step;
02028         __m128i zero = _mm_setzero_si128();
02029         __m128i delta2 = _mm_set1_epi16(2);
02030 
02031         if (cn == 1)
02032         {
02033             __m128i masklow = _mm_set1_epi16(0x00ff);
02034             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
02035             {
02036                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02037                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02038 
02039                 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
02040                 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
02041                 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
02042                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
02043 
02044                 _mm_storel_epi64((__m128i*)D, s0);
02045             }
02046         }
02047         else if (cn == 3)
02048             for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
02049             {
02050                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02051                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02052 
02053                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
02054                 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
02055                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
02056                 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
02057 
02058                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
02059                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
02060                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02061                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
02062                 _mm_storel_epi64((__m128i*)D, s0);
02063 
02064                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
02065                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
02066                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02067                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
02068                 _mm_storel_epi64((__m128i*)(D+3), s0);
02069             }
02070         else
02071         {
02072             CV_Assert(cn == 4);
02073             int v[] = { 0, 0, -1, -1 };
02074             __m128i mask = _mm_loadu_si128((const __m128i*)v);
02075 
02076             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
02077             {
02078                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02079                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02080 
02081                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
02082                 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
02083                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
02084                 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
02085 
02086                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
02087                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
02088                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02089                 __m128i res0 = _mm_srli_epi16(s0, 2);
02090 
02091                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
02092                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
02093                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02094                 __m128i res1 = _mm_srli_epi16(s0, 2);
02095                 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
02096                                                    _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
02097                 _mm_storel_epi64((__m128i*)(D), s0);
02098             }
02099         }
02100 
02101         return dx;
02102     }
02103 
02104 private:
02105     int cn;
02106     bool use_simd;
02107     int step;
02108 };
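
// Scalar model of the masklow trick in the SSE2 cn == 1 branch above: each
// 16-bit lane of r0/r1 holds two neighbouring 8-bit pixels, so the pair sum
// is (lane >> 8) + (lane & 0x00ff), and the rounded 2x2 mean follows as
// (s0 + s1 + 2) >> 2. Illustrative sketch, not part of the original file:
static uchar avg2x2_from_lanes(unsigned short lane0, unsigned short lane1)
{
    int s0 = (lane0 >> 8) + (lane0 & 0x00ff);   // top-row pixel pair
    int s1 = (lane1 >> 8) + (lane1 & 0x00ff);   // bottom-row pixel pair
    return (uchar)((s0 + s1 + 2) >> 2);
}
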
02109 
02110 class ResizeAreaFastVec_SIMD_16u
02111 {
02112 public:
02113     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
02114         cn(_cn), step(_step)
02115     {
02116         use_simd = checkHardwareSupport(CV_CPU_SSE2);
02117     }
02118 
02119     int operator() (const ushort* S, ushort* D, int w) const
02120     {
02121         if (!use_simd)
02122             return 0;
02123 
02124         int dx = 0;
02125         const ushort* S0 = (const ushort*)S;
02126         const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
02127         __m128i masklow = _mm_set1_epi32(0x0000ffff);
02128         __m128i zero = _mm_setzero_si128();
02129         __m128i delta2 = _mm_set1_epi32(2);
02130 
02131 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
02132 
02133         if (cn == 1)
02134         {
02135             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02136             {
02137                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02138                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02139 
02140                 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
02141                 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
02142                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
02143                 s0 = _mm_srli_epi32(s0, 2);
02144                 s0 = _mm_packus_epi32(s0, zero);
02145 
02146                 _mm_storel_epi64((__m128i*)D, s0);
02147             }
02148         }
02149         else if (cn == 3)
02150             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
02151             {
02152                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02153                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02154 
02155                 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
02156                 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
02157                 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
02158                 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
02159 
02160                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
02161                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
02162                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
02163                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
02164                 _mm_storel_epi64((__m128i*)D, s0);
02165             }
02166         else
02167         {
02168             CV_Assert(cn == 4);
02169             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02170             {
02171                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02172                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02173 
02174                 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
02175                 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
02176                 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
02177                 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
02178 
02179                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
02180                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
02181                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
02182                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
02183                 _mm_storel_epi64((__m128i*)D, s0);
02184             }
02185         }
02186 
02187 #undef _mm_packus_epi32
02188 
02189         return dx;
02190     }
02191 
02192 private:
02193     int cn;
02194     int step;
02195     bool use_simd;
02196 };
02197 
02198 class ResizeAreaFastVec_SIMD_16s
02199 {
02200 public:
02201     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
02202         cn(_cn), step(_step)
02203     {
02204         use_simd = checkHardwareSupport(CV_CPU_SSE2);
02205     }
02206 
02207     int operator() (const short* S, short* D, int w) const
02208     {
02209         if (!use_simd)
02210             return 0;
02211 
02212         int dx = 0;
02213         const short* S0 = (const short*)S;
02214         const short* S1 = (const short*)((const uchar*)(S) + step);
02215         __m128i masklow = _mm_set1_epi32(0x0000ffff);
02216         __m128i zero = _mm_setzero_si128();
02217         __m128i delta2 = _mm_set1_epi32(2);
02218 
02219         if (cn == 1)
02220         {
02221             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02222             {
02223                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02224                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02225 
02226                 __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
02227                     _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
02228                 __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
02229                     _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
02230                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
02231                 s0 = _mm_srai_epi32(s0, 2);
02232                 s0 = _mm_packs_epi32(s0, zero);
02233 
02234                 _mm_storel_epi64((__m128i*)D, s0);
02235             }
02236         }
02237         else if (cn == 3)
02238             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
02239             {
02240                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02241                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02242 
02243                 __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
02244                 __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
02245                 __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
02246                 __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
02247 
02248                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
02249                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
02250                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
02251                 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
02252                 _mm_storel_epi64((__m128i*)D, s0);
02253             }
02254         else
02255         {
02256             CV_Assert(cn == 4);
02257             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02258             {
02259                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02260                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02261 
02262                 __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
02263                 __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
02264                 __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
02265                 __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
02266 
02267                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
02268                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
02269                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
02270                 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
02271                 _mm_storel_epi64((__m128i*)D, s0);
02272             }
02273         }
02274 
02275         return dx;
02276     }
02277 
02278 private:
02279     int cn;
02280     int step;
02281     bool use_simd;
02282 };
02283 
02284 struct ResizeAreaFastVec_SIMD_32f
02285 {
02286     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
02287         cn(_cn), step(_step)
02288     {
02289         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
02290         fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
02291     }
02292 
02293     int operator() (const float * S, float * D, int w) const
02294     {
02295         if (!fast_mode)
02296             return 0;
02297 
02298         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
02299         int dx = 0;
02300 
02301         __m128 v_025 = _mm_set1_ps(0.25f);
02302 
02303         if (cn == 1)
02304         {
02305             const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
02306             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02307             {
02308                 __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
02309                        v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
02310 
02311                 __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
02312                                            _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
02313                 __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
02314                                            _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
02315 
02316                 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
02317             }
02318         }
02319         else if (cn == 4)
02320         {
02321             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02322             {
02323                 __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
02324                 __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
02325 
02326                 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
02327             }
02328         }
02329 
02330         return dx;
02331     }
02332 
02333 private:
02334     int cn;
02335     bool fast_mode;
02336     int step;
02337 };
02338 
02339 #else
02340 
02341 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
02342 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
02343 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
02344 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
02345 
02346 #endif
02347 
02348 template<typename T, typename SIMDVecOp>
02349 struct ResizeAreaFastVec
02350 {
02351     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
02352         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
02353     {
02354         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
02355     }
02356 
02357     int operator() (const T* S, T* D, int w) const
02358     {
02359         if (!fast_mode)
02360             return 0;
02361 
02362         const T* nextS = (const T*)((const uchar*)S + step);
02363         int dx = vecOp(S, D, w);
02364 
02365         if (cn == 1)
02366             for( ; dx < w; ++dx )
02367             {
02368                 int index = dx*2;
02369                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
02370             }
02371         else if (cn == 3)
02372             for( ; dx < w; dx += 3 )
02373             {
02374                 int index = dx*2;
02375                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
02376                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
02377                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
02378             }
02379         else
02380             {
02381                 CV_Assert(cn == 4);
02382                 for( ; dx < w; dx += 4 )
02383                 {
02384                     int index = dx*2;
02385                     D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
02386                     D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
02387                     D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
02388                     D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
02389                 }
02390             }
02391 
02392         return dx;
02393     }
02394 
02395 private:
02396     int scale_x, scale_y;
02397     int cn;
02398     bool fast_mode;
02399     int step;
02400     SIMDVecOp vecOp;
02401 };
02402 
02403 template <typename T, typename WT, typename VecOp>
02404 class resizeAreaFast_Invoker :
02405     public ParallelLoopBody
02406 {
02407 public:
02408     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
02409         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
02410         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
02411         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
02412     {
02413     }
02414 
02415     virtual void operator() (const Range& range) const
02416     {
02417         Size ssize = src.size(), dsize = dst.size();
02418         int cn = src.channels();
02419         int area = scale_x*scale_y;
02420         float scale = 1.f/(area);
02421         int dwidth1 = (ssize.width/scale_x)*cn;
02422         dsize.width *= cn;
02423         ssize.width *= cn;
02424         int dy, dx, k = 0;
02425 
02426         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
02427 
02428         for( dy = range.start; dy < range.end; dy++ )
02429         {
02430             T* D = (T*)(dst.data + dst.step*dy);
02431             int sy0 = dy*scale_y;
02432             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
02433 
02434             if( sy0 >= ssize.height )
02435             {
02436                 for( dx = 0; dx < dsize.width; dx++ )
02437                     D[dx] = 0;
02438                 continue;
02439             }
02440 
02441             dx = vop(src.template ptr<T>(sy0), D, w);
02442             for( ; dx < w; dx++ )
02443             {
02444                 const T* S = src.template ptr<T>(sy0) + xofs[dx];
02445                 WT sum = 0;
02446                 k = 0;
02447                 #if CV_ENABLE_UNROLLED
02448                 for( ; k <= area - 4; k += 4 )
02449                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
02450                 #endif
02451                 for( ; k < area; k++ )
02452                     sum += S[ofs[k]];
02453 
02454                 D[dx] = saturate_cast<T>(sum * scale);
02455             }
02456 
02457             for( ; dx < dsize.width; dx++ )
02458             {
02459                 WT sum = 0;
02460                 int count = 0, sx0 = xofs[dx];
02461                 if( sx0 >= ssize.width )
02462                 {   D[dx] = 0; continue; }   // no source pixels to average; skip to avoid 0/0 below
02463 
02464                 for( int sy = 0; sy < scale_y; sy++ )
02465                 {
02466                     if( sy0 + sy >= ssize.height )
02467                         break;
02468                     const T* S = src.template ptr<T>(sy0 + sy) + sx0;
02469                     for( int sx = 0; sx < scale_x*cn; sx += cn )
02470                     {
02471                         if( sx0 + sx >= ssize.width )
02472                             break;
02473                         sum += S[sx];
02474                         count++;
02475                     }
02476                 }
02477 
02478                 D[dx] = saturate_cast<T>((float)sum/count);
02479             }
02480         }
02481     }
02482 
02483 private:
02484     Mat src;
02485     Mat dst;
02486     int scale_x, scale_y;
02487     const int *ofs, *xofs;
02488 };
02489 
02490 template<typename T, typename WT, typename VecOp>
02491 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
02492                              int scale_x, int scale_y )
02493 {
02494     Range range(0, dst.rows);
02495     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
02496         scale_y, ofs, xofs);
02497     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
02498 }
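
// The third parallel_for_ argument above is the nstripes hint:
// dst.total()/2^16 requests roughly one stripe per 64K destination pixels,
// so small images stay single-threaded while large ones split across rows.
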
02499 
02500 struct DecimateAlpha
02501 {
02502     int si, di;
02503     float alpha;
02504 };
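
// Each DecimateAlpha entry contributes source column si to destination column
// di with weight alpha; for any fixed di the alphas sum to 1. A worked example
// (illustrative): shrinking 5 columns to 2 (scale = 2.5) yields
//   d0: {si=0, a=0.4} {si=1, a=0.4} {si=2, a=0.2}
//   d1: {si=2, a=0.2} {si=3, a=0.4} {si=4, a=0.4}
// where source column 2 straddles the cell boundary and is split between both.
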
02505 
02506 
02507 template<typename T, typename WT> class ResizeArea_Invoker :
02508     public ParallelLoopBody
02509 {
02510 public:
02511     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
02512                         const DecimateAlpha* _xtab, int _xtab_size,
02513                         const DecimateAlpha* _ytab, int _ytab_size,
02514                         const int* _tabofs )
02515     {
02516         src = &_src;
02517         dst = &_dst;
02518         xtab0 = _xtab;
02519         xtab_size0 = _xtab_size;
02520         ytab = _ytab;
02521         ytab_size = _ytab_size;
02522         tabofs = _tabofs;
02523     }
02524 
02525     virtual void operator() (const Range& range) const
02526     {
02527         Size dsize = dst->size();
02528         int cn = dst->channels();
02529         dsize.width *= cn;
02530         AutoBuffer<WT> _buffer(dsize.width*2);
02531         const DecimateAlpha* xtab = xtab0;
02532         int xtab_size = xtab_size0;
02533         WT *buf = _buffer, *sum = buf + dsize.width;
02534         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
02535 
02536         for( dx = 0; dx < dsize.width; dx++ )
02537             sum[dx] = (WT)0;
02538 
02539         for( j = j_start; j < j_end; j++ )
02540         {
02541             WT beta = ytab[j].alpha;
02542             int dy = ytab[j].di;
02543             int sy = ytab[j].si;
02544 
02545             {
02546                 const T* S = src->template ptr<T>(sy);
02547                 for( dx = 0; dx < dsize.width; dx++ )
02548                     buf[dx] = (WT)0;
02549 
02550                 if( cn == 1 )
02551                     for( k = 0; k < xtab_size; k++ )
02552                     {
02553                         int dxn = xtab[k].di;
02554                         WT alpha = xtab[k].alpha;
02555                         buf[dxn] += S[xtab[k].si]*alpha;
02556                     }
02557                 else if( cn == 2 )
02558                     for( k = 0; k < xtab_size; k++ )
02559                     {
02560                         int sxn = xtab[k].si;
02561                         int dxn = xtab[k].di;
02562                         WT alpha = xtab[k].alpha;
02563                         WT t0 = buf[dxn] + S[sxn]*alpha;
02564                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
02565                         buf[dxn] = t0; buf[dxn+1] = t1;
02566                     }
02567                 else if( cn == 3 )
02568                     for( k = 0; k < xtab_size; k++ )
02569                     {
02570                         int sxn = xtab[k].si;
02571                         int dxn = xtab[k].di;
02572                         WT alpha = xtab[k].alpha;
02573                         WT t0 = buf[dxn] + S[sxn]*alpha;
02574                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
02575                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
02576                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
02577                     }
02578                 else if( cn == 4 )
02579                 {
02580                     for( k = 0; k < xtab_size; k++ )
02581                     {
02582                         int sxn = xtab[k].si;
02583                         int dxn = xtab[k].di;
02584                         WT alpha = xtab[k].alpha;
02585                         WT t0 = buf[dxn] + S[sxn]*alpha;
02586                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
02587                         buf[dxn] = t0; buf[dxn+1] = t1;
02588                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
02589                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
02590                         buf[dxn+2] = t0; buf[dxn+3] = t1;
02591                     }
02592                 }
02593                 else
02594                 {
02595                     for( k = 0; k < xtab_size; k++ )
02596                     {
02597                         int sxn = xtab[k].si;
02598                         int dxn = xtab[k].di;
02599                         WT alpha = xtab[k].alpha;
02600                         for( int c = 0; c < cn; c++ )
02601                             buf[dxn + c] += S[sxn + c]*alpha;
02602                     }
02603                 }
02604             }
02605 
02606             if( dy != prev_dy )
02607             {
02608                 T* D = dst->template ptr<T>(prev_dy);
02609 
02610                 for( dx = 0; dx < dsize.width; dx++ )
02611                 {
02612                     D[dx] = saturate_cast<T>(sum[dx]);
02613                     sum[dx] = beta*buf[dx];
02614                 }
02615                 prev_dy = dy;
02616             }
02617             else
02618             {
02619                 for( dx = 0; dx < dsize.width; dx++ )
02620                     sum[dx] += beta*buf[dx];
02621             }
02622         }
02623 
02624         {
02625         T* D = dst->template ptr<T>(prev_dy);
02626         for( dx = 0; dx < dsize.width; dx++ )
02627             D[dx] = saturate_cast<T>(sum[dx]);
02628         }
02629     }
02630 
02631 private:
02632     const Mat* src;
02633     Mat* dst;
02634     const DecimateAlpha* xtab0;
02635     const DecimateAlpha* ytab;
02636     int xtab_size0, ytab_size;
02637     const int* tabofs;
02638 };
02639 
02640 
02641 template <typename T, typename WT>
02642 static void resizeArea_( const Mat& src, Mat& dst,
02643                          const DecimateAlpha* xtab, int xtab_size,
02644                          const DecimateAlpha* ytab, int ytab_size,
02645                          const int* tabofs )
02646 {
02647     parallel_for_(Range(0, dst.rows),
02648                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
02649                  dst.total()/((double)(1 << 16)));
02650 }
02651 
02652 
02653 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
02654                             const int* xofs, const void* alpha,
02655                             const int* yofs, const void* beta,
02656                             int xmin, int xmax, int ksize );
02657 
02658 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
02659                                     const int* ofs, const int *xofs,
02660                                     int scale_x, int scale_y );
02661 
02662 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
02663                                 const DecimateAlpha* xtab, int xtab_size,
02664                                 const DecimateAlpha* ytab, int ytab_size,
02665                                 const int* yofs);
02666 
02667 
02668 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
02669 {
02670     int k = 0;
02671     for(int dx = 0; dx < dsize; dx++ )
02672     {
02673         double fsx1 = dx * scale;
02674         double fsx2 = fsx1 + scale;
02675         double cellWidth = std::min(scale, ssize - fsx1);
02676 
02677         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
02678 
02679         sx2 = std::min(sx2, ssize - 1);
02680         sx1 = std::min(sx1, sx2);
02681 
02682         if( sx1 - fsx1 > 1e-3 )
02683         {
02684             assert( k < ssize*2 );
02685             tab[k].di = dx * cn;
02686             tab[k].si = (sx1 - 1) * cn;
02687             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
02688         }
02689 
02690         for(int sx = sx1; sx < sx2; sx++ )
02691         {
02692             assert( k < ssize*2 );
02693             tab[k].di = dx * cn;
02694             tab[k].si = sx * cn;
02695             tab[k++].alpha = float(1.0 / cellWidth);
02696         }
02697 
02698         if( fsx2 - sx2 > 1e-3 )
02699         {
02700             assert( k < ssize*2 );
02701             tab[k].di = dx * cn;
02702             tab[k].si = sx2 * cn;
02703             tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
02704         }
02705     }
02706     return k;
02707 }
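// Editor's gloss -- a worked example of the table built above (illustrative
// values, not part of the original source): ssize = 6, dsize = 4, cn = 1,
// scale = 1.5. For dx = 0: fsx1 = 0, fsx2 = 1.5, cellWidth = 1.5, sx1 = 0,
// sx2 = 1, producing
//     { di = 0, si = 0, alpha = 1/1.5   ~ 0.667 }   // all of source pixel 0
//     { di = 0, si = 1, alpha = 0.5/1.5 ~ 0.333 }   // half of source pixel 1
// Each destination pixel's alphas sum to 1, so the result is a true area
// average over the span of source pixels it covers.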
02708 
02709 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }
02710 
02711 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
02712     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
02713     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
02714     specBuf.allocate(specSize);\
02715     pSpec = (uchar*)specBuf;\
02716     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
02717 
02718 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
02719     if (mode == (int)ippCubic) { *ok = false; return; } \
02720     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
02721     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
02722     specBuf.allocate(specSize);\
02723     pSpec = (uchar*)specBuf;\
02724     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
02725     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
02726     getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
02727 
02728 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
02729     func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
02730     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
02731     specBuf.allocate(specSize);\
02732     pSpec = (uchar*)specBuf;\
02733     AutoBuffer<uchar> buf(initSize);\
02734     uchar* pInit = (uchar*)buf;\
02735     CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
02736 
02737 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
02738     if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
02739     else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
02740     else { *ok = false; return; } \
02741     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
02742     getSrcOffsetFunc =  (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
02743 
02744 #if IPP_VERSION_X100 >= 710
02745 class IPPresizeInvoker :
02746     public ParallelLoopBody
02747 {
02748 public:
02749     IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
02750         ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
02751         inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
02752         func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
02753     {
02754         *ok = true;
02755         IppiSize srcSize, dstSize;
02756         int type = src.type(), specSize = 0, initSize = 0;
02757         srcSize.width  = src.cols;
02758         srcSize.height = src.rows;
02759         dstSize.width  = dst.cols;
02760         dstSize.height = dst.rows;
02761 
02762         switch (type)
02763         {
02764 #if IPP_DISABLE_BLOCK // disabled since it breaks tests for CascadeClassifier
02765             case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
02766             case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
02767             case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
02768 #endif
02769             case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
02770             case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
02771             case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
02772             case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
02773             case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
02774             case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
02775             case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
02776             case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
02777             case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
02778             case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
02779             case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
02780             case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
02781             default: { *ok = false; return; } break;
02782         }
02783     }
02784 
02785     ~IPPresizeInvoker()
02786     {
02787     }
02788 
02789     virtual void operator() (const Range& range) const
02790     {
02791         if (*ok == false)
02792             return;
02793 
02794         int cn = src.channels();
02795         int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
02796         int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
02797         int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
02798 
02799         IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
02800         IppiSize  dstSize   = { dstwidth, dstheight - dsty };
02801         int bufsize = 0, itemSize = (int)src.elemSize1();
02802 
02803         CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
02804         CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
02805 
02806         const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
02807         Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
02808 
02809         AutoBuffer<uchar> buf(bufsize + 64);
02810         uchar* bufptr = alignPtr((uchar*)buf, 32);
02811 
02812         if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
02813             *ok = false;
02814         else
02815         {
02816             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
02817         }
02818     }
02819 private:
02820     const Mat & src;
02821     Mat & dst;
02822     double inv_scale_x;
02823     double inv_scale_y;
02824     void *pSpec;
02825     AutoBuffer<uchar> specBuf;
02826     int mode;
02827     ippiResizeFunc func;
02828     ippiResizeGetBufferSize getBufferSizeFunc;
02829     ippiResizeGetSrcOffset getSrcOffsetFunc;
02830     bool *ok;
02831     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
02832 };
02833 
02834 #endif
02835 
02836 #ifdef HAVE_OPENCL
02837 
02838 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
02839                                       float * const alpha_tab, int * const ofs_tab)
02840 {
02841     int k = 0, dx = 0;
02842     for ( ; dx < dsize; dx++)
02843     {
02844         ofs_tab[dx] = k;
02845 
02846         double fsx1 = dx * scale;
02847         double fsx2 = fsx1 + scale;
02848         double cellWidth = std::min(scale, ssize - fsx1);
02849 
02850         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
02851 
02852         sx2 = std::min(sx2, ssize - 1);
02853         sx1 = std::min(sx1, sx2);
02854 
02855         if (sx1 - fsx1 > 1e-3)
02856         {
02857             map_tab[k] = sx1 - 1;
02858             alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
02859         }
02860 
02861         for (int sx = sx1; sx < sx2; sx++)
02862         {
02863             map_tab[k] = sx;
02864             alpha_tab[k++] = float(1.0 / cellWidth);
02865         }
02866 
02867         if (fsx2 - sx2 > 1e-3)
02868         {
02869             map_tab[k] = sx2;
02870             alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
02871         }
02872     }
02873     ofs_tab[dx] = k;
02874 }
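// Editor's note: unlike computeResizeAreaTab() above, this OpenCL variant also
// fills ofs_tab: ofs_tab[dx] is the index of the first table entry belonging to
// destination index dx, and the trailing ofs_tab[dsize] = k is a sentinel, so
// the kernel can read each pixel's span as [ofs_tab[dx], ofs_tab[dx+1]).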
02875 
02876 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
02877                         double fx, double fy, int interpolation)
02878 {
02879     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
02880 
02881     double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
02882     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
02883     int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
02884     bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
02885         std::abs(inv_fy - iscale_y) < DBL_EPSILON;
02886 
02887     // when scale_x and scale_y are both equal to 2,
02888     // INTER_AREA (fast) is equivalent to INTER_LINEAR
02889     if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
02890         /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
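    // Editor's gloss on why the two coincide (a quick check, not in the
    // original source): for an exact 2x shrink, bilinear samples at
    // fx = (dx + 0.5)*2 - 0.5 = 2*dx + 0.5, i.e. exactly half-way between two
    // source columns, giving weights (0.5, 0.5) in x and likewise in y -- the
    // same four 0.25 weights that the fast AREA path averages with.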
02891 
02892     if( !(cn <= 4 &&
02893            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
02894             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
02895         return false;
02896 
02897     UMat src = _src.getUMat();
02898     _dst.create(dsize, type);
02899     UMat dst = _dst.getUMat();
02900 
02901     Size ssize = src.size();
02902     ocl::Kernel k;
02903     size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
02904 
02905     ocl::Image2D srcImage;
02906 
02907     // See if this could be done with a sampler.  We stick with integer
02908     // datatypes because the observed error is low.
02909     bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
02910                        ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
02911                        ocl::Image2D::isFormatSupported(depth, cn, true) &&
02912                        src.offset==0);
02913     if (useSampler)
02914     {
02915         int wdepth = std::max(depth, CV_32S);
02916         char buf[2][32];
02917         cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
02918                         "-D convertToDT=%s -D cn=%d",
02919                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
02920                         ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
02921                         cn);
02922         k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
02923 
02924         if (k.empty())
02925             useSampler = false;
02926         else
02927         {
02928             // Convert the input into an OpenCL image type, using normalized channel data types
02929             // and aliasing the UMat.
02930             srcImage = ocl::Image2D(src, true, true);
02931             k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
02932                    (float)inv_fx, (float)inv_fy);
02933         }
02934     }
02935 
02936     if (interpolation == INTER_LINEAR && !useSampler)
02937     {
02938         char buf[2][32];
02939 
02940         // the integer path is slower because of its CPU-side table setup, so it is disabled
02941         if (depth == CV_8U && ((void)0, 0))
02942         {
02943             AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
02944             int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
02945             short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
02946             float fxx, fyy;
02947             int sx, sy;
02948 
02949             for (int dx = 0; dx < dsize.width; dx++)
02950             {
02951                 fxx = (float)((dx+0.5)*inv_fx - 0.5);
02952                 sx = cvFloor(fxx);
02953                 fxx -= sx;
02954 
02955                 if (sx < 0)
02956                     fxx = 0, sx = 0;
02957 
02958                 if (sx >= ssize.width-1)
02959                     fxx = 0, sx = ssize.width-1;
02960 
02961                 xofs[dx] = sx;
02962                 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
02963                 ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
02964             }
02965 
02966             for (int dy = 0; dy < dsize.height; dy++)
02967             {
02968                 fyy = (float)((dy+0.5)*inv_fy - 0.5);
02969                 sy = cvFloor(fyy);
02970                 fyy -= sy;
02971 
02972                 yofs[dy] = sy;
02973                 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
02974                 ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
02975             }
02976 
02977             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
02978             UMat coeffs;
02979             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
02980 
02981             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
02982                      format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
02983                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
02984                             "-D INTER_RESIZE_COEF_BITS=%d",
02985                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
02986                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
02987                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
02988                             cn, INTER_RESIZE_COEF_BITS));
02989             if (k.empty())
02990                 return false;
02991 
02992             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
02993                    ocl::KernelArg::PtrReadOnly(coeffs));
02994         }
02995         else
02996         {
02997             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
02998             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
02999                      format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
03000                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
03001                             "-D INTER_RESIZE_COEF_BITS=%d",
03002                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
03003                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
03004                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
03005                             cn, INTER_RESIZE_COEF_BITS));
03006             if (k.empty())
03007                 return false;
03008 
03009             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
03010                    (float)inv_fx, (float)inv_fy);
03011         }
03012     }
03013     else if (interpolation == INTER_NEAREST)
03014     {
03015         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
03016                  format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
03017                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
03018         if (k.empty())
03019             return false;
03020 
03021         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
03022                (float)inv_fx, (float)inv_fy);
03023     }
03024     else if (interpolation == INTER_AREA)
03025     {
03026         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
03027         int wtype = CV_MAKE_TYPE(wdepth, cn);
03028 
03029         char cvt[2][40];
03030         String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
03031                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
03032                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
03033 
03034         UMat alphaOcl, tabofsOcl, mapOcl;
03035         UMat dmap, smap;
03036 
03037         if (is_area_fast)
03038         {
03039             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
03040             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
03041                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
03042                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
03043                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
03044                                     iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
03045 
03046             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
03047             if (k.empty())
03048                 return false;
03049         }
03050         else
03051         {
03052             buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
03053             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
03054             if (k.empty())
03055                 return false;
03056 
03057             int xytab_size = (ssize.width + ssize.height) << 1;
03058             int tabofs_size = dsize.height + dsize.width + 2;
03059 
03060             AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
03061             AutoBuffer<float> _xyalpha_tab(xytab_size);
03062             int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
03063             float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
03064             int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
03065 
03066             ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
03067             ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
03068 
03069             // upload the precomputed tables to the GPU
03070             Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
03071             Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
03072             Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
03073         }
03074 
03075         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
03076 
03077         if (is_area_fast)
03078             k.args(srcarg, dstarg);
03079         else
03080             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
03081                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
03082 
03083         return k.run(2, globalsize, NULL, false);
03084     }
03085 
03086     return k.run(2, globalsize, 0, false);
03087 }
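// Usage sketch (editor's addition, illustrative only -- not part of this file):
// ocl_resize() is never called directly; it is reached via the transparent-API
// dispatch in cv::resize() below whenever the caller works with UMat, e.g.
//
//     cv::UMat usrc = cv::imread("in.png").getUMat(cv::ACCESS_READ), udst;
//     cv::resize(usrc, udst, cv::Size(), 0.5, 0.5, cv::INTER_AREA);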
03088 
03089 #endif
03090 
03091 #if IPP_VERSION_X100 >= 710
03092 static bool ipp_resize_mt( Mat src, Mat dst,
03093                            double inv_scale_x, double inv_scale_y, int interpolation)
03094 {
03095     int mode = -1;
03096     if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
03097         mode = ippLinear;
03098     else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
03099         mode = ippCubic;
03100     else
03101         return false;
03102 
03103     bool ok = true;
03104     Range range(0, src.rows);
03105     IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
03106     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
03107     return ok;
03111 }
03112 #endif
03113 
03114 }
03115 
03116 
03117 
03118 //////////////////////////////////////////////////////////////////////////////////////////
03119 
03120 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
03121                  double inv_scale_x, double inv_scale_y, int interpolation )
03122 {
03123     static ResizeFunc linear_tab[] =
03124     {
03125         resizeGeneric_<
03126             HResizeLinear<uchar, int, short,
03127                 INTER_RESIZE_COEF_SCALE,
03128                 HResizeLinearVec_8u32s>,
03129             VResizeLinear<uchar, int, short,
03130                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
03131                 VResizeLinearVec_32s8u> >,
03132         0,
03133         resizeGeneric_<
03134             HResizeLinear<ushort, float, float, 1,
03135                 HResizeLinearVec_16u32f>,
03136             VResizeLinear<ushort, float, float, Cast<float, ushort>,
03137                 VResizeLinearVec_32f16u> >,
03138         resizeGeneric_<
03139             HResizeLinear<short, float, float, 1,
03140                 HResizeLinearVec_16s32f>,
03141             VResizeLinear<short, float, float, Cast<float, short>,
03142                 VResizeLinearVec_32f16s> >,
03143         0,
03144         resizeGeneric_<
03145             HResizeLinear<float, float, float, 1,
03146                 HResizeLinearVec_32f>,
03147             VResizeLinear<float, float, float, Cast<float, float>,
03148                 VResizeLinearVec_32f> >,
03149         resizeGeneric_<
03150             HResizeLinear<double, double, float, 1,
03151                 HResizeNoVec>,
03152             VResizeLinear<double, double, float, Cast<double, double>,
03153                 VResizeNoVec> >,
03154         0
03155     };
03156 
03157     static ResizeFunc cubic_tab[] =
03158     {
03159         resizeGeneric_<
03160             HResizeCubic<uchar, int, short>,
03161             VResizeCubic<uchar, int, short,
03162                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
03163                 VResizeCubicVec_32s8u> >,
03164         0,
03165         resizeGeneric_<
03166             HResizeCubic<ushort, float, float>,
03167             VResizeCubic<ushort, float, float, Cast<float, ushort>,
03168             VResizeCubicVec_32f16u> >,
03169         resizeGeneric_<
03170             HResizeCubic<short, float, float>,
03171             VResizeCubic<short, float, float, Cast<float, short>,
03172             VResizeCubicVec_32f16s> >,
03173         0,
03174         resizeGeneric_<
03175             HResizeCubic<float, float, float>,
03176             VResizeCubic<float, float, float, Cast<float, float>,
03177             VResizeCubicVec_32f> >,
03178         resizeGeneric_<
03179             HResizeCubic<double, double, float>,
03180             VResizeCubic<double, double, float, Cast<double, double>,
03181             VResizeNoVec> >,
03182         0
03183     };
03184 
03185     static ResizeFunc lanczos4_tab[] =
03186     {
03187         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
03188             VResizeLanczos4<uchar, int, short,
03189             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
03190             VResizeNoVec> >,
03191         0,
03192         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
03193             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
03194             VResizeLanczos4Vec_32f16u> >,
03195         resizeGeneric_<HResizeLanczos4<short, float, float>,
03196             VResizeLanczos4<short, float, float, Cast<float, short>,
03197             VResizeLanczos4Vec_32f16s> >,
03198         0,
03199         resizeGeneric_<HResizeLanczos4<float, float, float>,
03200             VResizeLanczos4<float, float, float, Cast<float, float>,
03201             VResizeLanczos4Vec_32f> >,
03202         resizeGeneric_<HResizeLanczos4<double, double, float>,
03203             VResizeLanczos4<double, double, float, Cast<double, double>,
03204             VResizeNoVec> >,
03205         0
03206     };
03207 
03208     static ResizeAreaFastFunc areafast_tab[] =
03209     {
03210         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
03211         0,
03212         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
03213         resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
03214         0,
03215         resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
03216         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
03217         0
03218     };
03219 
03220     static ResizeAreaFunc area_tab[] =
03221     {
03222         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
03223         resizeArea_<short, float>, 0, resizeArea_<float, float>,
03224         resizeArea_<double, double>, 0
03225     };
03226 
03227     Size ssize = _src.size();
03228 
03229     CV_Assert( ssize.area() > 0 );
03230     CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
03231     if( dsize.area() == 0 )
03232     {
03233         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
03234                      saturate_cast<int>(ssize.height*inv_scale_y));
03235         CV_Assert( dsize.area() > 0 );
03236     }
03237     else
03238     {
03239         inv_scale_x = (double)dsize.width/ssize.width;
03240         inv_scale_y = (double)dsize.height/ssize.height;
03241     }
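    // Editor's gloss, a quick numeric check of the branch above (illustrative
    // values): for a 640x480 source, dsize = Size(0,0) with inv_scale = 0.5
    // yields dsize = 320x240; conversely, an explicit dsize = Size(320,240)
    // makes the scales be recomputed as inv_scale_x = 320/640 = 0.5 and
    // inv_scale_y = 240/480 = 0.5.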
03242 
03243 
03244     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
03245     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
03246 
03247     int iscale_x = saturate_cast<int>(scale_x);
03248     int iscale_y = saturate_cast<int>(scale_y);
03249 
03250     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
03251             std::abs(scale_y - iscale_y) < DBL_EPSILON;
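    // Editor's note: e.g. inv_scale = 0.5 gives scale_x = 2.0 and iscale_x = 2,
    // so is_area_fast holds; inv_scale = 0.4 gives scale_x = 2.5, which is not
    // integral, and the fast integer-decimation path is skipped.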
03252 
03253 #ifdef HAVE_OPENCL
03254     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
03255                ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
03256 #endif
03257 
03258     Mat src = _src.getMat();
03259     _dst.create(dsize, src.type());
03260     Mat dst = _dst.getMat();
03261 
03262     if (dsize == ssize) {
03263       // Source and destination are the same size; use a plain copy.
03264       src.copyTo(dst);
03265       return;
03266     }
03267 
03268 #ifdef HAVE_TEGRA_OPTIMIZATION
03269     if (tegra::useTegra() && tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
03270         return;
03271 #endif
03272 
03273 #ifdef HAVE_IPP
03274     int mode = -1;
03275     if (interpolation == INTER_LINEAR && _src.rows() >= 2 && _src.cols() >= 2)
03276         mode = INTER_LINEAR;
03277     else if (interpolation == INTER_CUBIC && _src.rows() >= 4 && _src.cols() >= 4)
03278         mode = INTER_CUBIC;
03279 
03280     const double IPP_RESIZE_EPS = 1e-10;
03281     double ex = fabs((double)dsize.width / _src.cols()  - inv_scale_x) / inv_scale_x;
03282     double ey = fabs((double)dsize.height / _src.rows() - inv_scale_y) / inv_scale_y;
03283 #endif
03284     CV_IPP_RUN(IPP_VERSION_X100 >= 710 && ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
03285         (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
03286         !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U) &&
03287         mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
03288         (depth == CV_64F && mode == INTER_LINEAR)), ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation))
03289 
03290 
03291     if( interpolation == INTER_NEAREST )
03292     {
03293         resizeNN( src, dst, inv_scale_x, inv_scale_y );
03294         return;
03295     }
03296 
03297     int k, sx, sy, dx, dy;
03298 
03299 
03300     {
03301         // when scale_x and scale_y are both equal to 2,
03302         // INTER_AREA (fast) is equivalent to INTER_LINEAR
03303         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
03304             interpolation = INTER_AREA;
03305 
03306         // true "area" interpolation is only implemented when shrinking (scale_x >= 1 && scale_y >= 1),
03307         // matching the condition below; in other cases it is emulated using some variant of bilinear interpolation
03308         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
03309         {
03310             if( is_area_fast )
03311             {
03312                 int area = iscale_x*iscale_y;
03313                 size_t srcstep = src.step / src.elemSize1();
03314                 AutoBuffer<int>  _ofs(area + dsize.width*cn);
03315                 int* ofs = _ofs;
03316                 int* xofs = ofs + area;
03317                 ResizeAreaFastFunc func = areafast_tab[depth];
03318                 CV_Assert( func != 0 );
03319 
03320                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
03321                     for( sx = 0; sx < iscale_x; sx++ )
03322                         ofs[k++] = (int)(sy*srcstep + sx*cn);
03323 
03324                 for( dx = 0; dx < dsize.width; dx++ )
03325                 {
03326                     int j = dx * cn;
03327                     sx = iscale_x * j;
03328                     for( k = 0; k < cn; k++ )
03329                         xofs[j + k] = sx + k;
03330                 }
03331 
03332                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
03333                 return;
03334             }
03335 
03336             ResizeAreaFunc func = area_tab[depth];
03337             CV_Assert( func != 0 && cn <= 4 );
03338 
03339             AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
03340             DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
03341 
03342             int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
03343             int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
03344 
03345             AutoBuffer<int>  _tabofs(dsize.height + 1);
03346             int* tabofs = _tabofs;
03347             for( k = 0, dy = 0; k < ytab_size; k++ )
03348             {
03349                 if( k == 0 || ytab[k].di != ytab[k-1].di )
03350                 {
03351                     assert( ytab[k].di == dy );
03352                     tabofs[dy++] = k;
03353                 }
03354             }
03355             tabofs[dy] = ytab_size;
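            // Editor's note: tabofs[dy] now holds the index of destination row
            // dy's first entry in ytab, with tabofs[dsize.height] as a sentinel;
            // this is what lets ResizeArea_Invoker above split the work into
            // independent destination-row ranges under parallel_for_.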
03356 
03357             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
03358             return;
03359         }
03360     }
03361 
03362     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
03363     bool area_mode = interpolation == INTER_AREA;
03364     bool fixpt = depth == CV_8U;
03365     float fx, fy;
03366     ResizeFunc func=0;
03367     int ksize=0, ksize2;
03368     if( interpolation == INTER_CUBIC )
03369         ksize = 4, func = cubic_tab[depth];
03370     else if( interpolation == INTER_LANCZOS4 )
03371         ksize = 8, func = lanczos4_tab[depth];
03372     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
03373         ksize = 2, func = linear_tab[depth];
03374     else
03375         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
03376     ksize2 = ksize/2;
03377 
03378     CV_Assert( func != 0 );
03379 
03380     AutoBuffer<uchar>  _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
03381     int* xofs = (int*)(uchar*)_buffer;
03382     int* yofs = xofs + width;
03383     float* alpha = (float*)(yofs + dsize.height);
03384     short* ialpha = (short*)alpha;
03385     float* beta = alpha + width*ksize;
03386     short* ibeta = ialpha + width*ksize;
03387     float cbuf[MAX_ESIZE];
03388 
03389     for( dx = 0; dx < dsize.width; dx++ )
03390     {
03391         if( !area_mode )
03392         {
03393             fx = (float)((dx+0.5)*scale_x - 0.5);
03394             sx = cvFloor(fx);
03395             fx -= sx;
03396         }
03397         else
03398         {
03399             sx = cvFloor(dx*scale_x);
03400             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
03401             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
03402         }
03403 
03404         if( sx < ksize2-1 )
03405         {
03406             xmin = dx+1;
03407             if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
03408                 fx = 0, sx = 0;
03409         }
03410 
03411         if( sx + ksize2 >= ssize.width )
03412         {
03413             xmax = std::min( xmax, dx );
03414             if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
03415                 fx = 0, sx = ssize.width-1;
03416         }
03417 
03418         for( k = 0, sx *= cn; k < cn; k++ )
03419             xofs[dx*cn + k] = sx + k;
03420 
03421         if( interpolation == INTER_CUBIC )
03422             interpolateCubic( fx, cbuf );
03423         else if( interpolation == INTER_LANCZOS4 )
03424             interpolateLanczos4( fx, cbuf );
03425         else
03426         {
03427             cbuf[0] = 1.f - fx;
03428             cbuf[1] = fx;
03429         }
03430         if( fixpt )
03431         {
03432             for( k = 0; k < ksize; k++ )
03433                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
03434             for( ; k < cn*ksize; k++ )
03435                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
03436         }
03437         else
03438         {
03439             for( k = 0; k < ksize; k++ )
03440                 alpha[dx*cn*ksize + k] = cbuf[k];
03441             for( ; k < cn*ksize; k++ )
03442                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
03443         }
03444     }
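    // Editor's gloss, a worked fixed-point example (illustrative): with
    // INTER_RESIZE_COEF_BITS = 11, INTER_RESIZE_COEF_SCALE = 2048, so the
    // weight pair for fx = 0.25 is stored as ialpha = (0.75*2048, 0.25*2048)
    // = (1536, 512); the vertical pass later shifts the accumulated products
    // back down by 2*INTER_RESIZE_COEF_BITS (see FixedPtCast above).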
03445 
03446     for( dy = 0; dy < dsize.height; dy++ )
03447     {
03448         if( !area_mode )
03449         {
03450             fy = (float)((dy+0.5)*scale_y - 0.5);
03451             sy = cvFloor(fy);
03452             fy -= sy;
03453         }
03454         else
03455         {
03456             sy = cvFloor(dy*scale_y);
03457             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
03458             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
03459         }
03460 
03461         yofs[dy] = sy;
03462         if( interpolation == INTER_CUBIC )
03463             interpolateCubic( fy, cbuf );
03464         else if( interpolation == INTER_LANCZOS4 )
03465             interpolateLanczos4( fy, cbuf );
03466         else
03467         {
03468             cbuf[0] = 1.f - fy;
03469             cbuf[1] = fy;
03470         }
03471 
03472         if( fixpt )
03473         {
03474             for( k = 0; k < ksize; k++ )
03475                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
03476         }
03477         else
03478         {
03479             for( k = 0; k < ksize; k++ )
03480                 beta[dy*ksize + k] = cbuf[k];
03481         }
03482     }
03483 
03484     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
03485           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
03486 }
03487 
03488 
03489 /****************************************************************************************\
03490 *                       General warping (affine, perspective, remap)                     *
03491 \****************************************************************************************/
03492 
03493 namespace cv
03494 {
03495 
03496 template<typename T>
03497 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
03498                           int borderType, const Scalar& _borderValue )
03499 {
03500     Size ssize = _src.size(), dsize = _dst.size();
03501     int cn = _src.channels();
03502     const T* S0 = _src.ptr<T>();
03503     size_t sstep = _src.step/sizeof(S0[0]);
03504     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
03505         saturate_cast<T>(_borderValue[1]),
03506         saturate_cast<T>(_borderValue[2]),
03507         saturate_cast<T>(_borderValue[3]));
03508     int dx, dy;
03509 
03510     unsigned width1 = ssize.width, height1 = ssize.height;
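    // Editor's note: width1/height1 are unsigned so that a single comparison
    // checks both bounds -- a negative coordinate wraps to a huge unsigned
    // value, so e.g. sx = -1 fails (unsigned)sx < width1 exactly as
    // sx = ssize.width does.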
03511 
03512     if( _dst.isContinuous() && _xy.isContinuous() )
03513     {
03514         dsize.width *= dsize.height;
03515         dsize.height = 1;
03516     }
03517 
03518     for( dy = 0; dy < dsize.height; dy++ )
03519     {
03520         T* D = _dst.ptr<T>(dy);
03521         const short* XY = _xy.ptr<short>(dy);
03522 
03523         if( cn == 1 )
03524         {
03525             for( dx = 0; dx < dsize.width; dx++ )
03526             {
03527                 int sx = XY[dx*2], sy = XY[dx*2+1];
03528                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
03529                     D[dx] = S0[sy*sstep + sx];
03530                 else
03531                 {
03532                     if( borderType == BORDER_REPLICATE )
03533                     {
03534                         sx = clip(sx, 0, ssize.width);
03535                         sy = clip(sy, 0, ssize.height);
03536                         D[dx] = S0[sy*sstep + sx];
03537                     }
03538                     else if( borderType == BORDER_CONSTANT )
03539                         D[dx] = cval[0];
03540                     else if( borderType != BORDER_TRANSPARENT )
03541                     {
03542                         sx = borderInterpolate(sx, ssize.width, borderType);
03543                         sy = borderInterpolate(sy, ssize.height, borderType);
03544                         D[dx] = S0[sy*sstep + sx];
03545                     }
03546                 }
03547             }
03548         }
03549         else
03550         {
03551             for( dx = 0; dx < dsize.width; dx++, D += cn )
03552             {
03553                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
03554                 const T *S;
03555                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
03556                 {
03557                     if( cn == 3 )
03558                     {
03559                         S = S0 + sy*sstep + sx*3;
03560                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
03561                     }
03562                     else if( cn == 4 )
03563                     {
03564                         S = S0 + sy*sstep + sx*4;
03565                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
03566                     }
03567                     else
03568                     {
03569                         S = S0 + sy*sstep + sx*cn;
03570                         for( k = 0; k < cn; k++ )
03571                             D[k] = S[k];
03572                     }
03573                 }
03574                 else if( borderType != BORDER_TRANSPARENT )
03575                 {
03576                     if( borderType == BORDER_REPLICATE )
03577                     {
03578                         sx = clip(sx, 0, ssize.width);
03579                         sy = clip(sy, 0, ssize.height);
03580                         S = S0 + sy*sstep + sx*cn;
03581                     }
03582                     else if( borderType == BORDER_CONSTANT )
03583                         S = &cval[0];
03584                     else
03585                     {
03586                         sx = borderInterpolate(sx, ssize.width, borderType);
03587                         sy = borderInterpolate(sy, ssize.height, borderType);
03588                         S = S0 + sy*sstep + sx*cn;
03589                     }
03590                     for( k = 0; k < cn; k++ )
03591                         D[k] = S[k];
03592                 }
03593             }
03594         }
03595     }
03596 }
03597 
03598 
03599 struct RemapNoVec
03600 {
03601     int operator()( const Mat&, void*, const short*, const ushort*,
03602                     const void*, int ) const { return 0; }
03603 };
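// Editor's note: the vectorized remap operators below follow the same protocol
// as this no-op fallback -- they return how many destination pixels they
// handled, and remapBilinear() finishes the remaining tail of each span with
// scalar code.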
03604 
03605 #if CV_SSE2
03606 
03607 struct RemapVec_8u
03608 {
03609     int operator()( const Mat& _src, void* _dst, const short* XY,
03610                     const ushort* FXY, const void* _wtab, int width ) const
03611     {
03612         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
03613 
03614         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
03615             sstep > 0x8000 )
03616             return 0;
03617 
03618         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
03619         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
03620         uchar* D = (uchar*)_dst;
03621         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
03622         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
03623         __m128i z = _mm_setzero_si128();
03624         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
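        // Editor's gloss (illustrative): xy2ofs packs (cn, sstep) into each
        // 32-bit lane so _mm_madd_epi16 turns an interleaved (sx, sy) pair
        // directly into the byte offset sx*cn + sy*sstep; e.g. with cn = 1,
        // sstep = 640, (sx, sy) = (3, 2) gives 3*1 + 2*640 = 1283, stored in
        // iofs0/iofs1 for the scalar gathers below.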
03625 
03626         if( cn == 1 )
03627         {
03628             for( ; x <= width - 8; x += 8 )
03629             {
03630                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
03631                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
03632                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
03633                 unsigned i0, i1;
03634 
03635                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
03636                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
03637                 _mm_store_si128( (__m128i*)iofs0, xy0 );
03638                 _mm_store_si128( (__m128i*)iofs1, xy1 );
03639 
03640                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
03641                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
03642                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03643                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
03644                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
03645                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03646                 v0 = _mm_unpacklo_epi8(v0, z);
03647                 v1 = _mm_unpacklo_epi8(v1, z);
03648 
03649                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
03650                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
03651                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
03652                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
03653                 b0 = _mm_unpacklo_epi64(a0, a1);
03654                 b1 = _mm_unpackhi_epi64(a0, a1);
03655                 v0 = _mm_madd_epi16(v0, b0);
03656                 v1 = _mm_madd_epi16(v1, b1);
03657                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
03658 
03659                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
03660                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
03661                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03662                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
03663                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
03664                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03665                 v2 = _mm_unpacklo_epi8(v2, z);
03666                 v3 = _mm_unpacklo_epi8(v3, z);
03667 
03668                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
03669                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
03670                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
03671                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
03672                 b0 = _mm_unpacklo_epi64(a0, a1);
03673                 b1 = _mm_unpackhi_epi64(a0, a1);
03674                 v2 = _mm_madd_epi16(v2, b0);
03675                 v3 = _mm_madd_epi16(v3, b1);
03676                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
03677 
03678                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
03679                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
03680                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
03681                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
03682             }
03683         }
03684         else if( cn == 3 )
03685         {
03686             for( ; x <= width - 5; x += 4, D += 12 )
03687             {
03688                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
03689                 __m128i u0, v0, u1, v1;
03690 
03691                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
03692                 _mm_store_si128( (__m128i*)iofs0, xy0 );
03693                 const __m128i *w0, *w1;
03694                 w0 = (const __m128i*)(wtab + FXY[x]*16);
03695                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
03696 
03697                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
03698                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
03699                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
03700                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
03701                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
03702                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
03703                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
03704                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
03705                 u0 = _mm_unpacklo_epi8(u0, z);
03706                 v0 = _mm_unpacklo_epi8(v0, z);
03707                 u1 = _mm_unpacklo_epi8(u1, z);
03708                 v1 = _mm_unpacklo_epi8(v1, z);
03709                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03710                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03711                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03712                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03713                 u0 = _mm_slli_si128(u0, 4);
03714                 u0 = _mm_packs_epi32(u0, u1);
03715                 u0 = _mm_packus_epi16(u0, u0);
03716                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
03717 
03718                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
03719                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
03720 
03721                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
03722                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
03723                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
03724                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
03725                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
03726                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
03727                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
03728                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
03729                 u0 = _mm_unpacklo_epi8(u0, z);
03730                 v0 = _mm_unpacklo_epi8(v0, z);
03731                 u1 = _mm_unpacklo_epi8(u1, z);
03732                 v1 = _mm_unpacklo_epi8(v1, z);
03733                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03734                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03735                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03736                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03737                 u0 = _mm_slli_si128(u0, 4);
03738                 u0 = _mm_packs_epi32(u0, u1);
03739                 u0 = _mm_packus_epi16(u0, u0);
03740                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
03741             }
03742         }
03743         else if( cn == 4 )
03744         {
03745             for( ; x <= width - 4; x += 4, D += 16 )
03746             {
03747                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
03748                 __m128i u0, v0, u1, v1;
03749 
03750                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
03751                 _mm_store_si128( (__m128i*)iofs0, xy0 );
03752                 const __m128i *w0, *w1;
03753                 w0 = (const __m128i*)(wtab + FXY[x]*16);
03754                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
03755 
03756                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
03757                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
03758                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
03759                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
03760                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
03761                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
03762                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
03763                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
03764                 u0 = _mm_unpacklo_epi8(u0, z);
03765                 v0 = _mm_unpacklo_epi8(v0, z);
03766                 u1 = _mm_unpacklo_epi8(u1, z);
03767                 v1 = _mm_unpacklo_epi8(v1, z);
03768                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03769                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03770                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03771                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03772                 u0 = _mm_packs_epi32(u0, u1);
03773                 u0 = _mm_packus_epi16(u0, u0);
03774                 _mm_storel_epi64((__m128i*)D, u0);
03775 
03776                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
03777                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
03778 
03779                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
03780                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
03781                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
03782                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
03783                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
03784                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
03785                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
03786                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
03787                 u0 = _mm_unpacklo_epi8(u0, z);
03788                 v0 = _mm_unpacklo_epi8(v0, z);
03789                 u1 = _mm_unpacklo_epi8(u1, z);
03790                 v1 = _mm_unpacklo_epi8(v1, z);
03791                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03792                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03793                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03794                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03795                 u0 = _mm_packs_epi32(u0, u1);
03796                 u0 = _mm_packus_epi16(u0, u0);
03797                 _mm_storel_epi64((__m128i*)(D + 8), u0);
03798             }
03799         }
03800 
03801         return x;
03802     }
03803 };
03804 
03805 #else
03806 
03807 typedef RemapNoVec RemapVec_8u;
03808 
03809 #endif
03810 
03811 
03812 template<class CastOp, class VecOp, typename AT>
03813 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
03814                            const Mat& _fxy, const void* _wtab,
03815                            int borderType, const Scalar& _borderValue )
03816 {
03817     typedef typename CastOp::rtype T;
03818     typedef typename CastOp::type1 WT;
03819     Size ssize = _src.size(), dsize = _dst.size();
03820     int k, cn = _src.channels();
03821     const AT* wtab = (const AT*)_wtab;
03822     const T* S0 = _src.ptr<T>();
03823     size_t sstep = _src.step/sizeof(S0[0]);
03824     T cval[CV_CN_MAX];
03825     int dx, dy;
03826     CastOp castOp;
03827     VecOp vecOp;
03828 
03829     for( k = 0; k < cn; k++ )
03830         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
03831 
03832     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
03833     CV_Assert( ssize.area() > 0 );
03834 #if CV_SSE2
03835     if( _src.type() == CV_8UC3 )
03836         width1 = std::max(ssize.width-2, 0);
03837 #endif
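    // Editor's note (an inference, not stated in the source): CV_8UC3 trims
    // width1 by one extra pixel because the SSE2 RemapVec_8u above fetches
    // 4 bytes at a time from 3-byte pixels, so the rightmost "inlier" pixel
    // must leave one spare byte at the end of the row.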
03838 
03839     for( dy = 0; dy < dsize.height; dy++ )
03840     {
03841         T* D = _dst.ptr<T>(dy);
03842         const short* XY = _xy.ptr<short>(dy);
03843         const ushort* FXY = _fxy.ptr<ushort>(dy);
03844         int X0 = 0;
03845         bool prevInlier = false;
03846 
03847         for( dx = 0; dx <= dsize.width; dx++ )
03848         {
03849             bool curInlier = dx < dsize.width ?
03850                 (unsigned)XY[dx*2] < width1 &&
03851                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
03852             if( curInlier == prevInlier )
03853                 continue;
03854 
03855             int X1 = dx;
03856             dx = X0;
03857             X0 = X1;
03858             prevInlier = curInlier;
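            // [Editor's note] This scan splits the row into maximal runs of
            // in-bounds vs. out-of-bounds pixels: after the swap, [dx, X1) is
            // the run that just ended; !curInlier means it was an interior run
            // (fast path below), otherwise it needs per-pixel border handling.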
03859 
03860             if( !curInlier )
03861             {
03862                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
03863                 D += len*cn;
03864                 dx += len;
03865 
03866                 if( cn == 1 )
03867                 {
03868                     for( ; dx < X1; dx++, D++ )
03869                     {
03870                         int sx = XY[dx*2], sy = XY[dx*2+1];
03871                         const AT* w = wtab + FXY[dx]*4;
03872                         const T* S = S0 + sy*sstep + sx;
03873                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
03874                     }
03875                 }
03876                 else if( cn == 2 )
03877                     for( ; dx < X1; dx++, D += 2 )
03878                     {
03879                         int sx = XY[dx*2], sy = XY[dx*2+1];
03880                         const AT* w = wtab + FXY[dx]*4;
03881                         const T* S = S0 + sy*sstep + sx*2;
03882                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
03883                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
03884                         D[0] = castOp(t0); D[1] = castOp(t1);
03885                     }
03886                 else if( cn == 3 )
03887                     for( ; dx < X1; dx++, D += 3 )
03888                     {
03889                         int sx = XY[dx*2], sy = XY[dx*2+1];
03890                         const AT* w = wtab + FXY[dx]*4;
03891                         const T* S = S0 + sy*sstep + sx*3;
03892                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
03893                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
03894                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
03895                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
03896                     }
03897                 else if( cn == 4 )
03898                     for( ; dx < X1; dx++, D += 4 )
03899                     {
03900                         int sx = XY[dx*2], sy = XY[dx*2+1];
03901                         const AT* w = wtab + FXY[dx]*4;
03902                         const T* S = S0 + sy*sstep + sx*4;
03903                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
03904                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
03905                         D[0] = castOp(t0); D[1] = castOp(t1);
03906                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
03907                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
03908                         D[2] = castOp(t0); D[3] = castOp(t1);
03909                     }
03910                 else
03911                     for( ; dx < X1; dx++, D += cn )
03912                     {
03913                         int sx = XY[dx*2], sy = XY[dx*2+1];
03914                         const AT* w = wtab + FXY[dx]*4;
03915                         const T* S = S0 + sy*sstep + sx*cn;
03916                         for( k = 0; k < cn; k++ )
03917                         {
03918                             WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
03919                             D[k] = castOp(t0);
03920                         }
03921                     }
03922             }
03923             else
03924             {
03925                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
03926                 {
03927                     D += (X1 - dx)*cn;
03928                     dx = X1;
03929                     continue;
03930                 }
03931 
03932                 if( cn == 1 )
03933                     for( ; dx < X1; dx++, D++ )
03934                     {
03935                         int sx = XY[dx*2], sy = XY[dx*2+1];
03936                         if( borderType == BORDER_CONSTANT &&
03937                             (sx >= ssize.width || sx+1 < 0 ||
03938                              sy >= ssize.height || sy+1 < 0) )
03939                         {
03940                             D[0] = cval[0];
03941                         }
03942                         else
03943                         {
03944                             int sx0, sx1, sy0, sy1;
03945                             T v0, v1, v2, v3;
03946                             const AT* w = wtab + FXY[dx]*4;
03947                             if( borderType == BORDER_REPLICATE )
03948                             {
03949                                 sx0 = clip(sx, 0, ssize.width);
03950                                 sx1 = clip(sx+1, 0, ssize.width);
03951                                 sy0 = clip(sy, 0, ssize.height);
03952                                 sy1 = clip(sy+1, 0, ssize.height);
03953                                 v0 = S0[sy0*sstep + sx0];
03954                                 v1 = S0[sy0*sstep + sx1];
03955                                 v2 = S0[sy1*sstep + sx0];
03956                                 v3 = S0[sy1*sstep + sx1];
03957                             }
03958                             else
03959                             {
03960                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
03961                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
03962                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
03963                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
03964                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
03965                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
03966                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
03967                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
03968                             }
03969                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
03970                         }
03971                     }
03972                 else
03973                     for( ; dx < X1; dx++, D += cn )
03974                     {
03975                         int sx = XY[dx*2], sy = XY[dx*2+1];
03976                         if( borderType == BORDER_CONSTANT &&
03977                             (sx >= ssize.width || sx+1 < 0 ||
03978                              sy >= ssize.height || sy+1 < 0) )
03979                         {
03980                             for( k = 0; k < cn; k++ )
03981                                 D[k] = cval[k];
03982                         }
03983                         else
03984                         {
03985                             int sx0, sx1, sy0, sy1;
03986                             const T *v0, *v1, *v2, *v3;
03987                             const AT* w = wtab + FXY[dx]*4;
03988                             if( borderType == BORDER_REPLICATE )
03989                             {
03990                                 sx0 = clip(sx, 0, ssize.width);
03991                                 sx1 = clip(sx+1, 0, ssize.width);
03992                                 sy0 = clip(sy, 0, ssize.height);
03993                                 sy1 = clip(sy+1, 0, ssize.height);
03994                                 v0 = S0 + sy0*sstep + sx0*cn;
03995                                 v1 = S0 + sy0*sstep + sx1*cn;
03996                                 v2 = S0 + sy1*sstep + sx0*cn;
03997                                 v3 = S0 + sy1*sstep + sx1*cn;
03998                             }
03999                             else if( borderType == BORDER_TRANSPARENT &&
04000                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
04001                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
04002                                 continue;
04003                             else
04004                             {
04005                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
04006                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
04007                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
04008                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
04009                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
04010                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
04011                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
04012                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
04013                             }
04014                             for( k = 0; k < cn; k++ )
04015                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
04016                         }
04017                     }
04018             }
04019         }
04020     }
04021 }
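// [Editor's note] A minimal scalar sketch of the per-pixel arithmetic used by
// remapBilinear() above, assuming a single-channel float image; the four
// weights come from the table entry indexed by the fractional code FXY[dx]*4
// and sum to 1 (or to INTER_REMAP_COEF_SCALE in the fixed-point 8-bit path).
#if 0   // illustrative only, not part of the library
static float bilinearAt( const float* S0, size_t sstep, int sx, int sy,
                         const float w[4] )
{
    const float* S = S0 + sy*sstep + sx;        // top-left of the 2x2 block
    return S[0]*w[0] + S[1]*w[1]                // top row
         + S[sstep]*w[2] + S[sstep+1]*w[3];     // bottom row
}
#endif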
04022 
04023 
04024 template<class CastOp, typename AT, int ONE>
04025 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
04026                           const Mat& _fxy, const void* _wtab,
04027                           int borderType, const Scalar& _borderValue )
04028 {
04029     typedef typename CastOp::rtype T;
04030     typedef typename CastOp::type1 WT;
04031     Size ssize = _src.size(), dsize = _dst.size();
04032     int cn = _src.channels();
04033     const AT* wtab = (const AT*)_wtab;
04034     const T* S0 = _src.ptr<T>();
04035     size_t sstep = _src.step/sizeof(S0[0]);
04036     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
04037         saturate_cast<T>(_borderValue[1]),
04038         saturate_cast<T>(_borderValue[2]),
04039         saturate_cast<T>(_borderValue[3]));
04040     int dx, dy;
04041     CastOp castOp;
04042     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
04043 
04044     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
04045 
04046     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
04047     {
04048         dsize.width *= dsize.height;
04049         dsize.height = 1;
04050     }
04051 
04052     for( dy = 0; dy < dsize.height; dy++ )
04053     {
04054         T* D = _dst.ptr<T>(dy);
04055         const short* XY = _xy.ptr<short>(dy);
04056         const ushort* FXY = _fxy.ptr<ushort>(dy);
04057 
04058         for( dx = 0; dx < dsize.width; dx++, D += cn )
04059         {
04060             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
04061             const AT* w = wtab + FXY[dx]*16;
04062             int i, k;
04063             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
04064             {
04065                 const T* S = S0 + sy*sstep + sx*cn;
04066                 for( k = 0; k < cn; k++ )
04067                 {
04068                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
04069                     S += sstep;
04070                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
04071                     S += sstep;
04072                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
04073                     S += sstep;
04074                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
04075                     S += 1 - sstep*3;
04076                     D[k] = castOp(sum);
04077                 }
04078             }
04079             else
04080             {
04081                 int x[4], y[4];
04082                 if( borderType == BORDER_TRANSPARENT &&
04083                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
04084                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
04085                     continue;
04086 
04087                 if( borderType1 == BORDER_CONSTANT &&
04088                     (sx >= ssize.width || sx+4 <= 0 ||
04089                     sy >= ssize.height || sy+4 <= 0))
04090                 {
04091                     for( k = 0; k < cn; k++ )
04092                         D[k] = cval[k];
04093                     continue;
04094                 }
04095 
04096                 for( i = 0; i < 4; i++ )
04097                 {
04098                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
04099                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
04100                 }
04101 
04102                 for( k = 0; k < cn; k++, S0++, w -= 16 )
04103                 {
04104                     WT cv = cval[k], sum = cv*ONE;
04105                     for( i = 0; i < 4; i++, w += 4 )
04106                     {
04107                         int yi = y[i];
04108                         const T* S = S0 + yi*sstep;
04109                         if( yi < 0 )
04110                             continue;
04111                         if( x[0] >= 0 )
04112                             sum += (S[x[0]] - cv)*w[0];
04113                         if( x[1] >= 0 )
04114                             sum += (S[x[1]] - cv)*w[1];
04115                         if( x[2] >= 0 )
04116                             sum += (S[x[2]] - cv)*w[2];
04117                         if( x[3] >= 0 )
04118                             sum += (S[x[3]] - cv)*w[3];
04119                     }
04120                     D[k] = castOp(sum);
04121                 }
04122                 S0 -= cn;
04123             }
04124         }
04125     }
04126 }
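// [Editor's note] In the border branch of remapBicubic() above, the sum is
// seeded with cv*ONE and each available tap adds (S[x]-cv)*w. Because the 16
// weights sum to ONE, this equals sum(w_i*S_i) when every tap exists, while
// taps skipped at the border implicitly contribute the constant value cv --
// no per-tap bookkeeping of the accumulated weight is needed.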
04127 
04128 
04129 template<class CastOp, typename AT, int ONE>
04130 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
04131                            const Mat& _fxy, const void* _wtab,
04132                            int borderType, const Scalar& _borderValue )
04133 {
04134     typedef typename CastOp::rtype T;
04135     typedef typename CastOp::type1 WT;
04136     Size ssize = _src.size(), dsize = _dst.size();
04137     int cn = _src.channels();
04138     const AT* wtab = (const AT*)_wtab;
04139     const T* S0 = _src.ptr<T>();
04140     size_t sstep = _src.step/sizeof(S0[0]);
04141     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
04142         saturate_cast<T>(_borderValue[1]),
04143         saturate_cast<T>(_borderValue[2]),
04144         saturate_cast<T>(_borderValue[3]));
04145     int dx, dy;
04146     CastOp castOp;
04147     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
04148 
04149     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
04150 
04151     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
04152     {
04153         dsize.width *= dsize.height;
04154         dsize.height = 1;
04155     }
04156 
04157     for( dy = 0; dy < dsize.height; dy++ )
04158     {
04159         T* D = _dst.ptr<T>(dy);
04160         const short* XY = _xy.ptr<short>(dy);
04161         const ushort* FXY = _fxy.ptr<ushort>(dy);
04162 
04163         for( dx = 0; dx < dsize.width; dx++, D += cn )
04164         {
04165             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
04166             const AT* w = wtab + FXY[dx]*64;
04167             const T* S = S0 + sy*sstep + sx*cn;
04168             int i, k;
04169             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
04170             {
04171                 for( k = 0; k < cn; k++ )
04172                 {
04173                     WT sum = 0;
04174                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
04175                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
04176                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
04177                     w -= 64;
04178                     S -= sstep*8 - 1;
04179                     D[k] = castOp(sum);
04180                 }
04181             }
04182             else
04183             {
04184                 int x[8], y[8];
04185                 if( borderType == BORDER_TRANSPARENT &&
04186                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
04187                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
04188                     continue;
04189 
04190                 if( borderType1 == BORDER_CONSTANT &&
04191                     (sx >= ssize.width || sx+8 <= 0 ||
04192                     sy >= ssize.height || sy+8 <= 0))
04193                 {
04194                     for( k = 0; k < cn; k++ )
04195                         D[k] = cval[k];
04196                     continue;
04197                 }
04198 
04199                 for( i = 0; i < 8; i++ )
04200                 {
04201                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
04202                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
04203                 }
04204 
04205                 for( k = 0; k < cn; k++, S0++, w -= 64 )
04206                 {
04207                     WT cv = cval[k], sum = cv*ONE;
04208                     for( i = 0; i < 8; i++, w += 8 )
04209                     {
04210                         int yi = y[i];
04211                         const T* S1 = S0 + yi*sstep;
04212                         if( yi < 0 )
04213                             continue;
04214                         if( x[0] >= 0 )
04215                             sum += (S1[x[0]] - cv)*w[0];
04216                         if( x[1] >= 0 )
04217                             sum += (S1[x[1]] - cv)*w[1];
04218                         if( x[2] >= 0 )
04219                             sum += (S1[x[2]] - cv)*w[2];
04220                         if( x[3] >= 0 )
04221                             sum += (S1[x[3]] - cv)*w[3];
04222                         if( x[4] >= 0 )
04223                             sum += (S1[x[4]] - cv)*w[4];
04224                         if( x[5] >= 0 )
04225                             sum += (S1[x[5]] - cv)*w[5];
04226                         if( x[6] >= 0 )
04227                             sum += (S1[x[6]] - cv)*w[6];
04228                         if( x[7] >= 0 )
04229                             sum += (S1[x[7]] - cv)*w[7];
04230                     }
04231                     D[k] = castOp(sum);
04232                 }
04233                 S0 -= cn;
04234             }
04235         }
04236     }
04237 }
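// [Editor's note] remapLanczos4() is the 8x8 analogue of remapBicubic(): the
// window origin is (XY[0]-3, XY[1]-3), each fractional position owns 64
// weights (FXY[dx]*64), and the same (S-cv)*w seeding trick handles borders.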
04238 
04239 
04240 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
04241                             int borderType, const Scalar& _borderValue );
04242 
04243 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
04244                           const Mat& _fxy, const void* _wtab,
04245                           int borderType, const Scalar& _borderValue);
04246 
04247 class RemapInvoker :
04248     public ParallelLoopBody
04249 {
04250 public:
04251     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
04252                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
04253                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
04254         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
04255         borderType(_borderType), borderValue(_borderValue),
04256         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
04257     {
04258     }
04259 
04260     virtual void operator() (const Range& range) const
04261     {
04262         int x, y, x1, y1;
04263         const int buf_size = 1 << 14;
04264         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
04265         int bcols0 = std::min(buf_size/brows0, dst->cols);
04266         brows0 = std::min(buf_size/bcols0, dst->rows);
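        // [Editor's note] The tile is sized so that brows0*bcols0 stays near
        // buf_size (1 << 14) map entries, bounding the per-invocation scratch
        // buffers _bufxy/_bufa independently of the destination size.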
04267     #if CV_SSE2
04268         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
04269     #endif
04270 
04271         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
04272         if( !nnfunc )
04273             _bufa.create(brows0, bcols0, CV_16UC1);
04274 
04275         for( y = range.start; y < range.end; y += brows0 )
04276         {
04277             for( x = 0; x < dst->cols; x += bcols0 )
04278             {
04279                 int brows = std::min(brows0, range.end - y);
04280                 int bcols = std::min(bcols0, dst->cols - x);
04281                 Mat dpart(*dst, Rect(x, y, bcols, brows));
04282                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
04283 
04284                 if( nnfunc )
04285                 {
04286                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
04287                         bufxy = (*m1)(Rect(x, y, bcols, brows));
04288                     else if( map_depth != CV_32F )
04289                     {
04290                         for( y1 = 0; y1 < brows; y1++ )
04291                         {
04292                             short* XY = bufxy.ptr<short>(y1);
04293                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
04294                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
04295 
04296                             for( x1 = 0; x1 < bcols; x1++ )
04297                             {
04298                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
04299                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
04300                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
04301                             }
04302                         }
04303                     }
04304                     else if( !planar_input )
04305                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
04306                     else
04307                     {
04308                         for( y1 = 0; y1 < brows; y1++ )
04309                         {
04310                             short* XY = bufxy.ptr<short>(y1);
04311                             const float* sX = m1->ptr<float>(y+y1) + x;
04312                             const float* sY = m2->ptr<float>(y+y1) + x;
04313                             x1 = 0;
04314 
04315                         #if CV_SSE2
04316                             if( useSIMD )
04317                             {
04318                                 for( ; x1 <= bcols - 8; x1 += 8 )
04319                                 {
04320                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
04321                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
04322                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
04323                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
04324                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
04325                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
04326                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
04327                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
04328                                     ix0 = _mm_packs_epi32(ix0, ix1);
04329                                     iy0 = _mm_packs_epi32(iy0, iy1);
04330                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
04331                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
04332                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
04333                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
04334                                 }
04335                             }
04336                         #endif
04337 
04338                             for( ; x1 < bcols; x1++ )
04339                             {
04340                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
04341                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
04342                             }
04343                         }
04344                     }
04345                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
04346                     continue;
04347                 }
04348 
04349                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
04350                 for( y1 = 0; y1 < brows; y1++ )
04351                 {
04352                     short* XY = bufxy.ptr<short>(y1);
04353                     ushort* A = bufa.ptr<ushort>(y1);
04354 
04355                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
04356                     {
04357                         bufxy = (*m1)(Rect(x, y, bcols, brows));
04358 
04359                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
04360                         x1 = 0;
04361 
04362                     #if CV_NEON
04363                         uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
04364                         for ( ; x1 <= bcols - 8; x1 += 8)
04365                             vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
04366                     #elif CV_SSE2
04367                         __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
04368                         for ( ; x1 <= bcols - 8; x1 += 8)
04369                             _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
04370                     #endif
04371 
04372                         for( ; x1 < bcols; x1++ )
04373                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
04374                     }
04375                     else if( planar_input )
04376                     {
04377                         const float* sX = m1->ptr<float>(y+y1) + x;
04378                         const float* sY = m2->ptr<float>(y+y1) + x;
04379 
04380                         x1 = 0;
04381                     #if CV_SSE2
04382                         if( useSIMD )
04383                         {
04384                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
04385                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
04386                             for( ; x1 <= bcols - 8; x1 += 8 )
04387                             {
04388                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
04389                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
04390                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
04391                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
04392                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
04393                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
04394                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
04395                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
04396                                 __m128i mx0 = _mm_and_si128(ix0, mask);
04397                                 __m128i mx1 = _mm_and_si128(ix1, mask);
04398                                 __m128i my0 = _mm_and_si128(iy0, mask);
04399                                 __m128i my1 = _mm_and_si128(iy1, mask);
04400                                 mx0 = _mm_packs_epi32(mx0, mx1);
04401                                 my0 = _mm_packs_epi32(my0, my1);
04402                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
04403                                 mx0 = _mm_or_si128(mx0, my0);
04404                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
04405                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
04406                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
04407                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
04408                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
04409                                 ix0 = _mm_packs_epi32(ix0, ix1);
04410                                 iy0 = _mm_packs_epi32(iy0, iy1);
04411                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
04412                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
04413                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
04414                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
04415                             }
04416                         }
04417                     #elif CV_NEON
04418                         float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
04419                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
04420 
04421                         for( ; x1 <= bcols - 4; x1 += 4 )
04422                         {
04423                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
04424                                       v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
04425                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
04426                                                       vandq_s32(v_sy, v_scale2));
04427                             vst1_u16(A + x1, vqmovun_s32(v_v));
04428 
04429                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
04430                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
04431                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
04432                         }
04433                     #endif
04434 
04435                         for( ; x1 < bcols; x1++ )
04436                         {
04437                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
04438                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
04439                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
04440                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
04441                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
04442                             A[x1] = (ushort)v;
04443                         }
04444                     }
04445                     else
04446                     {
04447                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
04448                         x1 = 0;
04449 
04450                     #if CV_NEON
04451                         float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
04452                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
04453 
04454                         for( ; x1 <= bcols - 4; x1 += 4 )
04455                         {
04456                             float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
04457                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
04458                             int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
04459                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
04460                                                       vandq_s32(v_sy, v_scale2));
04461                             vst1_u16(A + x1, vqmovun_s32(v_v));
04462 
04463                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
04464                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
04465                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
04466                         }
04467                     #endif
04468 
04469                         for( ; x1 < bcols; x1++ )
04470                         {
04471                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
04472                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
04473                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
04474                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
04475                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
04476                             A[x1] = (ushort)v;
04477                         }
04478                     }
04479                 }
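                // [Editor's note] At this point bufxy holds the integer
                // top-left source coordinates (CV_16SC2) and bufa the
                // per-pixel weight-table index (fy*INTER_TAB_SIZE + fx)
                // consumed by the interpolation function below.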
04480                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
04481             }
04482         }
04483     }
04484 
04485 private:
04486     const Mat* src;
04487     Mat* dst;
04488     const Mat *m1, *m2;
04489     int borderType;
04490     Scalar borderValue;
04491     int planar_input;
04492     RemapNNFunc nnfunc;
04493     RemapFunc ifunc;
04494     const void *ctab;
04495 };
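// [Editor's note] A sketch of how RemapInvoker is driven, mirroring the call
// in cv::remap() further down: destination rows are split across threads and
// each stripe is remapped independently.
#if 0   // illustrative only
RemapInvoker invoker(src, dst, m1, m2, borderType, borderValue,
                     planar_input, nnfunc, ifunc, ctab);
parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1 << 16));
#endif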
04496 
04497 #ifdef HAVE_OPENCL
04498 
04499 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
04500                       int interpolation, int borderType, const Scalar& borderValue)
04501 {
04502     const ocl::Device & dev = ocl::Device::getDefault();
04503     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
04504             rowsPerWI = dev.isIntel() ? 4 : 1;
04505 
04506     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
04507             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
04508         return false;
04509 
04510     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
04511 
04512     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
04513         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
04514     {
04515         if (map1.type() != CV_16SC2)
04516             std::swap(map1, map2);
04517     }
04518     else
04519         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
04520 
04521     _dst.create(map1.size(), type);
04522     UMat dst = _dst.getUMat();
04523 
04524     String kernelName = "remap";
04525     if (map1.type() == CV_32FC2 && map2.empty())
04526         kernelName += "_32FC2";
04527     else if (map1.type() == CV_16SC2)
04528     {
04529         kernelName += "_16SC2";
04530         if (!map2.empty())
04531             kernelName += "_16UC1";
04532     }
04533     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
04534         kernelName += "_2_32FC1";
04535     else
04536         CV_Error(Error::StsBadArg, "Unsupported map types");
04537 
04538     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
04539     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
04540                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
04541     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
04542                                  interMap[interpolation], borderMap[borderType],
04543                                  ocl::typeToStr(type), rowsPerWI);
04544 
04545     if (interpolation != INTER_NEAREST)
04546     {
04547         char cvt[3][40];
04548         int wdepth = std::max(CV_32F, depth);
04549         buildOptions = buildOptions
04550                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
04551                                " -D convertToWT2=%s -D WT2=%s",
04552                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
04553                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
04554                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
04555                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
04556                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
04557     }
04558     int scalarcn = cn == 3 ? 4 : cn;
04559     int sctype = CV_MAKETYPE(depth, scalarcn);
04560     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
04561                            ocl::typeToStr(type), ocl::typeToStr(depth),
04562                            cn, ocl::typeToStr(sctype), depth);
04563 
04564     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
04565 
04566     Mat scalar(1, 1, sctype, borderValue);
04567     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
04568             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
04569             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
04570 
04571     if (map2.empty())
04572         k.args(srcarg, dstarg, map1arg, scalararg);
04573     else
04574         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
04575 
04576     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
04577     return k.run(2, globalThreads, NULL, false);
04578 }
04579 
04580 #endif
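// [Editor's note] The OpenCL branch above is only attempted from cv::remap()
// when the output is a UMat (see the CV_OCL_RUN guard below); a hedged usage
// sketch, assuming src/map1/map2 are valid Mat objects defined elsewhere:
#if 0   // illustrative only
cv::UMat usrc = src.getUMat(cv::ACCESS_READ), udst;
cv::remap(usrc, udst, map1, map2, cv::INTER_LINEAR,
          cv::BORDER_CONSTANT, cv::Scalar());   // may dispatch to ocl_remap
#endif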
04581 
04582 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && IPP_DISABLE_BLOCK
04583 
04584 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
04585                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
04586                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
04587 
04588 class IPPRemapInvoker :
04589         public ParallelLoopBody
04590 {
04591 public:
04592     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
04593                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
04594         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
04595         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
04596     {
04597         *ok = true;
04598     }
04599 
04600     virtual void operator() (const Range & range) const
04601     {
04602         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
04603         Mat dstRoi = dst.rowRange(range);
04604         IppiSize dstRoiSize = ippiSize(dstRoi.size());
04605         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
04606 
04607         if (borderType == BORDER_CONSTANT &&
04608                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
04609         {
04610             *ok = false;
04611             return;
04612         }
04613 
04614         if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
04615                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
04616                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
04617             *ok = false;
04618         else
04619         {
04620             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
04621         }
04622     }
04623 
04624 private:
04625     Mat & src, & dst, & map1, & map2;
04626     ippiRemap ippFunc;
04627     int ippInterpolation, borderType;
04628     Scalar borderValue;
04629     bool * ok;
04630 };
04631 
04632 #endif
04633 
04634 }
04635 
04636 void cv::remap( InputArray _src, OutputArray _dst,
04637                 InputArray _map1, InputArray _map2,
04638                 int interpolation, int borderType, const Scalar & borderValue )
04639 {
04640     static RemapNNFunc nn_tab[] =
04641     {
04642         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
04643         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
04644     };
04645 
04646     static RemapFunc linear_tab[] =
04647     {
04648         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
04649         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
04650         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
04651         remapBilinear<Cast<float, float>, RemapNoVec, float>,
04652         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
04653     };
04654 
04655     static RemapFunc cubic_tab[] =
04656     {
04657         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
04658         remapBicubic<Cast<float, ushort>, float, 1>,
04659         remapBicubic<Cast<float, short>, float, 1>, 0,
04660         remapBicubic<Cast<float, float>, float, 1>,
04661         remapBicubic<Cast<double, double>, float, 1>, 0
04662     };
04663 
04664     static RemapFunc lanczos4_tab[] =
04665     {
04666         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
04667         remapLanczos4<Cast<float, ushort>, float, 1>,
04668         remapLanczos4<Cast<float, short>, float, 1>, 0,
04669         remapLanczos4<Cast<float, float>, float, 1>,
04670         remapLanczos4<Cast<double, double>, float, 1>, 0
04671     };
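    // [Editor's note] Each table above is indexed by image depth (CV_8U,
    // CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, ...); null entries mark
    // unsupported depths, which the CV_Assert checks further down reject.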
04672 
04673     CV_Assert( _map1.size().area() > 0 );
04674     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
04675 
04676 #ifdef HAVE_OPENCL
04677     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
04678                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
04679 #endif
04680 
04681     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
04682     _dst.create( map1.size(), src.type() );
04683     Mat dst = _dst.getMat();
04684     if( dst.data == src.data )
04685         src = src.clone();
04686 
04687     if( interpolation == INTER_AREA )
04688         interpolation = INTER_LINEAR;
04689 
04690     int type = src.type(), depth = CV_MAT_DEPTH(type);
04691 
04692 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && IPP_DISABLE_BLOCK
04693     CV_IPP_CHECK()
04694     {
04695         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
04696                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
04697                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
04698         {
04699             int ippInterpolation =
04700                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
04701                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
04702 
04703             ippiRemap ippFunc =
04704                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
04705                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
04706                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
04707                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
04708                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
04709                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
04710                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
04711                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
04712                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
04713 
04714             if (ippFunc)
04715             {
04716                 bool ok;
04717                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
04718                                         borderType, borderValue, &ok);
04719                 Range range(0, dst.rows);
04720                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
04721 
04722                 if (ok)
04723                 {
04724                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
04725                     return;
04726                 }
04727                 setIppErrorStatus();
04728             }
04729         }
04730     }
04731 #endif
04732 
04733     RemapNNFunc nnfunc = 0;
04734     RemapFunc ifunc = 0;
04735     const void* ctab = 0;
04736     bool fixpt = depth == CV_8U;
04737     bool planar_input = false;
04738 
04739     if( interpolation == INTER_NEAREST )
04740     {
04741         nnfunc = nn_tab[depth];
04742         CV_Assert( nnfunc != 0 );
04743     }
04744     else
04745     {
04746         if( interpolation == INTER_LINEAR )
04747             ifunc = linear_tab[depth];
04748         else if( interpolation == INTER_CUBIC )
04749             ifunc = cubic_tab[depth];
04750         else if( interpolation == INTER_LANCZOS4 )
04751             ifunc = lanczos4_tab[depth];
04752         else
04753             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
04754         CV_Assert( ifunc != 0 );
04755         ctab = initInterTab2D( interpolation, fixpt );
04756     }
04757 
04758     const Mat *m1 = &map1, *m2 = &map2;
04759 
04760     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
04761         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
04762     {
04763         if( map1.type() != CV_16SC2 )
04764             std::swap(m1, m2);
04765     }
04766     else
04767     {
04768         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
04769             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
04770         planar_input = map1.channels() == 1;
04771     }
04772 
04773     RemapInvoker invoker(src, dst, m1, m2,
04774                          borderType, borderValue, planar_input, nnfunc, ifunc,
04775                          ctab);
04776     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
04777 }
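// [Editor's note] A self-contained usage sketch of the function above; the
// helper name and maps are hypothetical. It builds a horizontal-flip map in
// the planar CV_32FC1 format and remaps with bilinear interpolation.
#if 0   // illustrative only
#include <opencv2/imgproc.hpp>

void flipWithRemap( const cv::Mat& src, cv::Mat& dst )
{
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for( int y = 0; y < src.rows; y++ )
        for( int x = 0; x < src.cols; x++ )
        {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x);  // mirror x
            mapy.at<float>(y, x) = (float)y;                   // keep y
        }
    cv::remap(src, dst, mapx, mapy, cv::INTER_LINEAR,
              cv::BORDER_CONSTANT, cv::Scalar());
}
#endif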
04778 
04779 
04780 void cv::convertMaps( InputArray _map1, InputArray _map2,
04781                       OutputArray _dstmap1, OutputArray _dstmap2,
04782                       int dstm1type, bool nninterpolate )
04783 {
04784     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
04785     Size size = map1.size();
04786     const Mat *m1 = &map1, *m2 = &map2;
04787     int m1type = m1->type(), m2type = m2->type();
04788 
04789     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
04790                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
04791                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
04792                (m1type == CV_32FC2 && m2->empty()) );
04793 
04794     if( m2type == CV_16SC2 )
04795     {
04796         std::swap( m1, m2 );
04797         std::swap( m1type, m2type );
04798     }
04799 
04800     if( dstm1type <= 0 )
04801         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
04802     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
04803     _dstmap1.create( size, dstm1type );
04804     dstmap1 = _dstmap1.getMat();
04805 
04806     if( !nninterpolate && dstm1type != CV_32FC2 )
04807     {
04808         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
04809         dstmap2 = _dstmap2.getMat();
04810     }
04811     else
04812         _dstmap2.release();
04813 
04814     if( m1type == dstm1type || (nninterpolate &&
04815         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
04816         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
04817     {
04818         m1->convertTo( dstmap1, dstmap1.type() );
04819         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
04820             m2->copyTo( dstmap2 );
04821         return;
04822     }
04823 
04824     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
04825     {
04826         Mat vdata[] = { *m1, *m2 };
04827         merge( vdata, 2, dstmap1 );
04828         return;
04829     }
04830 
04831     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
04832     {
04833         Mat mv[] = { dstmap1, dstmap2 };
04834         split( *m1, mv );
04835         return;
04836     }
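    // [Editor's note] The loops below handle the remaining conversions from
    // floating-point maps to the packed fixed-point form remap() prefers:
    // dstmap1 (CV_16SC2) receives the integer parts (coordinate >> INTER_BITS)
    // and dstmap2 (CV_16UC1) the combined fractional index
    // (iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)).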
04837 
04838     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
04839         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
04840     {
04841         size.width *= size.height;
04842         size.height = 1;
04843     }
04844 
04845 #if CV_SSE2
04846     bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
04847 #endif
04848 #if CV_SSE4_1
04849     bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
04850 #endif
04851 
04852     const float scale = 1.f/INTER_TAB_SIZE;
04853     int x, y;
04854     for( y = 0; y < size.height; y++ )
04855     {
04856         const float* src1f = m1->ptr<float>(y);
04857         const float* src2f = m2->ptr<float>(y);
04858         const short* src1 = (const short*)src1f;
04859         const ushort* src2 = (const ushort*)src2f;
04860 
04861         float* dst1f = dstmap1.ptr<float>(y);
04862         float* dst2f = dstmap2.ptr<float>(y);
04863         short* dst1 = (short*)dst1f;
04864         ushort* dst2 = (ushort*)dst2f;
04865         x = 0;
04866 
04867         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
04868         {
04869             if( nninterpolate )
04870             {
04871                 #if CV_NEON
04872                 for( ; x <= size.width - 8; x += 8 )
04873                 {
04874                     int16x8x2_t v_dst;
04875                     v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
04876                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
04877                     v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
04878                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
04879 
04880                     vst2q_s16(dst1 + (x << 1), v_dst);
04881                 }
04882                 #elif CV_SSE4_1
04883                 if (useSSE4_1)
04884                 {
04885                     for( ; x <= size.width - 16; x += 16 )
04886                     {
04887                         __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
04888                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
04889                         __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
04890                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
04891 
04892                         __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
04893                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
04894                         __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
04895                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
04896 
04897                         _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
04898 
04899                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
04900                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
04901                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
04902                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
04903                     }
04904                 }
04905                 #endif
04906                 for( ; x < size.width; x++ )
04907                 {
04908                     dst1[x*2] = saturate_cast<short>(src1f[x]);
04909                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
04910                 }
04911             }
04912             else
04913             {
04914                 #if CV_NEON
04915                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
04916                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
04917 
04918                 for( ; x <= size.width - 8; x += 8 )
04919                 {
04920                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
04921                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
04922                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
04923                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
04924 
04925                     int16x8x2_t v_dst;
04926                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
04927                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
04928                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
04929                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
04930 
04931                     vst2q_s16(dst1 + (x << 1), v_dst);
04932 
04933                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
04934                                                               vandq_s32(v_ix0, v_mask)));
04935                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
04936                                                               vandq_s32(v_ix1, v_mask)));
04937                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
04938                 }
04939                 #elif CV_SSE4_1
04940                 if (useSSE4_1)
04941                 {
04942                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
04943                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
04944 
04945                     for( ; x <= size.width - 16; x += 16 )
04946                     {
04947                         __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
04948                         __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
04949                         __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
04950                         __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
04951 
04952                         __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
04953                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
04954                         __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
04955                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
04956                         __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
04957                                                         _mm_and_si128(v_ix0, v_its1));
04958                         __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
04959                                                         _mm_and_si128(v_ix1, v_its1));
04960                         _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
04961 
04962                         v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
04963                         v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
04964                         v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
04965                         v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
04966 
04967                         __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
04968                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
04969                         __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
04970                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
04971                         v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
04972                                                 _mm_and_si128(v_ix0, v_its1));
04973                         v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
04974                                                 _mm_and_si128(v_ix1, v_its1));
04975                         _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
04976 
04977                         _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
04978 
04979                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
04980                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
04981                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
04982                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
04983                     }
04984                 }
04985                 #endif
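                // scalar tail: dst1 receives the integer parts of the coordinates,
                // dst2 the packed fractional index (fy*INTER_TAB_SIZE + fx) that
                // remap() uses to select an interpolation table entry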
04986                 for( ; x < size.width; x++ )
04987                 {
04988                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
04989                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
04990                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
04991                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
04992                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
04993                 }
04994             }
04995         }
04996         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
04997         {
04998             if( nninterpolate )
04999             {
05000                 #if CV_NEON
05001                 for( ; x <= (size.width << 1) - 8; x += 8 )
05002                     vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
05003                                                      vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
05004                 #elif CV_SSE2
05005                 for( ; x <= (size.width << 1) - 8; x += 8 )
05006                 {
05007                     _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
05008                                                                             _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
05009                 }
05010                 #endif
05011                 for( ; x < size.width; x++ )
05012                 {
05013                     dst1[x*2] = saturate_cast<short>(src1f[x*2]);
05014                     dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
05015                 }
05016             }
05017             else
05018             {
05019                 #if CV_NEON
05020                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
05021                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
05022 
05023                 for( ; x <= size.width - 8; x += 8 )
05024                 {
05025                     float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
05026                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
05027                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
05028                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
05029                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
05030 
05031                     int16x8x2_t v_dst;
05032                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
05033                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
05034                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
05035                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
05036 
05037                     vst2q_s16(dst1 + (x << 1), v_dst);
05038 
05039                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
05040                                                               vandq_s32(v_ix0, v_mask)));
05041                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
05042                                                               vandq_s32(v_ix1, v_mask)));
05043                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
05044                 }
05045                 #elif CV_SSE4_1
05046                 if (useSSE4_1)
05047                 {
05048                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
05049                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
05050                     __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
05051 
05052                     for( ; x <= size.width - 4; x += 4 )
05053                     {
05054                         __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
05055                         __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
05056 
05057                         __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
05058                                                          _mm_srai_epi32(v_src1, INTER_BITS));
05059                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
05060 
05061                         // x0 y0 x1 y1 . . .
05062                         v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
05063                                                  _mm_and_si128(v_src1, v_its1));
05064                         __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
05065                                                       _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
05066                         _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
05067                     }
05068                 }
05069                 #endif
05070                 for( ; x < size.width; x++ )
05071                 {
05072                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
05073                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
05074                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
05075                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
05076                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
05077                 }
05078             }
05079         }
05080         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
05081         {
05082             #if CV_NEON
05083             uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
05084             uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
05085             float32x4_t v_scale = vdupq_n_f32(scale);
05086 
05087             for( ; x <= size.width - 8; x += 8)
05088             {
05089                 uint32x4_t v_fxy1, v_fxy2;
05090                 if (src2)
05091                 {
05092                     uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
05093                     v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
05094                     v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
05095                 }
05096                 else
05097                     v_fxy1 = v_fxy2 = v_zero;
05098 
05099                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
05100                 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
05101                                                v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
05102                 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
05103                                                v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
05104                 vst1q_f32(dst1f + x, v_dst1);
05105                 vst1q_f32(dst2f + x, v_dst2);
05106 
05107                 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
05108                                    v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
05109                 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
05110                                    v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
05111                 vst1q_f32(dst1f + x + 4, v_dst1);
05112                 vst1q_f32(dst2f + x + 4, v_dst2);
05113             }
05114             #elif CV_SSE2
05115             __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
05116             __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
05117             __m128 v_scale = _mm_set1_ps(scale);
05118 
05119             for( ; x <= size.width - 16; x += 16)
05120             {
05121                 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
05122                 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
05123                 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
05124                 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
05125 
05126                 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
05127 
05128                 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
05129                 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
05130                 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
05131                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05132                 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
05133                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05134                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
05135                 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
05136                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05137                 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
05138                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05139 
05140                 v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
05141                 v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); // low half first: offsets for pixels x+8 .. x+11
05142                 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
05143                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05144                 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
05145                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05146                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
05147                 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
05148                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05149                 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
05150                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05151             }
05152             #endif
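            // scalar tail: unpack the fixed-point map back to floats - integer
            // coordinate plus the fractional part from the dst2 index, rescaled by 'scale'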
05153             for( ; x < size.width; x++ )
05154             {
05155                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
05156                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
05157                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
05158             }
05159         }
05160         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
05161         {
05162             #if CV_NEON
05163             int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
05164             int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
05165             float32x4_t v_scale = vdupq_n_f32(scale);
05166 
05167             for( ; x <= size.width - 8; x += 8)
05168             {
05169                 int32x4_t v_fxy1, v_fxy2;
05170                 if (src2)
05171                 {
05172                     int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
05173                     v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
05174                     v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
05175                 }
05176                 else
05177                     v_fxy1 = v_fxy2 = v_zero;
05178 
05179                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
05180                 float32x4x2_t v_dst;
05181                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
05182                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
05183                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
05184                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
05185                 vst2q_f32(dst1f + (x << 1), v_dst);
05186 
05187                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
05188                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
05189                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
05190                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
05191                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
05192             }
05193             #elif CV_SSE2
05194             if (useSSE2)
05195             {
05196                 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
05197                 __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi16(INTER_TAB_SIZE-1);
05198                 __m128 v_scale = _mm_set1_ps(scale);
05199 
05200                 for ( ; x <= size.width - 4; x += 4)
05201                 {
05202                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
05203                     __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadl_epi64((__m128i const *)(src2 + x)), v_mask2) : v_zero;
05204                     // interleave x (low INTER_BITS) and y parts of the packed offset: fx0 fy0 fx1 fy1 ...
05205                     __m128i v_fxy_i = _mm_unpacklo_epi16(_mm_and_si128(v_fxy, v_mask), _mm_srli_epi16(v_fxy, INTER_BITS));
05206 
05207                     __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy_i, v_zero)), v_scale);
05208                     _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)), v_add));
05209 
05210                     v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy_i, v_zero)), v_scale);
05211                     _mm_storeu_ps(dst1f + x * 2 + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)), v_add));
05212                 }
05213             }
05214             #endif
05215             for( ; x < size.width; x++ )
05216             {
05217                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
05218                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
05219                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
05220             }
05221         }
05222         else
05223             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
05224     }
05225 }
05226 
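/* Usage sketch (illustrative, not part of the library source): convertMaps(),
   whose type-pair branches end above, repacks float maps into the fixed-point
   CV_16SC2 + CV_16UC1 form that remap() consumes fastest. The file name below
   is hypothetical.

   @code
   #include <opencv2/imgproc.hpp>
   #include <opencv2/imgcodecs.hpp>

   cv::Mat src = cv::imread("input.png");
   cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
   for( int y = 0; y < src.rows; y++ )        // example map: horizontal flip
       for( int x = 0; x < src.cols; x++ )
       {
           mapx.at<float>(y, x) = (float)(src.cols - 1 - x);
           mapy.at<float>(y, x) = (float)y;
       }

   cv::Mat fixedXY, fixedFrac, dst;
   cv::convertMaps(mapx, mapy, fixedXY, fixedFrac, CV_16SC2, false);
   cv::remap(src, dst, fixedXY, fixedFrac, cv::INTER_LINEAR);
   @endcode
*/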
05227 
05228 namespace cv
05229 {
05230 
05231 class WarpAffineInvoker :
05232     public ParallelLoopBody
05233 {
05234 public:
05235     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
05236                       const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
05237         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
05238         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
05239         M(_M)
05240     {
05241     }
05242 
05243     virtual void operator() (const Range& range) const
05244     {
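        // The destination is split into tiles of at most BLOCK_SZ*BLOCK_SZ pixels;
        // per tile the coordinate map is built in AB_BITS fixed point (adelta/bdelta
        // hold the precomputed x-dependent terms M[0]*x and M[3]*x), then remap()
        // performs the actual sampling.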
05245         const int BLOCK_SZ = 64;
05246         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
05247         const int AB_BITS = MAX(10, (int)INTER_BITS);
05248         const int AB_SCALE = 1 << AB_BITS;
05249         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
05250     #if CV_SSE2
05251         bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
05252     #endif
05253     #if CV_SSE4_1
05254         bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
05255     #endif
05256 
05257         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
05258         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
05259         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
05260 
05261         for( y = range.start; y < range.end; y += bh0 )
05262         {
05263             for( x = 0; x < dst.cols; x += bw0 )
05264             {
05265                 int bw = std::min( bw0, dst.cols - x);
05266                 int bh = std::min( bh0, range.end - y);
05267 
05268                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
05269                 Mat dpart(dst, Rect(x, y, bw, bh));
05270 
05271                 for( y1 = 0; y1 < bh; y1++ )
05272                 {
05273                     short* xy = XY + y1*bw*2;
05274                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
05275                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
05276 
05277                     if( interpolation == INTER_NEAREST )
05278                     {
05279                         x1 = 0;
05280                         #if CV_NEON
05281                         int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
05282                         for( ; x1 <= bw - 8; x1 += 8 )
05283                         {
05284                             int16x8x2_t v_dst;
05285                             v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
05286                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
05287                             v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
05288                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
05289 
05290                             vst2q_s16(xy + (x1 << 1), v_dst);
05291                         }
05292                         #elif CV_SSE4_1
05293                         if (useSSE4_1)
05294                         {
05295                             __m128i v_X0 = _mm_set1_epi32(X0);
05296                             __m128i v_Y0 = _mm_set1_epi32(Y0);
05297                             for ( ; x1 <= bw - 16; x1 += 16)
05298                             {
05299                                 __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
05300                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
05301                                 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
05302                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
05303 
05304                                 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
05305                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
05306                                 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
05307                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
05308 
05309                                 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
05310 
05311                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
05312                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
05313                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
05314                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
05315                             }
05316                         }
05317                         #endif
05318                         for( ; x1 < bw; x1++ )
05319                         {
05320                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
05321                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
05322                             xy[x1*2] = saturate_cast<short>(X);
05323                             xy[x1*2+1] = saturate_cast<short>(Y);
05324                         }
05325                     }
05326                     else
05327                     {
05328                         short* alpha = A + y1*bw;
05329                         x1 = 0;
05330                     #if CV_SSE2
05331                         if( useSSE2 )
05332                         {
05333                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
05334                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
05335                             for( ; x1 <= bw - 8; x1 += 8 )
05336                             {
05337                                 __m128i tx0, tx1, ty0, ty1;
05338                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
05339                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
05340                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
05341                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
05342 
05343                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
05344                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
05345                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
05346                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
05347 
05348                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
05349                                                             _mm_and_si128(tx1, fxy_mask));
05350                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
05351                                                             _mm_and_si128(ty1, fxy_mask));
05352                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
05353                                                             _mm_srai_epi32(tx1, INTER_BITS));
05354                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
05355                                                     _mm_srai_epi32(ty1, INTER_BITS));
05356                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
05357 
05358                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
05359                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
05360                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
05361                             }
05362                         }
05363                     #elif CV_NEON
05364                         int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
05365                         for( ; x1 <= bw - 8; x1 += 8 )
05366                         {
05367                             int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
05368                             int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
05369                             int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
05370                             int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
05371 
05372                             int16x8x2_t v_xy;
05373                             v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
05374                             v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
05375 
05376                             vst2q_s16(xy + (x1 << 1), v_xy);
05377 
05378                             int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
05379                                                                      vandq_s32(v_X0, v_mask)));
05380                             int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
05381                                                                      vandq_s32(v_X1, v_mask)));
05382                             vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
05383                         }
05384                     #endif
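                        // scalar tail: alpha packs the fractional parts as
                        // (fy*INTER_TAB_SIZE + fx), the same layout convertMaps() produces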
05385                         for( ; x1 < bw; x1++ )
05386                         {
05387                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
05388                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
05389                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
05390                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
05391                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
05392                                     (X & (INTER_TAB_SIZE-1)));
05393                         }
05394                     }
05395                 }
05396 
05397                 if( interpolation == INTER_NEAREST )
05398                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
05399                 else
05400                 {
05401                     Mat _matA(bh, bw, CV_16U, A);
05402                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
05403                 }
05404             }
05405         }
05406     }
05407 
05408 private:
05409     Mat src;
05410     Mat dst;
05411     int interpolation, borderType;
05412     Scalar borderValue;
05413     int *adelta, *bdelta;
05414     double *M;
05415 };
05416 
05417 
05418 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
05419 class IPPWarpAffineInvoker :
05420     public ParallelLoopBody
05421 {
05422 public:
05423     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
05424                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
05425         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
05426         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
05427     {
05428         *ok = true;
05429     }
05430 
05431     virtual void operator() (const Range& range) const
05432     {
05433         IppiSize srcsize = { src.cols, src.rows };
05434         IppiRect srcroi = { 0, 0, src.cols, src.rows };
05435         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
05436         int cnn = src.channels();
05437         if( borderType == BORDER_CONSTANT )
05438         {
05439             IppiSize setSize = { dst.cols, range.end - range.start };
05440             void *dataPointer = dst.ptr(range.start);
05441             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
05442             {
05443                 *ok = false;
05444                 return;
05445             }
05446         }
05447 
05448         // Aug 2013: problem in IPP 7.1 and 8.0: the function sometimes returns ippStsCoeffErr
05449         IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
05450                                 (int)dst.step[0], dstroi, coeffs, mode );
05451         if( status < 0)
05452             *ok = false;
05453         else
05454         {
05455             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05456         }
05457     }
05458 private:
05459     Mat &src;
05460     Mat &dst;
05461     int mode;
05462     double (&coeffs)[2][3];
05463     int borderType;
05464     Scalar borderValue;
05465     ippiWarpAffineBackFunc func;
05466     bool *ok;
05467     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
05468 };
05469 #endif
05470 
05471 #ifdef HAVE_OPENCL
05472 
05473 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
05474 
05475 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
05476                               Size dsize, int flags, int borderType, const Scalar& borderValue,
05477                               int op_type)
05478 {
05479     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
05480     const ocl::Device & dev = ocl::Device::getDefault();
05481 
05482     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
05483     const bool doubleSupport = dev.doubleFPConfig() > 0;
05484 
05485     int interpolation = flags & INTER_MAX;
05486     if( interpolation == INTER_AREA )
05487         interpolation = INTER_LINEAR;
05488     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
05489 
05490     if ( !(borderType == cv::BORDER_CONSTANT &&
05491            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
05492          (!doubleSupport && depth == CV_64F) || cn > 4)
05493         return false;
05494 
05495     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
05496     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
05497                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
05498     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
05499 
05500     int scalarcn = cn == 3 ? 4 : cn;
05501     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
05502     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
05503     int sctype = CV_MAKETYPE(wdepth, scalarcn);
05504 
05505     ocl::Kernel k;
05506     String opts;
05507     if (interpolation == INTER_NEAREST)
05508     {
05509         opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
05510                       ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
05511                       ocl::typeToStr(CV_MAT_DEPTH(type)),
05512                       ocl::typeToStr(sctype), cn, rowsPerWI);
05513     }
05514     else
05515     {
05516         char cvt[2][50];
05517         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
05518                       " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
05519                       interpolationMap[interpolation], ocl::typeToStr(type),
05520                       ocl::typeToStr(CV_MAT_DEPTH(type)),
05521                       ocl::typeToStr(sctype),
05522                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
05523                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
05524                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
05525                       doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
05526     }
05527 
05528     k.create(kernelName, program, opts);
05529     if (k.empty())
05530         return false;
05531 
05532     double borderBuf[] = { 0, 0, 0, 0 };
05533     scalarToRawData(borderValue, borderBuf, sctype);
05534 
05535     UMat src = _src.getUMat(), M0;
05536     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
05537     UMat dst = _dst.getUMat();
05538 
05539     double M[9];
05540     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
05541     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
05542     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
05543                M1.rows == matRows && M1.cols == 3 );
05544     M1.convertTo(matM, matM.type());
05545 
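    // same convention as the CPU path below: unless WARP_INVERSE_MAP is set,
    // invert the matrix so the kernel can map destination pixels back to the source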
05546     if( !(flags & WARP_INVERSE_MAP) )
05547     {
05548         if (op_type == OCL_OP_PERSPECTIVE)
05549             invert(matM, matM);
05550         else
05551         {
05552             double D = M[0]*M[4] - M[1]*M[3];
05553             D = D != 0 ? 1./D : 0;
05554             double A11 = M[4]*D, A22=M[0]*D;
05555             M[0] = A11; M[1] *= -D;
05556             M[3] *= -D; M[4] = A22;
05557             double b1 = -M[0]*M[2] - M[1]*M[5];
05558             double b2 = -M[3]*M[2] - M[4]*M[5];
05559             M[2] = b1; M[5] = b2;
05560         }
05561     }
05562     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
05563 
05564     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
05565            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
05566 
05567     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
05568     return k.run(2, globalThreads, NULL, false);
05569 }
05570 
05571 #endif
05572 
05573 }
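/* Usage sketch (illustrative): ocl_warpTransform() above is reached through the
   transparent API whenever the caller passes UMat data, e.g.:

   @code
   #include <opencv2/core.hpp>
   #include <opencv2/imgproc.hpp>

   cv::UMat usrc, udst;
   src.copyTo(usrc);                                    // src is an existing cv::Mat
   cv::Mat M = cv::getRotationMatrix2D(
       cv::Point2f(src.cols * 0.5f, src.rows * 0.5f), 30.0, 1.0);
   cv::warpAffine(usrc, udst, M, src.size());           // dispatched via CV_OCL_RUN
   @endcode
*/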
05574 
05575 
05576 void cv::warpAffine( InputArray _src, OutputArray _dst,
05577                      InputArray _M0, Size dsize,
05578                      int flags, int borderType, const Scalar & borderValue )
05579 {
05580 #ifdef HAVE_OPENCL
05581     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
05582                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
05583                                  borderValue, OCL_OP_AFFINE))
05584 #endif
05585 
05586     Mat src = _src.getMat(), M0 = _M0.getMat();
05587     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
05588     Mat dst = _dst.getMat();
05589     CV_Assert( src.cols > 0 && src.rows > 0 );
05590     if( dst.data == src.data )
05591         src = src.clone();
05592 
05593     double M[6];
05594     Mat matM(2, 3, CV_64F, M);
05595     int interpolation = flags & INTER_MAX;
05596     if( interpolation == INTER_AREA )
05597         interpolation = INTER_LINEAR;
05598 
05599     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
05600     M0.convertTo(matM, matM.type());
05601 
05602 #ifdef HAVE_TEGRA_OPTIMIZATION
05603     if( tegra::useTegra() && tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
05604         return;
05605 #endif
05606 
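    // warpAffine() maps destination pixels back to source coordinates, so unless
    // WARP_INVERSE_MAP was passed, invert the 2x3 matrix [A|b] in place:
    // A' = A^-1, b' = -A^-1 * b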
05607     if( !(flags & WARP_INVERSE_MAP) )
05608     {
05609         double D = M[0]*M[4] - M[1]*M[3];
05610         D = D != 0 ? 1./D : 0;
05611         double A11 = M[4]*D, A22=M[0]*D;
05612         M[0] = A11; M[1] *= -D;
05613         M[3] *= -D; M[4] = A22;
05614         double b1 = -M[0]*M[2] - M[1]*M[5];
05615         double b2 = -M[3]*M[2] - M[4]*M[5];
05616         M[2] = b1; M[5] = b2;
05617     }
05618 
05619     int x;
05620     AutoBuffer<int>  _abdelta(dst.cols*2);
05621     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
05622     const int AB_BITS = MAX(10, (int)INTER_BITS);
05623     const int AB_SCALE = 1 << AB_BITS;
05624 
05625 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
05626     CV_IPP_CHECK()
05627     {
05628         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
05629         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
05630            ( cn == 1 || cn == 3 || cn == 4 ) &&
05631            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
05632            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
05633         {
05634             ippiWarpAffineBackFunc ippFunc = 0;
05635             if ((flags & WARP_INVERSE_MAP) != 0)
05636             {
05637                 ippFunc =
05638                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
05639                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
05640                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
05641                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
05642                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
05643                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
05644                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
05645                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
05646                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
05647                 0;
05648             }
05649             else
05650             {
05651                 ippFunc =
05652                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
05653                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
05654                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
05655                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
05656                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
05657                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
05658                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
05659                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
05660                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
05661                 0;
05662             }
05663             int mode =
05664             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
05665             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
05666             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
05667             0;
05668             CV_Assert(mode && ippFunc);
05669 
05670             double coeffs[2][3];
05671             for( int i = 0; i < 2; i++ )
05672                 for( int j = 0; j < 3; j++ )
05673                     coeffs[i][j] = matM.at<double>(i, j);
05674 
05675             bool ok;
05676             Range range(0, dst.rows);
05677             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
05678             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
05679             if( ok )
05680             {
05681                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05682                 return;
05683             }
05684             setIppErrorStatus();
05685         }
05686     }
05687 #endif
05688 
05689     for( x = 0; x < dst.cols; x++ )
05690     {
05691         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
05692         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
05693     }
05694 
05695     Range range(0, dst.rows);
05696     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
05697                               borderValue, adelta, bdelta, M);
05698     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
05699 }
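/* Usage sketch (illustrative, not part of the library source): the forward form
   of cv::warpAffine() defined above, with an explicit 2x3 matrix. The file name
   and matrix values are arbitrary.

   @code
   #include <opencv2/imgproc.hpp>
   #include <opencv2/imgcodecs.hpp>

   cv::Mat src = cv::imread("input.png"), dst;
   cv::Mat M = (cv::Mat_<double>(2, 3) << 0.8, 0.0, 20.0,
                                          0.0, 0.8, 10.0); // scale 0.8, shift (20,10)
   cv::warpAffine(src, dst, M, src.size(), cv::INTER_LINEAR,
                  cv::BORDER_CONSTANT, cv::Scalar());
   @endcode
*/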
05700 
05701 
05702 namespace cv
05703 {
05704 
05705 class WarpPerspectiveInvoker :
05706     public ParallelLoopBody
05707 {
05708 public:
05709     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
05710                            int _borderType, const Scalar &_borderValue) :
05711         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
05712         borderType(_borderType), borderValue(_borderValue)
05713     {
05714     }
05715 
05716     virtual void operator() (const Range& range) const
05717     {
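        // Per destination pixel the homography gives X = (M[0]*x + M[1]*y + M[2])/W,
        // Y = (M[3]*x + M[4]*y + M[5])/W with W = M[6]*x + M[7]*y + M[8] (W == 0 maps
        // to 0); as in the affine case, the image is processed in blocks fed to remap().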
05718         const int BLOCK_SZ = 32;
05719         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
05720         int x, y, x1, y1, width = dst.cols, height = dst.rows;
05721 
05722         int bh0 = std::min(BLOCK_SZ/2, height);
05723         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
05724         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
05725 
05726         #if CV_SSE4_1
05727         bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
05728         __m128d v_M0 = _mm_set1_pd(M[0]);
05729         __m128d v_M3 = _mm_set1_pd(M[3]);
05730         __m128d v_M6 = _mm_set1_pd(M[6]);
05731         __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
05732         __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
05733         __m128d v_2 = _mm_set1_pd(2),
05734                 v_zero = _mm_setzero_pd(),
05735                 v_1 = _mm_set1_pd(1),
05736                 v_its = _mm_set1_pd(INTER_TAB_SIZE);
05737         __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
05738         #endif
05739 
05740         for( y = range.start; y < range.end; y += bh0 )
05741         {
05742             for( x = 0; x < width; x += bw0 )
05743             {
05744                 int bw = std::min( bw0, width - x);
05745                 int bh = std::min( bh0, range.end - y); // height
05746 
05747                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
05748                 Mat dpart(dst, Rect(x, y, bw, bh));
05749 
05750                 for( y1 = 0; y1 < bh; y1++ )
05751                 {
05752                     short* xy = XY + y1*bw*2;
05753                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
05754                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
05755                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
05756 
05757                     if( interpolation == INTER_NEAREST )
05758                     {
05759                         x1 = 0;
05760 
05761                         #if CV_SSE4_1
05762                         if (haveSSE4_1)
05763                         {
05764                             __m128d v_X0d = _mm_set1_pd(X0);
05765                             __m128d v_Y0d = _mm_set1_pd(Y0);
05766                             __m128d v_W0 = _mm_set1_pd(W0);
05767                             __m128d v_x1 = _mm_set_pd(1, 0);
05768 
05769                             for( ; x1 <= bw - 16; x1 += 16 )
05770                             {
05771                                 // 0-3
05772                                 __m128i v_X0, v_Y0;
05773                                 {
05774                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05775                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05776                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05777                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05778                                     v_x1 = _mm_add_pd(v_x1, v_2);
05779 
05780                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05781                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05782                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05783                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05784                                     v_x1 = _mm_add_pd(v_x1, v_2);
05785 
05786                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05787                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05788                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05789                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05790                                 }
05791 
05792                                 // 4-7
05793                                 __m128i v_X1, v_Y1;
05794                                 {
05795                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05796                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05797                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05798                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05799                                     v_x1 = _mm_add_pd(v_x1, v_2);
05800 
05801                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05802                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05803                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05804                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05805                                     v_x1 = _mm_add_pd(v_x1, v_2);
05806 
05807                                     v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05808                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05809                                     v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05810                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05811                                 }
05812 
05813                                 // 8-11
05814                                 __m128i v_X2, v_Y2;
05815                                 {
05816                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05817                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05818                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05819                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05820                                     v_x1 = _mm_add_pd(v_x1, v_2);
05821 
05822                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05823                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05824                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05825                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05826                                     v_x1 = _mm_add_pd(v_x1, v_2);
05827 
05828                                     v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05829                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05830                                     v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05831                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05832                                 }
05833 
05834                                 // 12-15
05835                                 __m128i v_X3, v_Y3;
05836                                 {
05837                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05838                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05839                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05840                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05841                                     v_x1 = _mm_add_pd(v_x1, v_2);
05842 
05843                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05844                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05845                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05846                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05847                                     v_x1 = _mm_add_pd(v_x1, v_2);
05848 
05849                                     v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05850                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05851                                     v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05852                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05853                                 }
05854 
05855                                 // convert to 16s
05856                                 v_X0 = _mm_packs_epi32(v_X0, v_X1);
05857                                 v_X1 = _mm_packs_epi32(v_X2, v_X3);
05858                                 v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
05859                                 v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
05860 
05861                                 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
05862 
05863                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
05864                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
05865                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
05866                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
05867                             }
05868                         }
05869                         #endif
05870 
05871                         for( ; x1 < bw; x1++ )
05872                         {
05873                             double W = W0 + M[6]*x1;
05874                             W = W ? 1./W : 0;
05875                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
05876                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
05877                             int X = saturate_cast<int>(fX);
05878                             int Y = saturate_cast<int>(fY);
05879 
05880                             xy[x1*2] = saturate_cast<short>(X);
05881                             xy[x1*2+1] = saturate_cast<short>(Y);
05882                         }
05883                     }
05884                     else
05885                     {
05886                         short* alpha = A + y1*bw;
05887                         x1 = 0;
05888 
05889                         #if CV_SSE4_1
05890                         if (haveSSE4_1)
05891                         {
05892                             __m128d v_X0d = _mm_set1_pd(X0);
05893                             __m128d v_Y0d = _mm_set1_pd(Y0);
05894                             __m128d v_W0 = _mm_set1_pd(W0);
05895                             __m128d v_x1 = _mm_set_pd(1, 0);
05896 
05897                             for( ; x1 <= bw - 16; x1 += 16 )
05898                             {
05899                                 // 0-3
05900                                 __m128i v_X0, v_Y0;
05901                                 {
05902                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05903                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05904                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05905                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05906                                     v_x1 = _mm_add_pd(v_x1, v_2);
05907 
05908                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05909                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05910                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05911                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05912                                     v_x1 = _mm_add_pd(v_x1, v_2);
05913 
05914                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05915                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05916                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05917                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05918                                 }
05919 
05920                                 // 4-7
05921                                 __m128i v_X1, v_Y1;
05922                                 {
05923                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05924                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05925                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05926                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05927                                     v_x1 = _mm_add_pd(v_x1, v_2);
05928 
05929                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05930                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05931                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05932                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05933                                     v_x1 = _mm_add_pd(v_x1, v_2);
05934 
05935                                     v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05936                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05937                                     v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05938                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05939                                 }
05940 
05941                                 // 8-11
05942                                 __m128i v_X2, v_Y2;
05943                                 {
05944                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05945                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05946                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05947                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05948                                     v_x1 = _mm_add_pd(v_x1, v_2);
05949 
05950                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05951                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05952                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05953                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05954                                     v_x1 = _mm_add_pd(v_x1, v_2);
05955 
05956                                     v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05957                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05958                                     v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05959                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05960                                 }
05961 
05962                                 // 12-15
05963                                 __m128i v_X3, v_Y3;
05964                                 {
05965                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05966                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05967                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05968                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05969                                     v_x1 = _mm_add_pd(v_x1, v_2);
05970 
05971                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05972                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05973                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05974                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05975                                     v_x1 = _mm_add_pd(v_x1, v_2);
05976 
05977                                     v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05978                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05979                                     v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05980                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05981                                 }
05982 
05983                                 // store alpha
05984                                 __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
05985                                                                  _mm_and_si128(v_X0, v_itsi1));
05986                                 __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
05987                                                                  _mm_and_si128(v_X1, v_itsi1));
05988                                 _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
05989 
05990                                 v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
05991                                                          _mm_and_si128(v_X2, v_itsi1));
05992                                 v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
05993                                                          _mm_and_si128(v_X3, v_itsi1));
05994                                 _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
05995 
05996                                 // convert to 16s
05997                                 v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
05998                                 v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
05999                                 v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
06000                                 v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
06001 
06002                                 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
06003 
06004                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
06005                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
06006                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
06007                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
06008                             }
06009                         }
06010                         #endif
06011 
06012                         for( ; x1 < bw; x1++ )
06013                         {
06014                             double W = W0 + M[6]*x1;
06015                             W = W ? INTER_TAB_SIZE/W : 0;
06016                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
06017                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
06018                             int X = saturate_cast<int>(fX);
06019                             int Y = saturate_cast<int>(fY);
06020 
06021                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
06022                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
06023                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
06024                                                 (X & (INTER_TAB_SIZE-1)));
06025                         }
06026                     }
06027                 }
06028 
06029                 if( interpolation == INTER_NEAREST )
06030                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
06031                 else
06032                 {
06033                     Mat _matA(bh, bw, CV_16U, A);
06034                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
06035                 }
06036             }
06037         }
06038     }
06039 
06040 private:
06041     Mat src;
06042     Mat dst;
06043     double* M;
06044     int interpolation, borderType;
06045     Scalar borderValue;
06046 };
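
#if 0
// Illustrative sketch (not compiled): how the scalar tail loop above encodes
// one destination pixel (x, y) into the fixed-point maps consumed by remap().
// INTER_BITS is 5, so INTER_TAB_SIZE == 32: "xy" holds the integer source
// coordinate and "alpha" indexes the 32x32 interpolation-weight table. The
// real loop additionally clamps the scaled values to [INT_MIN, INT_MAX]
// before rounding.
static void encodePixelSketch( const double M[9], int x, int y,
                               short xy[2], short& alpha )
{
    double W = M[6]*x + M[7]*y + M[8];
    W = W ? INTER_TAB_SIZE/W : 0;                    // perspective divide, pre-scaled
    int X = saturate_cast<int>((M[0]*x + M[1]*y + M[2])*W);
    int Y = saturate_cast<int>((M[3]*x + M[4]*y + M[5])*W);
    xy[0] = saturate_cast<short>(X >> INTER_BITS);   // integer source column
    xy[1] = saturate_cast<short>(Y >> INTER_BITS);   // integer source row
    alpha = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                    (X & (INTER_TAB_SIZE-1)));       // fractional-part table index
}
#endif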
06047 
06048 
06049 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
06050 class IPPWarpPerspectiveInvoker :
06051     public ParallelLoopBody
06052 {
06053 public:
06054     IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
06055                               int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
06056         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
06057         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
06058     {
06059         *ok = true;
06060     }
06061 
06062     virtual void operator() (const Range& range) const
06063     {
06064         IppiSize srcsize = {src.cols, src.rows};
06065         IppiRect srcroi = {0, 0, src.cols, src.rows};
06066         IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
06067         int cnn = src.channels();
06068 
06069         if( borderType == BORDER_CONSTANT )
06070         {
06071             IppiSize setSize = {dst.cols, range.end - range.start};
06072             void *dataPointer = dst.ptr(range.start);
06073             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
06074             {
06075                 *ok = false;
06076                 return;
06077             }
06078         }
06079 
06080         IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
06081         if (status != ippStsNoErr)
06082             *ok = false;
06083         else
06084         {
06085             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
06086         }
06087     }
06088 private:
06089     Mat &src;
06090     Mat &dst;
06091     int mode;
06092     double (&coeffs)[3][3];
06093     int borderType;
06094     const Scalar borderValue;
06095     ippiWarpPerspectiveFunc func;
06096     bool *ok;
06097 
06098     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
06099 };
06100 #endif
06101 }
06102 
06103 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
06104                           Size dsize, int flags, int borderType, const Scalar & borderValue )
06105 {
06106     CV_Assert( _src.total() > 0 );
06107 
06108 #ifdef HAVE_OPENCL
06109     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
06110                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
06111                               OCL_OP_PERSPECTIVE))
06112 #endif
06113 
06114     Mat src = _src.getMat(), M0 = _M0.getMat();
06115     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
06116     Mat dst = _dst.getMat();
06117 
06118     if( dst.data == src.data )
06119         src = src.clone();
06120 
06121     double M[9];
06122     Mat matM(3, 3, CV_64F, M);
06123     int interpolation = flags & INTER_MAX;
06124     if( interpolation == INTER_AREA )
06125         interpolation = INTER_LINEAR;
06126 
06127     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
06128     M0.convertTo(matM, matM.type());
06129 
06130 #ifdef HAVE_TEGRA_OPTIMIZATION
06131     if( tegra::useTegra() && tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
06132         return;
06133 #endif
06134 
06135 
06136 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
06137     CV_IPP_CHECK()
06138     {
06139         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
06140         if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
06141            (cn == 1 || cn == 3 || cn == 4) &&
06142            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
06143            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
06144         {
06145             ippiWarpPerspectiveFunc ippFunc = 0;
06146             if ((flags & WARP_INVERSE_MAP) != 0)
06147             {
06148                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
06149                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
06150                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
06151                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
06152                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
06153                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
06154                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
06155                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
06156                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
06157             }
06158             else
06159             {
06160                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
06161                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
06162                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
06163                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
06164                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
06165                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
06166                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
06167                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
06168                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
06169             }
06170             int mode =
06171             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
06172             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
06173             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
06174             CV_Assert(mode && ippFunc);
06175 
06176             double coeffs[3][3];
06177             for( int i = 0; i < 3; i++ )
06178                 for( int j = 0; j < 3; j++ )
06179                     coeffs[i][j] = matM.at<double>(i, j);
06180 
06181             bool ok;
06182             Range range(0, dst.rows);
06183             IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
06184             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
06185             if( ok )
06186             {
06187                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
06188                 return;
06189             }
06190             setIppErrorStatus();
06191         }
06192     }
06193 #endif
06194 
06195     if( !(flags & WARP_INVERSE_MAP) )
06196         invert(matM, matM);
06197 
06198     Range range(0, dst.rows);
06199     WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
06200     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
06201 }
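
#if 0
// Minimal usage sketch (illustrative only; the point values are hypothetical):
// warp a quadrilateral region of the source onto a 300x300 destination.
static void warpPerspectiveUsage( const cv::Mat& src, cv::Mat& dst )
{
    cv::Point2f from[4] = { cv::Point2f(56,65),   cv::Point2f(368,52),
                            cv::Point2f(389,390), cv::Point2f(28,387) };
    cv::Point2f to[4]   = { cv::Point2f(0,0),     cv::Point2f(300,0),
                            cv::Point2f(300,300), cv::Point2f(0,300) };
    cv::Mat H = cv::getPerspectiveTransform(from, to);    // 3x3, CV_64F
    cv::warpPerspective(src, dst, H, cv::Size(300,300),
                        cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar());
}
#endif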
06202 
06203 
06204 cv::Mat cv::getRotationMatrix2D( Point2f  center, double angle, double scale )
06205 {
06206     angle *= CV_PI/180;
06207     double alpha = cos(angle)*scale;
06208     double beta = sin(angle)*scale;
06209 
06210     Mat M(2, 3, CV_64F);
06211     double* m = M.ptr<double>();
06212 
06213     m[0] = alpha;
06214     m[1] = beta;
06215     m[2] = (1-alpha)*center.x - beta*center.y;
06216     m[3] = -beta;
06217     m[4] = alpha;
06218     m[5] = beta*center.x + (1-alpha)*center.y;
06219 
06220     return M;
06221 }
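
#if 0
// Sanity-check sketch (illustrative): by construction the matrix above maps
// the rotation center onto itself, M*(cx, cy, 1)^T == (cx, cy)^T, which is
// what makes it directly usable with warpAffine.
static void rotationMatrixUsage( const cv::Mat& src, cv::Mat& dst )
{
    cv::Point2f c(src.cols*0.5f, src.rows*0.5f);
    cv::Mat M = cv::getRotationMatrix2D(c, 30.0, 1.0);
    const double* m = M.ptr<double>();
    CV_Assert(std::abs(m[0]*c.x + m[1]*c.y + m[2] - c.x) < 1e-9 &&
              std::abs(m[3]*c.x + m[4]*c.y + m[5] - c.y) < 1e-9);
    cv::warpAffine(src, dst, M, src.size());
}
#endif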
06222 
06223 /* Calculates coefficients of perspective transformation
06224  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
06225  *
06226  *      c00*xi + c01*yi + c02
06227  * ui = ---------------------
06228  *      c20*xi + c21*yi + c22
06229  *
06230  *      c10*xi + c11*yi + c12
06231  * vi = ---------------------
06232  *      c20*xi + c21*yi + c22
06233  *
06234  * Coefficients are calculated by solving the linear system:
06235  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
06236  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
06237  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
06238  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
06239  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
06240  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
06241  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
06242  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
06243  *
06244  * where:
06245  *   cij - matrix coefficients, c22 = 1
06246  */
06247 cv::Mat cv::getPerspectiveTransform( const Point2f  src[], const Point2f  dst[] )
06248 {
06249     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
06250     double a[8][8], b[8];
06251     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
06252 
06253     for( int i = 0; i < 4; ++i )
06254     {
06255         a[i][0] = a[i+4][3] = src[i].x;
06256         a[i][1] = a[i+4][4] = src[i].y;
06257         a[i][2] = a[i+4][5] = 1;
06258         a[i][3] = a[i][4] = a[i][5] =
06259         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
06260         a[i][6] = -src[i].x*dst[i].x;
06261         a[i][7] = -src[i].y*dst[i].x;
06262         a[i+4][6] = -src[i].x*dst[i].y;
06263         a[i+4][7] = -src[i].y*dst[i].y;
06264         b[i] = dst[i].x;
06265         b[i+4] = dst[i].y;
06266     }
06267 
06268     solve( A, B, X, DECOMP_SVD );
06269     M.ptr<double>()[8] = 1.;
06270 
06271     return M;
06272 }
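
#if 0
// Worked example (illustrative): solve the 8x8 system above for the unit
// square mapped to a hypothetical quadrilateral, then verify one point by
// applying the homography with its perspective divide.
static void perspectiveFromFourPoints()
{
    cv::Point2f s[4] = { cv::Point2f(0,0),   cv::Point2f(1,0),
                         cv::Point2f(1,1),   cv::Point2f(0,1) };
    cv::Point2f d[4] = { cv::Point2f(10,10), cv::Point2f(90,20),
                         cv::Point2f(80,80), cv::Point2f(5,95) };
    cv::Mat H = cv::getPerspectiveTransform(s, d);   // 3x3 CV_64F, H(2,2) == 1
    const double* h = H.ptr<double>();
    double w = h[6]*s[2].x + h[7]*s[2].y + h[8];
    double u = (h[0]*s[2].x + h[1]*s[2].y + h[2]) / w;   // ~= d[2].x == 80
    double v = (h[3]*s[2].x + h[4]*s[2].y + h[5]) / w;   // ~= d[2].y == 80
    (void)u; (void)v;
}
#endif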
06273 
06274 /* Calculates coefficients of affine transformation
06275  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
06276  *
06277  * ui = c00*xi + c01*yi + c02
06278  *
06279  * vi = c10*xi + c11*yi + c12
06280  *
06281  * Coefficients are calculated by solving the linear system:
06282  * / x0 y0  1  0  0  0 \ /c00\ /u0\
06283  * | x1 y1  1  0  0  0 | |c01| |u1|
06284  * | x2 y2  1  0  0  0 | |c02| |u2|
06285  * |  0  0  0 x0 y0  1 | |c10| |v0|
06286  * |  0  0  0 x1 y1  1 | |c11| |v1|
06287  * \  0  0  0 x2 y2  1 / |c12| |v2|
06288  *
06289  * where:
06290  *   cij - matrix coefficients
06291  */
06292 
06293 cv::Mat cv::getAffineTransform( const Point2f  src[], const Point2f  dst[] )
06294 {
06295     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
06296     double a[6*6], b[6];
06297     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
06298 
06299     for( int i = 0; i < 3; i++ )
06300     {
06301         int j = i*12;
06302         int k = i*12+6;
06303         a[j] = a[k+3] = src[i].x;
06304         a[j+1] = a[k+4] = src[i].y;
06305         a[j+2] = a[k+5] = 1;
06306         a[j+3] = a[j+4] = a[j+5] = 0;
06307         a[k] = a[k+1] = a[k+2] = 0;
06308         b[i*2] = dst[i].x;
06309         b[i*2+1] = dst[i].y;
06310     }
06311 
06312     solve( A, B, X );
06313     return M;
06314 }
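
#if 0
// Illustrative sketch: three point pairs determine the six affine
// coefficients exactly. Mapping the origin and the two unit axis points
// makes the solution easy to read off: the last column is dst[0] and the
// first two columns are the images of the axis directions.
static void affineFromThreePoints()
{
    cv::Point2f s[3] = { cv::Point2f(0,0), cv::Point2f(1,0), cv::Point2f(0,1) };
    cv::Point2f d[3] = { cv::Point2f(2,3), cv::Point2f(4,3), cv::Point2f(2,6) };
    cv::Mat A = cv::getAffineTransform(s, d);   // 2x3 CV_64F
    // Here A == [2 0 2; 0 3 3], i.e. x' = 2x + 2 and y' = 3y + 3.
}
#endif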
06315 
06316 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
06317 {
06318     Mat matM = _matM.getMat();
06319     CV_Assert(matM.rows == 2 && matM.cols == 3);
06320     __iM.create(2, 3, matM.type());
06321     Mat _iM = __iM.getMat();
06322 
06323     if( matM.type() == CV_32F )
06324     {
06325         const float* M = matM.ptr<float>();
06326         float* iM = _iM.ptr<float>();
06327         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
06328 
06329         double D = M[0]*M[step+1] - M[1]*M[step];
06330         D = D != 0 ? 1./D : 0;
06331         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
06332         double b1 = -A11*M[2] - A12*M[step+2];
06333         double b2 = -A21*M[2] - A22*M[step+2];
06334 
06335         iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
06336         iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
06337     }
06338     else if( matM.type() == CV_64F )
06339     {
06340         const double* M = matM.ptr<double>();
06341         double* iM = _iM.ptr<double>();
06342         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
06343 
06344         double D = M[0]*M[step+1] - M[1]*M[step];
06345         D = D != 0 ? 1./D : 0;
06346         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
06347         double b1 = -A11*M[2] - A12*M[step+2];
06348         double b2 = -A21*M[2] - A22*M[step+2];
06349 
06350         iM[0] = A11; iM[1] = A12; iM[2] = b1;
06351         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
06352     }
06353     else
06354         CV_Error( CV_StsUnsupportedFormat, "" );
06355 }
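
#if 0
// Illustrative round-trip check: for M = [A | b] the code above returns
// iM = [A^-1 | -A^-1*b], so applying M and then iM reproduces the input
// point up to rounding.
static void invertAffineRoundTrip()
{
    cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(10,20), 45.0, 2.0), iM;
    cv::invertAffineTransform(M, iM);
    const double *m = M.ptr<double>(), *im = iM.ptr<double>();
    double x = 7, y = -3;
    double u = m[0]*x + m[1]*y + m[2],  v = m[3]*x + m[4]*y + m[5];
    double xr = im[0]*u + im[1]*v + im[2];   // ~= x
    double yr = im[3]*u + im[4]*v + im[5];   // ~= y
    (void)xr; (void)yr;
}
#endif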
06356 
06357 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
06358 {
06359     Mat src = _src.getMat(), dst = _dst.getMat();
06360     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
06361     return getPerspectiveTransform((const Point2f *)src.data, (const Point2f *)dst.data);
06362 }
06363 
06364 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
06365 {
06366     Mat src = _src.getMat(), dst = _dst.getMat();
06367     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
06368     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
06369 }
06370 
06371 CV_IMPL void
06372 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
06373 {
06374     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
06375     CV_Assert( src.type() == dst.type() );
06376     cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
06377         (double)dst.rows/src.rows, method );
06378 }
06379 
06380 
06381 CV_IMPL void
06382 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
06383               int flags, CvScalar  fillval )
06384 {
06385     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
06386     cv::Mat matrix = cv::cvarrToMat(marr);
06387     CV_Assert( src.type() == dst.type() );
06388     cv::warpAffine( src, dst, matrix, dst.size(), flags,
06389         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
06390         fillval );
06391 }
06392 
06393 CV_IMPL void
06394 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
06395                    int flags, CvScalar  fillval )
06396 {
06397     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
06398     cv::Mat matrix = cv::cvarrToMat(marr);
06399     CV_Assert( src.type() == dst.type() );
06400     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
06401         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
06402         fillval );
06403 }
06404 
06405 CV_IMPL void
06406 cvRemap( const CvArr* srcarr, CvArr* dstarr,
06407          const CvArr* _mapx, const CvArr* _mapy,
06408          int flags, CvScalar  fillval )
06409 {
06410     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
06411     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
06412     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
06413     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
06414         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
06415         fillval );
06416     CV_Assert( dst0.data == dst.data );
06417 }
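
#if 0
// Illustrative sketch: cvRemap above is a thin wrapper over cv::remap; with
// explicit CV_32F maps the same API expresses arbitrary warps. Here the maps
// encode a horizontal flip, dst(x,y) = src(cols-1-x, y).
static void remapFlipExample( const cv::Mat& src, cv::Mat& dst )
{
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for( int y = 0; y < src.rows; y++ )
        for( int x = 0; x < src.cols; x++ )
        {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x);
            mapy.at<float>(y, x) = (float)y;
        }
    cv::remap(src, dst, mapx, mapy, cv::INTER_LINEAR);
}
#endif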
06418 
06419 
06420 CV_IMPL CvMat*
06421 cv2DRotationMatrix( CvPoint2D32f center, double angle,
06422                     double scale, CvMat* matrix )
06423 {
06424     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
06425     CV_Assert( M.size() == M0.size() );
06426     M.convertTo(M0, M0.type());
06427     return matrix;
06428 }
06429 
06430 
06431 CV_IMPL CvMat*
06432 cvGetPerspectiveTransform( const CvPoint2D32f* src,
06433                           const CvPoint2D32f* dst,
06434                           CvMat* matrix )
06435 {
06436     cv::Mat M0 = cv::cvarrToMat(matrix),
06437         M = cv::getPerspectiveTransform((const cv::Point2f *)src, (const cv::Point2f *)dst);
06438     CV_Assert( M.size() == M0.size() );
06439     M.convertTo(M0, M0.type());
06440     return matrix;
06441 }
06442 
06443 
06444 CV_IMPL CvMat*
06445 cvGetAffineTransform( const CvPoint2D32f* src,
06446                           const CvPoint2D32f* dst,
06447                           CvMat* matrix )
06448 {
06449     cv::Mat M0 = cv::cvarrToMat(matrix),
06450         M = cv::getAffineTransform((const cv::Point2f *)src, (const cv::Point2f *)dst);
06451     CV_Assert( M.size() == M0.size() );
06452     M.convertTo(M0, M0.type());
06453     return matrix;
06454 }
06455 
06456 
06457 CV_IMPL void
06458 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
06459 {
06460     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
06461     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
06462 
06463     if( arr2 )
06464         map2 = cv::cvarrToMat(arr2);
06465     if( dstarr2 )
06466     {
06467         dstmap2 = cv::cvarrToMat(dstarr2);
06468         if( dstmap2.type() == CV_16SC1 )
06469             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
06470     }
06471 
06472     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
06473 }
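
#if 0
// Illustrative sketch: converting a CV_32F map pair into the packed
// fixed-point form (CV_16SC2 integer coordinates plus CV_16UC1 table
// indices) that remap() processes fastest; this is the same INTER_BITS
// encoding produced by the warp invokers earlier in this file.
static void convertMapsExample( const cv::Mat& mapx, const cv::Mat& mapy )
{
    cv::Mat map1, map2;
    cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2, false);
}
#endif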
06474 
06475 /****************************************************************************************\
06476 *                                   Log-Polar Transform                                  *
06477 \****************************************************************************************/
06478 
06479 /* now it is done via remap; a more correct implementation would use
06480    some super-sampling technique outside of the "fovea" circle */
06481 CV_IMPL void
06482 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
06483             CvPoint2D32f center, double M, int flags )
06484 {
06485     cv::Ptr<CvMat> mapx, mapy;
06486 
06487     CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
06488     CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
06489     CvSize ssize, dsize;
06490 
06491     if( !CV_ARE_TYPES_EQ( src, dst ))
06492         CV_Error( CV_StsUnmatchedFormats, "" );
06493 
06494     if( M <= 0 )
06495         CV_Error( CV_StsOutOfRange, "M should be >0" );
06496 
06497     ssize = cvGetMatSize(src);
06498     dsize = cvGetMatSize(dst);
06499 
06500     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06501     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06502 
06503     if( !(flags & CV_WARP_INVERSE_MAP) )
06504     {
06505         int phi, rho;
06506         cv::AutoBuffer<double> _exp_tab(dsize.width);
06507         double* exp_tab = _exp_tab;
06508 
06509         for( rho = 0; rho < dst->width; rho++ )
06510             exp_tab[rho] = std::exp(rho/M);
06511 
06512         for( phi = 0; phi < dsize.height; phi++ )
06513         {
06514             double cp = cos(phi*2*CV_PI/dsize.height);
06515             double sp = sin(phi*2*CV_PI/dsize.height);
06516             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
06517             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
06518 
06519             for( rho = 0; rho < dsize.width; rho++ )
06520             {
06521                 double r = exp_tab[rho];
06522                 double x = r*cp + center.x;
06523                 double y = r*sp + center.y;
06524 
06525                 mx[rho] = (float)x;
06526                 my[rho] = (float)y;
06527             }
06528         }
06529     }
06530     else
06531     {
06532         int x, y;
06533         CvMat bufx, bufy, bufp, bufa;
06534         double ascale = ssize.height/(2*CV_PI);
06535         cv::AutoBuffer<float> _buf(4*dsize.width);
06536         float* buf = _buf;
06537 
06538         bufx = cvMat( 1, dsize.width, CV_32F, buf );
06539         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
06540         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
06541         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
06542 
06543         for( x = 0; x < dsize.width; x++ )
06544             bufx.data.fl[x] = (float)x - center.x;
06545 
06546         for( y = 0; y < dsize.height; y++ )
06547         {
06548             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
06549             float* my = (float*)(mapy->data.ptr + y*mapy->step);
06550 
06551             for( x = 0; x < dsize.width; x++ )
06552                 bufy.data.fl[x] = (float)y - center.y;
06553 
06554 #if 1
06555             cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
06556 
06557             for( x = 0; x < dsize.width; x++ )
06558                 bufp.data.fl[x] += 1.f;
06559 
06560             cvLog( &bufp, &bufp );
06561 
06562             for( x = 0; x < dsize.width; x++ )
06563             {
06564                 double rho = bufp.data.fl[x]*M;
06565                 double phi = bufa.data.fl[x]*ascale;
06566 
06567                 mx[x] = (float)rho;
06568                 my[x] = (float)phi;
06569             }
06570 #else
06571             for( x = 0; x < dsize.width; x++ )
06572             {
06573                 double xx = bufx.data.fl[x];
06574                 double yy = bufy.data.fl[x];
06575 
06576                 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
06577                 double a = atan2(yy,xx);
06578                 if( a < 0 )
06579                     a = 2*CV_PI + a;
06580                 a *= ascale;
06581 
06582                 mx[x] = (float)p;
06583                 my[x] = (float)a;
06584             }
06585 #endif
06586         }
06587     }
06588 
06589     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
06590 }
06591 
06592 void cv::logPolar( InputArray _src, OutputArray _dst,
06593                    Point2f  center, double M, int flags )
06594 {
06595     Mat src = _src.getMat();
06596     _dst.create( src.size(), src.type() );
06597     CvMat c_src = src, c_dst = _dst.getMat();
06598     cvLogPolar( &c_src, &c_dst, center, M, flags );
06599 }
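
#if 0
// Illustrative sketch of the forward log-polar mapping built above: output
// column rho samples the source at an exponentially growing radius
// exp(rho/M), while output row phi sweeps one full turn of angle.
static void logPolarSourcePoint( cv::Point2f center, double M,
                                 int rho, int phi, int dstHeight,
                                 double& sx, double& sy )
{
    double r = std::exp(rho / M);
    double a = phi * 2 * CV_PI / dstHeight;
    sx = r * std::cos(a) + center.x;   // source x for dst(phi, rho)
    sy = r * std::sin(a) + center.y;   // source y
}
#endif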
06600 
06601 /****************************************************************************************
06602                                    Linear-Polar Transform
06603   J.L. Blanco, Apr 2009
06604  ****************************************************************************************/
06605 CV_IMPL
06606 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
06607             CvPoint2D32f center, double maxRadius, int flags )
06608 {
06609     cv::Ptr<CvMat> mapx, mapy;
06610 
06611     CvMat srcstub, *src = (CvMat*)srcarr;
06612     CvMat dststub, *dst = (CvMat*)dstarr;
06613     CvSize ssize, dsize;
06614 
06615     src = cvGetMat( srcarr, &srcstub,0,0 );
06616     dst = cvGetMat( dstarr, &dststub,0,0 );
06617 
06618     if( !CV_ARE_TYPES_EQ( src, dst ))
06619         CV_Error( CV_StsUnmatchedFormats, "" );
06620 
06621     ssize.width = src->cols;
06622     ssize.height = src->rows;
06623     dsize.width = dst->cols;
06624     dsize.height = dst->rows;
06625 
06626     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06627     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06628 
06629     if( !(flags & CV_WARP_INVERSE_MAP) )
06630     {
06631         int phi, rho;
06632 
06633         for( phi = 0; phi < dsize.height; phi++ )
06634         {
06635             double cp = cos(phi*2*CV_PI/dsize.height);
06636             double sp = sin(phi*2*CV_PI/dsize.height);
06637             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
06638             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
06639 
06640             for( rho = 0; rho < dsize.width; rho++ )
06641             {
06642                 double r = maxRadius*(rho+1)/dsize.width;
06643                 double x = r*cp + center.x;
06644                 double y = r*sp + center.y;
06645 
06646                 mx[rho] = (float)x;
06647                 my[rho] = (float)y;
06648             }
06649         }
06650     }
06651     else
06652     {
06653         int x, y;
06654         CvMat bufx, bufy, bufp, bufa;
06655         const double ascale = ssize.height/(2*CV_PI);
06656         const double pscale = ssize.width/maxRadius;
06657 
06658         cv::AutoBuffer<float> _buf(4*dsize.width);
06659         float* buf = _buf;
06660 
06661         bufx = cvMat( 1, dsize.width, CV_32F, buf );
06662         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
06663         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
06664         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
06665 
06666         for( x = 0; x < dsize.width; x++ )
06667             bufx.data.fl[x] = (float)x - center.x;
06668 
06669         for( y = 0; y < dsize.height; y++ )
06670         {
06671             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
06672             float* my = (float*)(mapy->data.ptr + y*mapy->step);
06673 
06674             for( x = 0; x < dsize.width; x++ )
06675                 bufy.data.fl[x] = (float)y - center.y;
06676 
06677             cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
06678 
06679             for( x = 0; x < dsize.width; x++ )
06680                 bufp.data.fl[x] += 1.f;
06681 
06682             for( x = 0; x < dsize.width; x++ )
06683             {
06684                 double rho = bufp.data.fl[x]*pscale;
06685                 double phi = bufa.data.fl[x]*ascale;
06686                 mx[x] = (float)rho;
06687                 my[x] = (float)phi;
06688             }
06689         }
06690     }
06691 
06692     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
06693 }
06694 
06695 void cv::linearPolar( InputArray _src, OutputArray _dst,
06696                       Point2f  center, double maxRadius, int flags )
06697 {
06698     Mat src = _src.getMat();
06699     _dst.create( src.size(), src.type() );
06700     CvMat c_src = src, c_dst = _dst.getMat();
06701     cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );
06702 }
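
#if 0
// Minimal usage sketch (illustrative; the radius choice is hypothetical):
// unwrap an image around its center so that rows correspond to angle and
// columns to linear radius.
static void linearPolarUsage( const cv::Mat& src, cv::Mat& dst )
{
    cv::Point2f center(src.cols*0.5f, src.rows*0.5f);
    double maxRadius = 0.5*std::min(src.cols, src.rows);
    cv::linearPolar(src, dst, center, maxRadius,
                    cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
}
#endif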
06703 
06704 /* End of file. */
06705