Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update


imgwarp.cpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{
#if IPP_VERSION_X100 >= 710
    typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
    typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
    typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
    typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

    template <int channels, typename Type>
    bool IPPSetSimple(cv::Scalar  value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
    {
        Type values[channels];
        for( int i = 0; i < channels; i++ )
            values[i] = saturate_cast<Type>(value[i]);
        return func(values, dataPointer, step, size) >= 0;
    }

    static bool IPPSet(const cv::Scalar  &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
    {
        if( channels == 1 )
        {
            switch( depth )
            {
            case CV_8U:
                return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
            case CV_16U:
                return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
            case CV_32F:
                return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
            }
        }
        else
        {
            if( channels == 3 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
                case CV_16U:
                    return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
                case CV_32F:
                    return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
                }
            }
            else if( channels == 4 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
                case CV_16U:
                    return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
                case CV_32F:
                    return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
                }
            }
        }
        return false;
    }
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
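
// These constants define the fixed-point formats used throughout this file:
// resize weights carry INTER_RESIZE_COEF_BITS = 11 fraction bits and remap
// weights INTER_REMAP_COEF_BITS = 15, so weighted sums can be accumulated in
// integer registers and shifted back down instead of doing per-pixel
// floating-point work.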

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];

static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
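
// interpolateCubic is the standard cubic convolution kernel with A = -0.75
// (the variant OpenCV uses); for any fractional offset x in [0,1) the four
// taps sum to 1, which the coeffs[3] line enforces explicitly.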

static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}
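
// interpolateLanczos4 produces the 8-tap Lanczos (a = 4) windowed-sinc
// weights. Instead of evaluating sin() per tap, it computes sin/cos once at
// y0 and derives the remaining taps through the cs[][] table of 45-degree
// phase factors; the final loop renormalizes the taps so they sum to
// exactly 1.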

static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}


static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}
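
// initInterTab2D builds, once per method, the weights for every one of the
// INTER_TAB_SIZE x INTER_TAB_SIZE sub-pixel offsets as outer products of the
// 1-D kernels above. The short tables are then patched so each ksize*ksize
// block sums to exactly INTER_REMAP_COEF_SCALE: any rounding residue is
// folded into the largest (or smallest) central weight, which keeps the
// fixed-point remap path from introducing a small DC bias on flat regions.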

#ifndef __MINGW32__
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
#endif

template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};
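
// Cast and FixedPtCast are the output-conversion functors used by the resize
// kernels below. FixedPtCast adds DELTA = 2^(bits-1) before shifting right by
// SHIFT bits, so the fixed-point accumulator is rounded to nearest rather
// than simply truncated.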

/****************************************************************************************\
*                                         Resize                                         *
\****************************************************************************************/

class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    for( int k = 0; k < pix_size4; k++ )
                        _tD[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs;
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
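
// resizeNN computes the source byte offset for every destination column once
// (x_ofs), so the per-row work in resizeNNInvoker reduces to straight
// copying, specialized by pixel size. Rows are distributed across threads
// with parallel_for_; the dst.total()/(double)(1<<16) argument requests
// roughly one parallel stripe per 64K destination pixels.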


struct VResizeNoVec
{
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};

struct HResizeNoVec
{
    int operator()(const uchar**, uchar**, int, const int*,
        const uchar*, int, int, int, int, int) const { return 0; }
};

#if CV_SSE2

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }

        return x;
    }
};
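
// Fixed-point bookkeeping for the SSE2 kernel above: the intermediate rows
// carry INTER_RESIZE_COEF_BITS = 11 fraction bits and the 16-bit betas
// another 11, so each row*beta product holds 22. The >>4 before packing to
// 16-bit, the implicit >>16 of _mm_mulhi_epi16, and the final (+2)>>2 remove
// exactly those 22 bits, with the +2 rounding to nearest before the
// saturating pack to bytes.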


template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128 x0, y0;
            __m128i t0;
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }

        return x;
    }
};


struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
            b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }

        return x;
    }
};
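
// Unlike the linear path above, this cubic 32s->8u pass converts the row sums
// to float before applying the betas; cubic weights have negative lobes, so
// the mulhi-based all-integer scheme presumably does not carry over, and the
// betas are instead folded together with the 1/INTER_RESIZE_COEF_SCALE^2
// descaling factor ('scale').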


template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }

        return x;
    }
};

#if CV_SSE4_1

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;

#endif
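
// The fallback above exists because _mm_packus_epi32 (pack signed 32-bit to
// unsigned 16-bit with saturation) is an SSE4.1 instruction; without SSE4.1
// the unsigned 16-bit Lanczos variant drops back to the scalar VResizeNoVec
// path, while the signed variant below gets by with the SSE2 _mm_packs_epi32.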

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};


struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            _mm_storeu_ps(dst + x, v_dst);
        }

        return x;
    }
};


#elif CV_NEON

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int x = 0;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
        {
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        short* dst = (short*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
        }

        return x;
    }
};

typedef VResizeNoVec VResizeCubicVec_32s8u;

struct VResizeCubicVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                       v_b1, vld1q_f32(S1 + x)),
                                                                       v_b2, vld1q_f32(S2 + x)),
                                                                       v_b3, vld1q_f32(S3 + x)));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                          v_b1, vld1q_f32(S1 + x + 4)),
                                                                          v_b2, vld1q_f32(S2 + x + 4)),
                                                                          v_b3, vld1q_f32(S3 + x + 4)));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        ushort * dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
01304         {
01305             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
01306                                                                          v_b1, vld1q_f32(S1 + x)),
01307                                                                          v_b2, vld1q_f32(S2 + x)),
01308                                                                          v_b3, vld1q_f32(S3 + x));
01309             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
01310                                                                          v_b5, vld1q_f32(S5 + x)),
01311                                                                          v_b6, vld1q_f32(S6 + x)),
01312                                                                          v_b7, vld1q_f32(S7 + x));
01313             float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
01314 
01315             v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
01316                                                              v_b1, vld1q_f32(S1 + x + 4)),
01317                                                              v_b2, vld1q_f32(S2 + x + 4)),
01318                                                              v_b3, vld1q_f32(S3 + x + 4));
01319             v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
01320                                                              v_b5, vld1q_f32(S5 + x + 4)),
01321                                                              v_b6, vld1q_f32(S6 + x + 4)),
01322                                                              v_b7, vld1q_f32(S7 + x + 4));
01323             v_dst1 = vaddq_f32(v_dst0, v_dst1);
01324 
01325             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
01326                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
01327         }
01328 
01329         return x;
01330     }
01331 };
01332 
01333 struct VResizeLanczos4Vec_32f
01334 {
01335     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
01336     {
01337         const float** src = (const float**)_src;
01338         const float* beta = (const float*)_beta;
01339         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
01340                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
01341         float* dst = (float*)_dst;
01342         int x = 0;
01343         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
01344                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
01345                     v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
01346                     v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
01347 
01348         for( ; x <= width - 4; x += 4 )
01349         {
01350             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
01351                                                                          v_b1, vld1q_f32(S1 + x)),
01352                                                                          v_b2, vld1q_f32(S2 + x)),
01353                                                                          v_b3, vld1q_f32(S3 + x));
01354             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
01355                                                                          v_b5, vld1q_f32(S5 + x)),
01356                                                                          v_b6, vld1q_f32(S6 + x)),
01357                                                                          v_b7, vld1q_f32(S7 + x));
01358             vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
01359         }
01360 
01361         return x;
01362     }
01363 };
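
#include <climits>
// Scalar model of the final round-and-saturate step in the 16-bit paths above
// (cv_vrndq_s32_f32 followed by vqmovn_s16/vqmovn_s32): round each float lane
// to the nearest integer, then clamp into the 16-bit range. This is what
// cv::saturate_cast<short> does for floats; shown here as an illustrative
// sketch, not part of the original file.
static short round_saturate_to_s16(float v)
{
    int r = cvRound(v);                                // round to nearest
    return r < SHRT_MIN ? (short)SHRT_MIN :
           r > SHRT_MAX ? (short)SHRT_MAX : (short)r;  // saturate to int16
}
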
01364 
01365 #else
01366 
01367 typedef VResizeNoVec VResizeLinearVec_32s8u;
01368 typedef VResizeNoVec VResizeLinearVec_32f16u;
01369 typedef VResizeNoVec VResizeLinearVec_32f16s;
01370 typedef VResizeNoVec VResizeLinearVec_32f;
01371 
01372 typedef VResizeNoVec VResizeCubicVec_32s8u;
01373 typedef VResizeNoVec VResizeCubicVec_32f16u;
01374 typedef VResizeNoVec VResizeCubicVec_32f16s;
01375 typedef VResizeNoVec VResizeCubicVec_32f;
01376 
01377 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
01378 typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
01379 typedef VResizeNoVec VResizeLanczos4Vec_32f;
01380 
01381 #endif
01382 
01383 typedef HResizeNoVec HResizeLinearVec_8u32s;
01384 typedef HResizeNoVec HResizeLinearVec_16u32f;
01385 typedef HResizeNoVec HResizeLinearVec_16s32f;
01386 typedef HResizeNoVec HResizeLinearVec_32f;
01387 typedef HResizeNoVec HResizeLinearVec_64f;
01388 
01389 
01390 template<typename T, typename WT, typename AT, int ONE, class VecOp>
01391 struct HResizeLinear
01392 {
01393     typedef T value_type;
01394     typedef WT buf_type;
01395     typedef AT alpha_type;
01396 
01397     void operator()(const T** src, WT** dst, int count,
01398                     const int* xofs, const AT* alpha,
01399                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
01400     {
01401         int dx, k;
01402         VecOp vecOp;
01403 
01404         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
01405             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
01406 
01407         for( k = 0; k <= count - 2; k++ )
01408         {
01409             const T *S0 = src[k], *S1 = src[k+1];
01410             WT *D0 = dst[k], *D1 = dst[k+1];
01411             for( dx = dx0; dx < xmax; dx++ )
01412             {
01413                 int sx = xofs[dx];
01414                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
01415                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
01416                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
01417                 D0[dx] = t0; D1[dx] = t1;
01418             }
01419 
01420             for( ; dx < dwidth; dx++ )
01421             {
01422                 int sx = xofs[dx];
01423                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
01424             }
01425         }
01426 
01427         for( ; k < count; k++ )
01428         {
01429             const T *S = src[k];
01430             WT *D = dst[k];
01431             for( dx = 0; dx < xmax; dx++ )
01432             {
01433                 int sx = xofs[dx];
01434                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
01435             }
01436 
01437             for( ; dx < dwidth; dx++ )
01438                 D[dx] = WT(S[xofs[dx]]*ONE);
01439         }
01440     }
01441 };
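
// HResizeLinear above walks two source rows per iteration so each coefficient
// pair is fetched once for a pair of output rows. A one-row sketch of the
// horizontal pass (illustrative, hypothetical float buffers): xofs[dx] is the
// left source index, already premultiplied by the channel count, and alpha
// stores one coefficient pair per destination pixel.
static void hresize_linear_row_sketch(const float* S, float* D, const int* xofs,
                                      const float* alpha, int dwidth, int cn)
{
    for( int dx = 0; dx < dwidth; dx++ )
    {
        int sx = xofs[dx];
        D[dx] = S[sx]*alpha[dx*2] + S[sx + cn]*alpha[dx*2 + 1];
    }
}
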
01442 
01443 
01444 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
01445 struct VResizeLinear
01446 {
01447     typedef T value_type;
01448     typedef WT buf_type;
01449     typedef AT alpha_type;
01450 
01451     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
01452     {
01453         WT b0 = beta[0], b1 = beta[1];
01454         const WT *S0 = src[0], *S1 = src[1];
01455         CastOp castOp;
01456         VecOp vecOp;
01457 
01458         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01459         #if CV_ENABLE_UNROLLED
01460         for( ; x <= width - 4; x += 4 )
01461         {
01462             WT t0, t1;
01463             t0 = S0[x]*b0 + S1[x]*b1;
01464             t1 = S0[x+1]*b0 + S1[x+1]*b1;
01465             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
01466             t0 = S0[x+2]*b0 + S1[x+2]*b1;
01467             t1 = S0[x+3]*b0 + S1[x+3]*b1;
01468             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
01469         }
01470         #endif
01471         for( ; x < width; x++ )
01472             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
01473     }
01474 };
01475 
01476 template<>
01477 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
01478 {
01479     typedef uchar value_type;
01480     typedef int buf_type;
01481     typedef short alpha_type;
01482 
01483     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
01484     {
01485         alpha_type b0 = beta[0], b1 = beta[1];
01486         const buf_type *S0 = src[0], *S1 = src[1];
01487         VResizeLinearVec_32s8u vecOp;
01488 
01489         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01490         #if CV_ENABLE_UNROLLED
01491         for( ; x <= width - 4; x += 4 )
01492         {
01493             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
01494             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
01495             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
01496             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
01497         }
01498         #endif
01499         for( ; x < width; x++ )
01500             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
01501     }
01502 };
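
#include <cassert>
// Worked check of the fixed-point descale in the specialization above (an
// illustrative sketch): with INTER_RESIZE_COEF_BITS == 11, the horizontal and
// vertical coefficients each carry a 2^11 scale, i.e. 22 fractional bits in
// total. Splitting the shift as (>>4, >>16, then (+2)>>2) removes those same
// 22 bits with rounding while keeping every intermediate product in 32 bits.
static void check_fixedpt_descale()
{
    const int COEF = 1 << 11;                     // INTER_RESIZE_COEF_SCALE
    int S0 = 100 * COEF, S1 = 200 * COEF;         // horizontal-pass outputs
    short b0 = COEF / 4, b1 = COEF - COEF / 4;    // vertical weights 0.25/0.75
    int v = ((b0 * (S0 >> 4)) >> 16) + ((b1 * (S1 >> 4)) >> 16);
    assert( ((v + 2) >> 2) == 175 );              // 0.25*100 + 0.75*200
}
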
01503 
01504 
01505 template<typename T, typename WT, typename AT>
01506 struct HResizeCubic
01507 {
01508     typedef T value_type;
01509     typedef WT buf_type;
01510     typedef AT alpha_type;
01511 
01512     void operator()(const T** src, WT** dst, int count,
01513                     const int* xofs, const AT* alpha,
01514                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
01515     {
01516         for( int k = 0; k < count; k++ )
01517         {
01518             const T *S = src[k];
01519             WT *D = dst[k];
01520             int dx = 0, limit = xmin;
01521             for(;;)
01522             {
01523                 for( ; dx < limit; dx++, alpha += 4 )
01524                 {
01525                     int j, sx = xofs[dx] - cn;
01526                     WT v = 0;
01527                     for( j = 0; j < 4; j++ )
01528                     {
01529                         int sxj = sx + j*cn;
01530                         if( (unsigned)sxj >= (unsigned)swidth )
01531                         {
01532                             while( sxj < 0 )
01533                                 sxj += cn;
01534                             while( sxj >= swidth )
01535                                 sxj -= cn;
01536                         }
01537                         v += S[sxj]*alpha[j];
01538                     }
01539                     D[dx] = v;
01540                 }
01541                 if( limit == dwidth )
01542                     break;
01543                 for( ; dx < xmax; dx++, alpha += 4 )
01544                 {
01545                     int sx = xofs[dx];
01546                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
01547                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
01548                 }
01549                 limit = dwidth;
01550             }
01551             alpha -= dwidth*4;
01552         }
01553     }
01554 };
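
// Sketch of the per-channel border handling used by the cubic (and Lanczos)
// horizontal loops above: an out-of-range tap is stepped back into the row one
// channel stride at a time, which replicates the edge pixel of the same
// channel. Illustrative helper, not part of the original file:
static int wrap_tap_to_row(int sxj, int swidth, int cn)
{
    while( sxj < 0 )
        sxj += cn;
    while( sxj >= swidth )
        sxj -= cn;
    return sxj;
}
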
01555 
01556 
01557 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
01558 struct VResizeCubic
01559 {
01560     typedef T value_type;
01561     typedef WT buf_type;
01562     typedef AT alpha_type;
01563 
01564     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
01565     {
01566         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
01567         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
01568         CastOp castOp;
01569         VecOp vecOp;
01570 
01571         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01572         for( ; x < width; x++ )
01573             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
01574     }
01575 };
01576 
01577 
01578 template<typename T, typename WT, typename AT>
01579 struct HResizeLanczos4
01580 {
01581     typedef T value_type;
01582     typedef WT buf_type;
01583     typedef AT alpha_type;
01584 
01585     void operator()(const T** src, WT** dst, int count,
01586                     const int* xofs, const AT* alpha,
01587                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
01588     {
01589         for( int k = 0; k < count; k++ )
01590         {
01591             const T *S = src[k];
01592             WT *D = dst[k];
01593             int dx = 0, limit = xmin;
01594             for(;;)
01595             {
01596                 for( ; dx < limit; dx++, alpha += 8 )
01597                 {
01598                     int j, sx = xofs[dx] - cn*3;
01599                     WT v = 0;
01600                     for( j = 0; j < 8; j++ )
01601                     {
01602                         int sxj = sx + j*cn;
01603                         if( (unsigned)sxj >= (unsigned)swidth )
01604                         {
01605                             while( sxj < 0 )
01606                                 sxj += cn;
01607                             while( sxj >= swidth )
01608                                 sxj -= cn;
01609                         }
01610                         v += S[sxj]*alpha[j];
01611                     }
01612                     D[dx] = v;
01613                 }
01614                 if( limit == dwidth )
01615                     break;
01616                 for( ; dx < xmax; dx++, alpha += 8 )
01617                 {
01618                     int sx = xofs[dx];
01619                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
01620                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
01621                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
01622                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
01623                 }
01624                 limit = dwidth;
01625             }
01626             alpha -= dwidth*8;
01627         }
01628     }
01629 };
01630 
01631 
01632 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
01633 struct VResizeLanczos4
01634 {
01635     typedef T value_type;
01636     typedef WT buf_type;
01637     typedef AT alpha_type;
01638 
01639     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
01640     {
01641         CastOp castOp;
01642         VecOp vecOp;
01643         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
01644         #if CV_ENABLE_UNROLLED
01645         for( ; x <= width - 4; x += 4 )
01646         {
01647             WT b = beta[0];
01648             const WT* S = src[0];
01649             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
01650 
01651             for( k = 1; k < 8; k++ )
01652             {
01653                 b = beta[k]; S = src[k];
01654                 s0 += S[x]*b; s1 += S[x+1]*b;
01655                 s2 += S[x+2]*b; s3 += S[x+3]*b;
01656             }
01657 
01658             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
01659             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
01660         }
01661         #endif
01662         for( ; x < width; x++ )
01663         {
01664             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
01665                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
01666                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
01667         }
01668     }
01669 };
01670 
01671 
01672 static inline int clip(int x, int a, int b)
01673 {
01674     return x >= a ? (x < b ? x : b-1) : a;
01675 }
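
// Note that clip() clamps to the half-open range [a, b): clip(-3, 0, 480)
// yields 0 and clip(480, 0, 480) yields 479, so a computed source row index
// can never step outside the image.
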
01676 
01677 static const int MAX_ESIZE=16;
01678 
01679 template <typename HResize, typename VResize>
01680 class resizeGeneric_Invoker :
01681     public ParallelLoopBody
01682 {
01683 public:
01684     typedef typename HResize::value_type T;
01685     typedef typename HResize::buf_type WT;
01686     typedef typename HResize::alpha_type AT;
01687 
01688     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
01689         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
01690         int _ksize, int _xmin, int _xmax) :
01691         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
01692         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
01693         ksize(_ksize), xmin(_xmin), xmax(_xmax)
01694     {
01695         CV_Assert(ksize <= MAX_ESIZE);
01696     }
01697 
01698 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
01699 # pragma GCC diagnostic push
01700 # pragma GCC diagnostic ignored "-Warray-bounds"
01701 #endif
01702     virtual void operator() (const Range& range) const
01703     {
01704         int dy, cn = src.channels();
01705         HResize hresize;
01706         VResize vresize;
01707 
01708         int bufstep = (int)alignSize(dsize.width, 16);
01709         AutoBuffer<WT> _buffer(bufstep*ksize);
01710         const T* srows[MAX_ESIZE]={0};
01711         WT* rows[MAX_ESIZE]={0};
01712         int prev_sy[MAX_ESIZE];
01713 
01714         for(int k = 0; k < ksize; k++ )
01715         {
01716             prev_sy[k] = -1;
01717             rows[k] = (WT*)_buffer + bufstep*k;
01718         }
01719 
01720         const AT* beta = _beta + ksize * range.start;
01721 
01722         for( dy = range.start; dy < range.end; dy++, beta += ksize )
01723         {
01724             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
01725 
01726             for(int k = 0; k < ksize; k++ )
01727             {
01728                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
01729                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
01730                 {
01731                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
01732                     {
01733                         if( k1 > k )
01734                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
01735                         break;
01736                     }
01737                 }
01738                 if( k1 == ksize )
01739                     k0 = std::min(k0, k); // remember the first row that needs to be computed
01740                 srows[k] = src.template ptr<T>(sy);
01741                 prev_sy[k] = sy;
01742             }
01743 
01744             if( k0 < ksize )
01745                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
01746                         ssize.width, dsize.width, cn, xmin, xmax );
01747             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
01748         }
01749     }
01750 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
01751 # pragma GCC diagnostic pop
01752 #endif
01753 
01754 private:
01755     Mat src;
01756     Mat dst;
01757     const int* xofs, *yofs;
01758     const AT* alpha, *_beta;
01759     Size ssize, dsize;
01760     const int ksize, xmin, xmax;
01761 
01762     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
01763 };
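
// Minimal sketch of the row-reuse scheme the invoker above implements
// (illustrative, fixed ksize): the source index of each cached intermediate
// row is kept in prev_sy[], and a row is recomputed only when no slot already
// holds it, so sliding the kernel window down by one row usually recomputes
// just the newly exposed row.
struct RowReuseSketch
{
    enum { KSIZE = 4 };        // e.g. bicubic: 4 source rows per output row
    int prev_sy[KSIZE];
    RowReuseSketch() { for( int k = 0; k < KSIZE; k++ ) prev_sy[k] = -1; }

    // returns the slot k1 >= k that already holds source row sy, or -1 if
    // the row must be computed from scratch
    int find(int k, int sy) const
    {
        for( int k1 = k; k1 < KSIZE; k1++ )
            if( prev_sy[k1] == sy )
                return k1;
        return -1;
    }
};
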
01764 
01765 template<class HResize, class VResize>
01766 static void resizeGeneric_( const Mat& src, Mat& dst,
01767                             const int* xofs, const void* _alpha,
01768                             const int* yofs, const void* _beta,
01769                             int xmin, int xmax, int ksize )
01770 {
01771     typedef typename HResize::alpha_type AT;
01772 
01773     const AT* beta = (const AT*)_beta;
01774     Size ssize = src.size(), dsize = dst.size();
01775     int cn = src.channels();
01776     ssize.width *= cn;
01777     dsize.width *= cn;
01778     xmin *= cn;
01779     xmax *= cn;
01780     // image resize is a separable operation: rows are first resized horizontally into a buffer, then the buffered rows are interpolated vertically.
01781 
01782     Range range(0, dsize.height);
01783     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
01784         ssize, dsize, ksize, xmin, xmax);
01785     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
01786 }
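
#include <vector>
#include <cmath>
// End-to-end sketch of the separable strategy driven above (illustrative:
// one channel, float intermediate, source at least 2x2). A horizontal pass
// fills a dwidth x sheight buffer, then a vertical pass blends pairs of
// buffered rows -- the same split the HResize*/VResize* functors implement.
static void resize_bilinear_sketch(const uchar* src, int sw, int sh,
                                   uchar* dst, int dw, int dh)
{
    std::vector<float> tmp((size_t)dw * sh);
    float scale_x = (float)sw/dw, scale_y = (float)sh/dh;

    for( int y = 0; y < sh; y++ )          // horizontal pass
        for( int x = 0; x < dw; x++ )
        {
            float fx = (x + 0.5f)*scale_x - 0.5f;
            int x0 = (int)std::floor(fx);
            float a = fx - x0;
            if( x0 < 0 ) { x0 = 0; a = 0.f; }
            if( x0 >= sw - 1 ) { x0 = sw - 2; a = 1.f; }
            tmp[(size_t)y*dw + x] = src[y*sw + x0]*(1.f - a) + src[y*sw + x0 + 1]*a;
        }

    for( int y = 0; y < dh; y++ )          // vertical pass
    {
        float fy = (y + 0.5f)*scale_y - 0.5f;
        int y0 = (int)std::floor(fy);
        float b = fy - y0;
        if( y0 < 0 ) { y0 = 0; b = 0.f; }
        if( y0 >= sh - 1 ) { y0 = sh - 2; b = 1.f; }
        for( int x = 0; x < dw; x++ )
            dst[y*dw + x] = (uchar)(tmp[(size_t)y0*dw + x]*(1.f - b) +
                                    tmp[(size_t)(y0 + 1)*dw + x]*b + 0.5f);
    }
}
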
01787 
01788 template <typename T, typename WT>
01789 struct ResizeAreaFastNoVec
01790 {
01791     ResizeAreaFastNoVec(int, int) { }
01792     ResizeAreaFastNoVec(int, int, int, int) { }
01793     int operator() (const T*, T*, int) const
01794     { return 0; }
01795 };
01796 
01797 #if CV_NEON
01798 
01799 class ResizeAreaFastVec_SIMD_8u
01800 {
01801 public:
01802     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
01803         cn(_cn), step(_step)
01804     {
01805     }
01806 
01807     int operator() (const uchar* S, uchar* D, int w) const
01808     {
01809         int dx = 0;
01810         const uchar* S0 = S, * S1 = S0 + step;
01811 
01812         uint16x8_t v_2 = vdupq_n_u16(2);
01813 
01814         if (cn == 1)
01815         {
01816             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
01817             {
01818                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
01819 
01820                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
01821                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
01822                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
01823 
01824                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
01825                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
01826                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
01827 
01828                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
01829             }
01830         }
01831         else if (cn == 4)
01832         {
01833             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
01834             {
01835                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
01836 
01837                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
01838                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
01839                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
01840                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
01841 
01842                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
01843                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
01844                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
01845                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
01846                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
01847 
01848                 vst1_u8(D, vmovn_u16(v_dst));
01849             }
01850         }
01851 
01852         return dx;
01853     }
01854 
01855 private:
01856     int cn, step;
01857 };
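
// Scalar equivalent of the 2x2 rounded average the NEON path above computes
// for cn == 1, after vld2q_u8 has de-interleaved even and odd columns so that
// each lane pair is a horizontal neighbour (illustrative sketch):
static uchar avg2x2_scalar(const uchar* row0, const uchar* row1, int dx)
{
    int s = row0[dx*2] + row0[dx*2 + 1] + row1[dx*2] + row1[dx*2 + 1];
    return (uchar)((s + 2) >> 2);   // +2 gives round-to-nearest before >> 2
}
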
01858 
01859 class ResizeAreaFastVec_SIMD_16u
01860 {
01861 public:
01862     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
01863         cn(_cn), step(_step)
01864     {
01865     }
01866 
01867     int operator() (const ushort * S, ushort * D, int w) const
01868     {
01869         int dx = 0;
01870         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
01871 
01872         uint32x4_t v_2 = vdupq_n_u32(2);
01873 
01874         if (cn == 1)
01875         {
01876             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
01877             {
01878                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
01879 
01880                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
01881                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
01882                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
01883 
01884                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
01885                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
01886                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
01887 
01888                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
01889             }
01890         }
01891         else if (cn == 4)
01892         {
01893             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01894             {
01895                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
01896                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
01897                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
01898                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
01899             }
01900         }
01901 
01902         return dx;
01903     }
01904 
01905 private:
01906     int cn, step;
01907 };
01908 
01909 class ResizeAreaFastVec_SIMD_16s
01910 {
01911 public:
01912     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
01913         cn(_cn), step(_step)
01914     {
01915     }
01916 
01917     int operator() (const short * S, short * D, int w) const
01918     {
01919         int dx = 0;
01920         const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
01921 
01922         int32x4_t v_2 = vdupq_n_s32(2);
01923 
01924         if (cn == 1)
01925         {
01926             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
01927             {
01928                 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
01929 
01930                 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
01931                 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
01932                 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
01933 
01934                 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
01935                 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
01936                 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
01937 
01938                 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
01939             }
01940         }
01941         else if (cn == 4)
01942         {
01943             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01944             {
01945                 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
01946                 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
01947                                             vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
01948                 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
01949             }
01950         }
01951 
01952         return dx;
01953     }
01954 
01955 private:
01956     int cn, step;
01957 };
01958 
01959 struct ResizeAreaFastVec_SIMD_32f
01960 {
01961     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
01962         cn(_cn), step(_step)
01963     {
01964         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
01965     }
01966 
01967     int operator() (const float * S, float * D, int w) const
01968     {
01969         if (!fast_mode)
01970             return 0;
01971 
01972         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
01973         int dx = 0;
01974 
01975         float32x4_t v_025 = vdupq_n_f32(0.25f);
01976 
01977         if (cn == 1)
01978         {
01979             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01980             {
01981                 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
01982 
01983                 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
01984                 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
01985 
01986                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
01987             }
01988         }
01989         else if (cn == 4)
01990         {
01991             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
01992             {
01993                 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
01994                 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
01995 
01996                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
01997             }
01998         }
01999 
02000         return dx;
02001     }
02002 
02003 private:
02004     int cn;
02005     bool fast_mode;
02006     int step;
02007 };
02008 
02009 #elif CV_SSE2
02010 
02011 class ResizeAreaFastVec_SIMD_8u
02012 {
02013 public:
02014     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
02015         cn(_cn), step(_step)
02016     {
02017         use_simd = checkHardwareSupport(CV_CPU_SSE2);
02018     }
02019 
02020     int operator() (const uchar* S, uchar* D, int w) const
02021     {
02022         if (!use_simd)
02023             return 0;
02024 
02025         int dx = 0;
02026         const uchar* S0 = S;
02027         const uchar* S1 = S0 + step;
02028         __m128i zero = _mm_setzero_si128();
02029         __m128i delta2 = _mm_set1_epi16(2);
02030 
02031         if (cn == 1)
02032         {
02033             __m128i masklow = _mm_set1_epi16(0x00ff);
02034             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
02035             {
02036                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02037                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02038 
02039                 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
02040                 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
02041                 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
02042                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
02043 
02044                 _mm_storel_epi64((__m128i*)D, s0);
02045             }
02046         }
02047         else if (cn == 3)
02048             for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
02049             {
02050                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02051                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02052 
02053                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
02054                 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
02055                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
02056                 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
02057 
02058                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
02059                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
02060                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02061                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
02062                 _mm_storel_epi64((__m128i*)D, s0);
02063 
02064                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
02065                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
02066                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02067                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
02068                 _mm_storel_epi64((__m128i*)(D+3), s0);
02069             }
02070         else
02071         {
02072             CV_Assert(cn == 4);
02073             int v[] = { 0, 0, -1, -1 };
02074             __m128i mask = _mm_loadu_si128((const __m128i*)v);
02075 
02076             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
02077             {
02078                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02079                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02080 
02081                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
02082                 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
02083                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
02084                 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
02085 
02086                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
02087                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
02088                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02089                 __m128i res0 = _mm_srli_epi16(s0, 2);
02090 
02091                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
02092                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
02093                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
02094                 __m128i res1 = _mm_srli_epi16(s0, 2);
02095                 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
02096                                                    _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
02097                 _mm_storel_epi64((__m128i*)(D), s0);
02098             }
02099         }
02100 
02101         return dx;
02102     }
02103 
02104 private:
02105     int cn;
02106     bool use_simd;
02107     int step;
02108 };
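
// Scalar model of the masklow trick in the SSE2 cn == 1 branch above: each
// 16-bit lane of r0/r1 holds two neighbouring 8-bit pixels, so the pair sum
// is (lane >> 8) + (lane & 0x00ff), and the rounded 2x2 mean follows as
// (s0 + s1 + 2) >> 2. Illustrative sketch, not part of the original file:
static uchar avg2x2_from_lanes(unsigned short lane0, unsigned short lane1)
{
    int s0 = (lane0 >> 8) + (lane0 & 0x00ff);   // top-row pixel pair
    int s1 = (lane1 >> 8) + (lane1 & 0x00ff);   // bottom-row pixel pair
    return (uchar)((s0 + s1 + 2) >> 2);
}
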
02109 
02110 class ResizeAreaFastVec_SIMD_16u
02111 {
02112 public:
02113     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
02114         cn(_cn), step(_step)
02115     {
02116         use_simd = checkHardwareSupport(CV_CPU_SSE2);
02117     }
02118 
02119     int operator() (const ushort* S, ushort* D, int w) const
02120     {
02121         if (!use_simd)
02122             return 0;
02123 
02124         int dx = 0;
02125         const ushort* S0 = (const ushort*)S;
02126         const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
02127         __m128i masklow = _mm_set1_epi32(0x0000ffff);
02128         __m128i zero = _mm_setzero_si128();
02129         __m128i delta2 = _mm_set1_epi32(2);
02130 
02131 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
02132 
02133         if (cn == 1)
02134         {
02135             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02136             {
02137                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02138                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02139 
02140                 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
02141                 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
02142                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
02143                 s0 = _mm_srli_epi32(s0, 2);
02144                 s0 = _mm_packus_epi32(s0, zero);
02145 
02146                 _mm_storel_epi64((__m128i*)D, s0);
02147             }
02148         }
02149         else if (cn == 3)
02150             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
02151             {
02152                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02153                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02154 
02155                 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
02156                 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
02157                 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
02158                 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
02159 
02160                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
02161                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
02162                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
02163                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
02164                 _mm_storel_epi64((__m128i*)D, s0);
02165             }
02166         else
02167         {
02168             CV_Assert(cn == 4);
02169             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02170             {
02171                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02172                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02173 
02174                 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
02175                 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
02176                 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
02177                 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
02178 
02179                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
02180                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
02181                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
02182                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
02183                 _mm_storel_epi64((__m128i*)D, s0);
02184             }
02185         }
02186 
02187 #undef _mm_packus_epi32
02188 
02189         return dx;
02190     }
02191 
02192 private:
02193     int cn;
02194     int step;
02195     bool use_simd;
02196 };
02197 
02198 class ResizeAreaFastVec_SIMD_16s
02199 {
02200 public:
02201     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
02202         cn(_cn), step(_step)
02203     {
02204         use_simd = checkHardwareSupport(CV_CPU_SSE2);
02205     }
02206 
02207     int operator() (const short* S, short* D, int w) const
02208     {
02209         if (!use_simd)
02210             return 0;
02211 
02212         int dx = 0;
02213         const short* S0 = (const short*)S;
02214         const short* S1 = (const short*)((const uchar*)(S) + step);
02215         __m128i masklow = _mm_set1_epi32(0x0000ffff);
02216         __m128i zero = _mm_setzero_si128();
02217         __m128i delta2 = _mm_set1_epi32(2);
02218 
02219         if (cn == 1)
02220         {
02221             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02222             {
02223                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02224                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02225 
02226                 __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
02227                     _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
02228                 __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
02229                     _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
02230                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
02231                 s0 = _mm_srai_epi32(s0, 2);
02232                 s0 = _mm_packs_epi32(s0, zero);
02233 
02234                 _mm_storel_epi64((__m128i*)D, s0);
02235             }
02236         }
02237         else if (cn == 3)
02238             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
02239             {
02240                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02241                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02242 
02243                 __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
02244                 __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
02245                 __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
02246                 __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
02247 
02248                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
02249                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
02250                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
02251                 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
02252                 _mm_storel_epi64((__m128i*)D, s0);
02253             }
02254         else
02255         {
02256             CV_Assert(cn == 4);
02257             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02258             {
02259                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
02260                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
02261 
02262                 __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
02263                 __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
02264                 __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
02265                 __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
02266 
02267                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
02268                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
02269                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
02270                 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
02271                 _mm_storel_epi64((__m128i*)D, s0);
02272             }
02273         }
02274 
02275         return dx;
02276     }
02277 
02278 private:
02279     int cn;
02280     int step;
02281     bool use_simd;
02282 };
02283 
02284 struct ResizeAreaFastVec_SIMD_32f
02285 {
02286     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
02287         cn(_cn), step(_step)
02288     {
02289         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
02290         fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
02291     }
02292 
02293     int operator() (const float * S, float * D, int w) const
02294     {
02295         if (!fast_mode)
02296             return 0;
02297 
02298         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
02299         int dx = 0;
02300 
02301         __m128 v_025 = _mm_set1_ps(0.25f);
02302 
02303         if (cn == 1)
02304         {
02305             const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
02306             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02307             {
02308                 __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
02309                        v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
02310 
02311                 __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
02312                                            _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
02313                 __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
02314                                            _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
02315 
02316                 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
02317             }
02318         }
02319         else if (cn == 4)
02320         {
02321             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
02322             {
02323                 __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
02324                 __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
02325 
02326                 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
02327             }
02328         }
02329 
02330         return dx;
02331     }
02332 
02333 private:
02334     int cn;
02335     bool fast_mode;
02336     int step;
02337 };
02338 
02339 #else
02340 
02341 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
02342 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
02343 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
02344 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
02345 
02346 #endif
02347 
02348 template<typename T, typename SIMDVecOp>
02349 struct ResizeAreaFastVec
02350 {
02351     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
02352         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
02353     {
02354         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
02355     }
02356 
02357     int operator() (const T* S, T* D, int w) const
02358     {
02359         if (!fast_mode)
02360             return 0;
02361 
02362         const T* nextS = (const T*)((const uchar*)S + step);
02363         int dx = vecOp(S, D, w);
02364 
02365         if (cn == 1)
02366             for( ; dx < w; ++dx )
02367             {
02368                 int index = dx*2;
02369                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
02370             }
02371         else if (cn == 3)
02372             for( ; dx < w; dx += 3 )
02373             {
02374                 int index = dx*2;
02375                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
02376                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
02377                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
02378             }
02379         else
02380             {
02381                 CV_Assert(cn == 4);
02382                 for( ; dx < w; dx += 4 )
02383                 {
02384                     int index = dx*2;
02385                     D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
02386                     D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
02387                     D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
02388                     D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
02389                 }
02390             }
02391 
02392         return dx;
02393     }
02394 
02395 private:
02396     int scale_x, scale_y;
02397     int cn;
02398     bool fast_mode;
02399     int step;
02400     SIMDVecOp vecOp;
02401 };
02402 
02403 template <typename T, typename WT, typename VecOp>
02404 class resizeAreaFast_Invoker :
02405     public ParallelLoopBody
02406 {
02407 public:
02408     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
02409         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
02410         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
02411         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
02412     {
02413     }
02414 
02415     virtual void operator() (const Range& range) const
02416     {
02417         Size ssize = src.size(), dsize = dst.size();
02418         int cn = src.channels();
02419         int area = scale_x*scale_y;
02420         float scale = 1.f/(area);
02421         int dwidth1 = (ssize.width/scale_x)*cn;
02422         dsize.width *= cn;
02423         ssize.width *= cn;
02424         int dy, dx, k = 0;
02425 
02426         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
02427 
02428         for( dy = range.start; dy < range.end; dy++ )
02429         {
02430             T* D = (T*)(dst.data + dst.step*dy);
02431             int sy0 = dy*scale_y;
02432             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
02433 
02434             if( sy0 >= ssize.height )
02435             {
02436                 for( dx = 0; dx < dsize.width; dx++ )
02437                     D[dx] = 0;
02438                 continue;
02439             }
02440 
02441             dx = vop(src.template ptr<T>(sy0), D, w);
02442             for( ; dx < w; dx++ )
02443             {
02444                 const T* S = src.template ptr<T>(sy0) + xofs[dx];
02445                 WT sum = 0;
02446                 k = 0;
02447                 #if CV_ENABLE_UNROLLED
02448                 for( ; k <= area - 4; k += 4 )
02449                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
02450                 #endif
02451                 for( ; k < area; k++ )
02452                     sum += S[ofs[k]];
02453 
02454                 D[dx] = saturate_cast<T>(sum * scale);
02455             }
02456 
02457             for( ; dx < dsize.width; dx++ )
02458             {
02459                 WT sum = 0;
02460                 int count = 0, sx0 = xofs[dx];
02461                 if( sx0 >= ssize.width )
02462                 {   D[dx] = 0; continue; }   // no source pixels to average; skip to avoid 0/0 below
02463 
02464                 for( int sy = 0; sy < scale_y; sy++ )
02465                 {
02466                     if( sy0 + sy >= ssize.height )
02467                         break;
02468                     const T* S = src.template ptr<T>(sy0 + sy) + sx0;
02469                     for( int sx = 0; sx < scale_x*cn; sx += cn )
02470                     {
02471                         if( sx0 + sx >= ssize.width )
02472                             break;
02473                         sum += S[sx];
02474                         count++;
02475                     }
02476                 }
02477 
02478                 D[dx] = saturate_cast<T>((float)sum/count);
02479             }
02480         }
02481     }
02482 
02483 private:
02484     Mat src;
02485     Mat dst;
02486     int scale_x, scale_y;
02487     const int *ofs, *xofs;
02488 };
02489 
02490 template<typename T, typename WT, typename VecOp>
02491 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
02492                              int scale_x, int scale_y )
02493 {
02494     Range range(0, dst.rows);
02495     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
02496         scale_y, ofs, xofs);
02497     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
02498 }
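
// The third parallel_for_ argument above is the nstripes hint:
// dst.total()/2^16 requests roughly one stripe per 64K destination pixels,
// so small images stay single-threaded while large ones split across rows.
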
02499 
02500 struct DecimateAlpha
02501 {
02502     int si, di;
02503     float alpha;
02504 };
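
// Each DecimateAlpha entry contributes source column si to destination column
// di with weight alpha; for any fixed di the alphas sum to 1. A worked example
// (illustrative): shrinking 5 columns to 2 (scale = 2.5) yields
//   d0: {si=0, a=0.4} {si=1, a=0.4} {si=2, a=0.2}
//   d1: {si=2, a=0.2} {si=3, a=0.4} {si=4, a=0.4}
// where source column 2 straddles the cell boundary and is split between both.
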
02505 
02506 
02507 template<typename T, typename WT> class ResizeArea_Invoker :
02508     public ParallelLoopBody
02509 {
02510 public:
02511     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
02512                         const DecimateAlpha* _xtab, int _xtab_size,
02513                         const DecimateAlpha* _ytab, int _ytab_size,
02514                         const int* _tabofs )
02515     {
02516         src = &_src;
02517         dst = &_dst;
02518         xtab0 = _xtab;
02519         xtab_size0 = _xtab_size;
02520         ytab = _ytab;
02521         ytab_size = _ytab_size;
02522         tabofs = _tabofs;
02523     }
02524 
02525     virtual void operator() (const Range& range) const
02526     {
02527         Size dsize = dst->size();
02528         int cn = dst->channels();
02529         dsize.width *= cn;
02530         AutoBuffer<WT> _buffer(dsize.width*2);
02531         const DecimateAlpha* xtab = xtab0;
02532         int xtab_size = xtab_size0;
02533         WT *buf = _buffer, *sum = buf + dsize.width;
02534         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
02535 
02536         for( dx = 0; dx < dsize.width; dx++ )
02537             sum[dx] = (WT)0;
02538 
02539         for( j = j_start; j < j_end; j++ )
02540         {
02541             WT beta = ytab[j].alpha;
02542             int dy = ytab[j].di;
02543             int sy = ytab[j].si;
02544 
02545             {
02546                 const T* S = src->template ptr<T>(sy);
02547                 for( dx = 0; dx < dsize.width; dx++ )
02548                     buf[dx] = (WT)0;
02549 
02550                 if( cn == 1 )
02551                     for( k = 0; k < xtab_size; k++ )
02552                     {
02553                         int dxn = xtab[k].di;
02554                         WT alpha = xtab[k].alpha;
02555                         buf[dxn] += S[xtab[k].si]*alpha;
02556                     }
02557                 else if( cn == 2 )
02558                     for( k = 0; k < xtab_size; k++ )
02559                     {
02560                         int sxn = xtab[k].si;
02561                         int dxn = xtab[k].di;
02562                         WT alpha = xtab[k].alpha;
02563                         WT t0 = buf[dxn] + S[sxn]*alpha;
02564                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
02565                         buf[dxn] = t0; buf[dxn+1] = t1;
02566                     }
02567                 else if( cn == 3 )
02568                     for( k = 0; k < xtab_size; k++ )
02569                     {
02570                         int sxn = xtab[k].si;
02571                         int dxn = xtab[k].di;
02572                         WT alpha = xtab[k].alpha;
02573                         WT t0 = buf[dxn] + S[sxn]*alpha;
02574                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
02575                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
02576                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
02577                     }
02578                 else if( cn == 4 )
02579                 {
02580                     for( k = 0; k < xtab_size; k++ )
02581                     {
02582                         int sxn = xtab[k].si;
02583                         int dxn = xtab[k].di;
02584                         WT alpha = xtab[k].alpha;
02585                         WT t0 = buf[dxn] + S[sxn]*alpha;
02586                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
02587                         buf[dxn] = t0; buf[dxn+1] = t1;
02588                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
02589                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
02590                         buf[dxn+2] = t0; buf[dxn+3] = t1;
02591                     }
02592                 }
02593                 else
02594                 {
02595                     for( k = 0; k < xtab_size; k++ )
02596                     {
02597                         int sxn = xtab[k].si;
02598                         int dxn = xtab[k].di;
02599                         WT alpha = xtab[k].alpha;
02600                         for( int c = 0; c < cn; c++ )
02601                             buf[dxn + c] += S[sxn + c]*alpha;
02602                     }
02603                 }
02604             }
02605 
02606             if( dy != prev_dy )
02607             {
02608                 T* D = dst->template ptr<T>(prev_dy);
02609 
02610                 for( dx = 0; dx < dsize.width; dx++ )
02611                 {
02612                     D[dx] = saturate_cast<T>(sum[dx]);
02613                     sum[dx] = beta*buf[dx];
02614                 }
02615                 prev_dy = dy;
02616             }
02617             else
02618             {
02619                 for( dx = 0; dx < dsize.width; dx++ )
02620                     sum[dx] += beta*buf[dx];
02621             }
02622         }
02623 
02624         {
02625         T* D = dst->template ptr<T>(prev_dy);
02626         for( dx = 0; dx < dsize.width; dx++ )
02627             D[dx] = saturate_cast<T>(sum[dx]);
02628         }
02629     }
02630 
02631 private:
02632     const Mat* src;
02633     Mat* dst;
02634     const DecimateAlpha* xtab0;
02635     const DecimateAlpha* ytab;
02636     int xtab_size0, ytab_size;
02637     const int* tabofs;
02638 };
02639 
02640 
02641 template <typename T, typename WT>
02642 static void resizeArea_( const Mat& src, Mat& dst,
02643                          const DecimateAlpha* xtab, int xtab_size,
02644                          const DecimateAlpha* ytab, int ytab_size,
02645                          const int* tabofs )
02646 {
02647     parallel_for_(Range(0, dst.rows),
02648                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
02649                  dst.total()/((double)(1 << 16)));
02650 }
02651 
02652 
02653 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
02654                             const int* xofs, const void* alpha,
02655                             const int* yofs, const void* beta,
02656                             int xmin, int xmax, int ksize );
02657 
02658 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
02659                                     const int* ofs, const int *xofs,
02660                                     int scale_x, int scale_y );
02661 
02662 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
02663                                 const DecimateAlpha* xtab, int xtab_size,
02664                                 const DecimateAlpha* ytab, int ytab_size,
02665                                 const int* yofs);
02666 
02667 
02668 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
02669 {
02670     int k = 0;
02671     for(int dx = 0; dx < dsize; dx++ )
02672     {
02673         double fsx1 = dx * scale;
02674         double fsx2 = fsx1 + scale;
02675         double cellWidth = std::min(scale, ssize - fsx1);
02676 
02677         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
02678 
02679         sx2 = std::min(sx2, ssize - 1);
02680         sx1 = std::min(sx1, sx2);
02681 
02682         if( sx1 - fsx1 > 1e-3 )
02683         {
02684             assert( k < ssize*2 );
02685             tab[k].di = dx * cn;
02686             tab[k].si = (sx1 - 1) * cn;
02687             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
02688         }
02689 
02690         for(int sx = sx1; sx < sx2; sx++ )
02691         {
02692             assert( k < ssize*2 );
02693             tab[k].di = dx * cn;
02694             tab[k].si = sx * cn;
02695             tab[k++].alpha = float(1.0 / cellWidth);
02696         }
02697 
02698         if( fsx2 - sx2 > 1e-3 )
02699         {
02700             assert( k < ssize*2 );
02701             tab[k].di = dx * cn;
02702             tab[k].si = sx2 * cn;
02703             tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
02704         }
02705     }
02706     return k;
02707 }
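// Editor's gloss -- a worked example of the table built above (illustrative
// values, not part of the original source): ssize = 6, dsize = 4, cn = 1,
// scale = 1.5. For dx = 0: fsx1 = 0, fsx2 = 1.5, cellWidth = 1.5, sx1 = 0,
// sx2 = 1, producing
//     { di = 0, si = 0, alpha = 1/1.5   ~ 0.667 }   // all of source pixel 0
//     { di = 0, si = 1, alpha = 0.5/1.5 ~ 0.333 }   // half of source pixel 1
// Each destination pixel's alphas sum to 1, so the result is a true area
// average over the span of source pixels it covers.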
02708 
02709 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }
02710 
02711 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
02712     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
02713     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
02714     specBuf.allocate(specSize);\
02715     pSpec = (uchar*)specBuf;\
02716     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
02717 
02718 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
02719     if (mode == (int)ippCubic) { *ok = false; return; } \
02720     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
02721     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
02722     specBuf.allocate(specSize);\
02723     pSpec = (uchar*)specBuf;\
02724     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
02725     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
02726     getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
02727 
02728 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
02729     func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
02730     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
02731     specBuf.allocate(specSize);\
02732     pSpec = (uchar*)specBuf;\
02733     AutoBuffer<uchar> buf(initSize);\
02734     uchar* pInit = (uchar*)buf;\
02735     CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
02736 
02737 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
02738     if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
02739     else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
02740     else { *ok = false; return; } \
02741     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
02742     getSrcOffsetFunc =  (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
02743 
02744 #if IPP_VERSION_X100 >= 710
02745 class IPPresizeInvoker :
02746     public ParallelLoopBody
02747 {
02748 public:
02749     IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
02750         ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
02751         inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
02752         func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
02753     {
02754         *ok = true;
02755         IppiSize srcSize, dstSize;
02756         int type = src.type(), specSize = 0, initSize = 0;
02757         srcSize.width  = src.cols;
02758         srcSize.height = src.rows;
02759         dstSize.width  = dst.cols;
02760         dstSize.height = dst.rows;
02761 
02762         switch (type)
02763         {
02764 #if IPP_DISABLE_BLOCK // disabled since it breaks tests for CascadeClassifier
02765             case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
02766             case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
02767             case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
02768 #endif
02769             case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
02770             case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
02771             case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
02772             case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
02773             case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
02774             case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
02775             case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
02776             case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
02777             case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
02778             case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
02779             case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
02780             case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
02781             default: { *ok = false; return; } break;
02782         }
02783     }
02784 
02785     ~IPPresizeInvoker()
02786     {
02787     }
02788 
02789     virtual void operator() (const Range& range) const
02790     {
02791         if (*ok == false)
02792             return;
02793 
02794         int cn = src.channels();
02795         int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
02796         int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
02797         int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
02798 
02799         IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
02800         IppiSize  dstSize   = { dstwidth, dstheight - dsty };
02801         int bufsize = 0, itemSize = (int)src.elemSize1();
02802 
02803         CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
02804         CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
02805 
02806         const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
02807         Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
02808 
02809         AutoBuffer<uchar> buf(bufsize + 64);
02810         uchar* bufptr = alignPtr((uchar*)buf, 32);
02811 
02812         if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
02813             *ok = false;
02814         else
02815         {
02816             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
02817         }
02818     }
02819 private:
02820     const Mat & src;
02821     Mat & dst;
02822     double inv_scale_x;
02823     double inv_scale_y;
02824     void *pSpec;
02825     AutoBuffer<uchar> specBuf;
02826     int mode;
02827     ippiResizeFunc func;
02828     ippiResizeGetBufferSize getBufferSizeFunc;
02829     ippiResizeGetSrcOffset getSrcOffsetFunc;
02830     bool *ok;
02831     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
02832 };
02833 
02834 #endif
02835 
02836 #ifdef HAVE_OPENCL
02837 
02838 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
02839                                       float * const alpha_tab, int * const ofs_tab)
02840 {
02841     int k = 0, dx = 0;
02842     for ( ; dx < dsize; dx++)
02843     {
02844         ofs_tab[dx] = k;
02845 
02846         double fsx1 = dx * scale;
02847         double fsx2 = fsx1 + scale;
02848         double cellWidth = std::min(scale, ssize - fsx1);
02849 
02850         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
02851 
02852         sx2 = std::min(sx2, ssize - 1);
02853         sx1 = std::min(sx1, sx2);
02854 
02855         if (sx1 - fsx1 > 1e-3)
02856         {
02857             map_tab[k] = sx1 - 1;
02858             alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
02859         }
02860 
02861         for (int sx = sx1; sx < sx2; sx++)
02862         {
02863             map_tab[k] = sx;
02864             alpha_tab[k++] = float(1.0 / cellWidth);
02865         }
02866 
02867         if (fsx2 - sx2 > 1e-3)
02868         {
02869             map_tab[k] = sx2;
02870             alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
02871         }
02872     }
02873     ofs_tab[dx] = k;
02874 }
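// Editor's note: unlike computeResizeAreaTab() above, this OpenCL variant also
// fills ofs_tab: ofs_tab[dx] is the index of the first table entry belonging to
// destination index dx, and the trailing ofs_tab[dsize] = k is a sentinel, so
// the kernel can read each pixel's span as [ofs_tab[dx], ofs_tab[dx+1]).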
02875 
02876 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
02877                         double fx, double fy, int interpolation)
02878 {
02879     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
02880 
02881     double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
02882     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
02883     int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
02884     bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
02885         std::abs(inv_fy - iscale_y) < DBL_EPSILON;
02886 
02887     // when scale_x and scale_y are both equal to 2,
02888     // INTER_AREA (fast) is equivalent to INTER_LINEAR
02889     if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
02890         /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
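    // Editor's gloss on why the two coincide (a quick check, not in the
    // original source): for an exact 2x shrink, bilinear samples at
    // fx = (dx + 0.5)*2 - 0.5 = 2*dx + 0.5, i.e. exactly half-way between two
    // source columns, giving weights (0.5, 0.5) in x and likewise in y -- the
    // same four 0.25 weights that the fast AREA path averages with.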
02891 
02892     if( !(cn <= 4 &&
02893            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
02894             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
02895         return false;
02896 
02897     UMat src = _src.getUMat();
02898     _dst.create(dsize, type);
02899     UMat dst = _dst.getUMat();
02900 
02901     Size ssize = src.size();
02902     ocl::Kernel k;
02903     size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
02904 
02905     ocl::Image2D srcImage;
02906 
02907     // See if this could be done with a sampler.  We stick with integer
02908     // datatypes because the observed error is low.
02909     bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
02910                        ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
02911                        ocl::Image2D::isFormatSupported(depth, cn, true) &&
02912                        src.offset==0);
02913     if (useSampler)
02914     {
02915         int wdepth = std::max(depth, CV_32S);
02916         char buf[2][32];
02917         cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
02918                         "-D convertToDT=%s -D cn=%d",
02919                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
02920                         ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
02921                         cn);
02922         k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
02923 
02924         if (k.empty())
02925             useSampler = false;
02926         else
02927         {
02928             // Convert the input into an OpenCL image type, using normalized channel data types
02929             // and aliasing the UMat.
02930             srcImage = ocl::Image2D(src, true, true);
02931             k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
02932                    (float)inv_fx, (float)inv_fy);
02933         }
02934     }
02935 
02936     if (interpolation == INTER_LINEAR && !useSampler)
02937     {
02938         char buf[2][32];
02939 
02940         // the integer path is slower because of its CPU-side table setup, so it is disabled
02941         if (depth == CV_8U && ((void)0, 0))
02942         {
02943             AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
02944             int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
02945             short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
02946             float fxx, fyy;
02947             int sx, sy;
02948 
02949             for (int dx = 0; dx < dsize.width; dx++)
02950             {
02951                 fxx = (float)((dx+0.5)*inv_fx - 0.5);
02952                 sx = cvFloor(fxx);
02953                 fxx -= sx;
02954 
02955                 if (sx < 0)
02956                     fxx = 0, sx = 0;
02957 
02958                 if (sx >= ssize.width-1)
02959                     fxx = 0, sx = ssize.width-1;
02960 
02961                 xofs[dx] = sx;
02962                 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
02963                 ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
02964             }
02965 
02966             for (int dy = 0; dy < dsize.height; dy++)
02967             {
02968                 fyy = (float)((dy+0.5)*inv_fy - 0.5);
02969                 sy = cvFloor(fyy);
02970                 fyy -= sy;
02971 
02972                 yofs[dy] = sy;
02973                 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
02974                 ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
02975             }
02976 
02977             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
02978             UMat coeffs;
02979             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
02980 
02981             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
02982                      format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
02983                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
02984                             "-D INTER_RESIZE_COEF_BITS=%d",
02985                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
02986                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
02987                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
02988                             cn, INTER_RESIZE_COEF_BITS));
02989             if (k.empty())
02990                 return false;
02991 
02992             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
02993                    ocl::KernelArg::PtrReadOnly(coeffs));
02994         }
02995         else
02996         {
02997             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
02998             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
02999                      format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
03000                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
03001                             "-D INTER_RESIZE_COEF_BITS=%d",
03002                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
03003                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
03004                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
03005                             cn, INTER_RESIZE_COEF_BITS));
03006             if (k.empty())
03007                 return false;
03008 
03009             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
03010                    (float)inv_fx, (float)inv_fy);
03011         }
03012     }
03013     else if (interpolation == INTER_NEAREST)
03014     {
03015         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
03016                  format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
03017                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
03018         if (k.empty())
03019             return false;
03020 
03021         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
03022                (float)inv_fx, (float)inv_fy);
03023     }
03024     else if (interpolation == INTER_AREA)
03025     {
03026         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
03027         int wtype = CV_MAKE_TYPE(wdepth, cn);
03028 
03029         char cvt[2][40];
03030         String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
03031                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
03032                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
03033 
03034         UMat alphaOcl, tabofsOcl, mapOcl;
03035         UMat dmap, smap;
03036 
03037         if (is_area_fast)
03038         {
03039             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
03040             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
03041                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
03042                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
03043                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
03044                                     iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
03045 
03046             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
03047             if (k.empty())
03048                 return false;
03049         }
03050         else
03051         {
03052             buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
03053             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
03054             if (k.empty())
03055                 return false;
03056 
03057             int xytab_size = (ssize.width + ssize.height) << 1;
03058             int tabofs_size = dsize.height + dsize.width + 2;
03059 
03060             AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
03061             AutoBuffer<float> _xyalpha_tab(xytab_size);
03062             int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
03063             float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
03064             int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
03065 
03066             ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
03067             ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
03068 
03069             // upload the precomputed tables to the GPU
03070             Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
03071             Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
03072             Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
03073         }
03074 
03075         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
03076 
03077         if (is_area_fast)
03078             k.args(srcarg, dstarg);
03079         else
03080             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
03081                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
03082 
03083         return k.run(2, globalsize, NULL, false);
03084     }
03085 
03086     return k.run(2, globalsize, 0, false);
03087 }
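// Usage sketch (editor's addition, illustrative only -- not part of this file):
// ocl_resize() is never called directly; it is reached via the transparent-API
// dispatch in cv::resize() below whenever the caller works with UMat, e.g.
//
//     cv::UMat usrc = cv::imread("in.png").getUMat(cv::ACCESS_READ), udst;
//     cv::resize(usrc, udst, cv::Size(), 0.5, 0.5, cv::INTER_AREA);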
03088 
03089 #endif
03090 
03091 #if IPP_VERSION_X100 >= 710
03092 static bool ipp_resize_mt( Mat src, Mat dst,
03093                            double inv_scale_x, double inv_scale_y, int interpolation)
03094 {
03095     int mode = -1;
03096     if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
03097         mode = ippLinear;
03098     else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
03099         mode = ippCubic;
03100     else
03101         return false;
03102 
03103     bool ok = true;
03104     Range range(0, src.rows);
03105     IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
03106     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
03107     return ok;
03111 }
03112 #endif
03113 
03114 }
03115 
03116 
03117 
03118 //////////////////////////////////////////////////////////////////////////////////////////
03119 
03120 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
03121                  double inv_scale_x, double inv_scale_y, int interpolation )
03122 {
03123     static ResizeFunc linear_tab[] =
03124     {
03125         resizeGeneric_<
03126             HResizeLinear<uchar, int, short,
03127                 INTER_RESIZE_COEF_SCALE,
03128                 HResizeLinearVec_8u32s>,
03129             VResizeLinear<uchar, int, short,
03130                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
03131                 VResizeLinearVec_32s8u> >,
03132         0,
03133         resizeGeneric_<
03134             HResizeLinear<ushort, float, float, 1,
03135                 HResizeLinearVec_16u32f>,
03136             VResizeLinear<ushort, float, float, Cast<float, ushort>,
03137                 VResizeLinearVec_32f16u> >,
03138         resizeGeneric_<
03139             HResizeLinear<short, float, float, 1,
03140                 HResizeLinearVec_16s32f>,
03141             VResizeLinear<short, float, float, Cast<float, short>,
03142                 VResizeLinearVec_32f16s> >,
03143         0,
03144         resizeGeneric_<
03145             HResizeLinear<float, float, float, 1,
03146                 HResizeLinearVec_32f>,
03147             VResizeLinear<float, float, float, Cast<float, float>,
03148                 VResizeLinearVec_32f> >,
03149         resizeGeneric_<
03150             HResizeLinear<double, double, float, 1,
03151                 HResizeNoVec>,
03152             VResizeLinear<double, double, float, Cast<double, double>,
03153                 VResizeNoVec> >,
03154         0
03155     };
03156 
03157     static ResizeFunc cubic_tab[] =
03158     {
03159         resizeGeneric_<
03160             HResizeCubic<uchar, int, short>,
03161             VResizeCubic<uchar, int, short,
03162                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
03163                 VResizeCubicVec_32s8u> >,
03164         0,
03165         resizeGeneric_<
03166             HResizeCubic<ushort, float, float>,
03167             VResizeCubic<ushort, float, float, Cast<float, ushort>,
03168             VResizeCubicVec_32f16u> >,
03169         resizeGeneric_<
03170             HResizeCubic<short, float, float>,
03171             VResizeCubic<short, float, float, Cast<float, short>,
03172             VResizeCubicVec_32f16s> >,
03173         0,
03174         resizeGeneric_<
03175             HResizeCubic<float, float, float>,
03176             VResizeCubic<float, float, float, Cast<float, float>,
03177             VResizeCubicVec_32f> >,
03178         resizeGeneric_<
03179             HResizeCubic<double, double, float>,
03180             VResizeCubic<double, double, float, Cast<double, double>,
03181             VResizeNoVec> >,
03182         0
03183     };
03184 
03185     static ResizeFunc lanczos4_tab[] =
03186     {
03187         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
03188             VResizeLanczos4<uchar, int, short,
03189             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
03190             VResizeNoVec> >,
03191         0,
03192         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
03193             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
03194             VResizeLanczos4Vec_32f16u> >,
03195         resizeGeneric_<HResizeLanczos4<short, float, float>,
03196             VResizeLanczos4<short, float, float, Cast<float, short>,
03197             VResizeLanczos4Vec_32f16s> >,
03198         0,
03199         resizeGeneric_<HResizeLanczos4<float, float, float>,
03200             VResizeLanczos4<float, float, float, Cast<float, float>,
03201             VResizeLanczos4Vec_32f> >,
03202         resizeGeneric_<HResizeLanczos4<double, double, float>,
03203             VResizeLanczos4<double, double, float, Cast<double, double>,
03204             VResizeNoVec> >,
03205         0
03206     };
03207 
03208     static ResizeAreaFastFunc areafast_tab[] =
03209     {
03210         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
03211         0,
03212         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
03213         resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
03214         0,
03215         resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
03216         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
03217         0
03218     };
03219 
03220     static ResizeAreaFunc area_tab[] =
03221     {
03222         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
03223         resizeArea_<short, float>, 0, resizeArea_<float, float>,
03224         resizeArea_<double, double>, 0
03225     };
03226 
03227     Size ssize = _src.size();
03228 
03229     CV_Assert( ssize.area() > 0 );
03230     CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
03231     if( dsize.area() == 0 )
03232     {
03233         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
03234                      saturate_cast<int>(ssize.height*inv_scale_y));
03235         CV_Assert( dsize.area() > 0 );
03236     }
03237     else
03238     {
03239         inv_scale_x = (double)dsize.width/ssize.width;
03240         inv_scale_y = (double)dsize.height/ssize.height;
03241     }
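    // Editor's gloss, a quick numeric check of the branch above (illustrative
    // values): for a 640x480 source, dsize = Size(0,0) with inv_scale = 0.5
    // yields dsize = 320x240; conversely, an explicit dsize = Size(320,240)
    // makes the scales be recomputed as inv_scale_x = 320/640 = 0.5 and
    // inv_scale_y = 240/480 = 0.5.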
03242 
03243 
03244     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
03245     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
03246 
03247     int iscale_x = saturate_cast<int>(scale_x);
03248     int iscale_y = saturate_cast<int>(scale_y);
03249 
03250     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
03251             std::abs(scale_y - iscale_y) < DBL_EPSILON;
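    // Editor's note: e.g. inv_scale = 0.5 gives scale_x = 2.0 and iscale_x = 2,
    // so is_area_fast holds; inv_scale = 0.4 gives scale_x = 2.5, which is not
    // integral, and the fast integer-decimation path is skipped.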
03252 
03253 #ifdef HAVE_OPENCL
03254     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
03255                ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
03256 #endif
03257 
03258     Mat src = _src.getMat();
03259     _dst.create(dsize, src.type());
03260     Mat dst = _dst.getMat();
03261 
03262     if (dsize == ssize) {
03263       // Source and destination are the same size; use a plain copy.
03264       src.copyTo(dst);
03265       return;
03266     }
03267 
03268 #ifdef HAVE_TEGRA_OPTIMIZATION
03269     if (tegra::useTegra() && tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
03270         return;
03271 #endif
03272 
03273 #ifdef HAVE_IPP
03274     int mode = -1;
03275     if (interpolation == INTER_LINEAR && _src.rows() >= 2 && _src.cols() >= 2)
03276         mode = INTER_LINEAR;
03277     else if (interpolation == INTER_CUBIC && _src.rows() >= 4 && _src.cols() >= 4)
03278         mode = INTER_CUBIC;
03279 
03280     const double IPP_RESIZE_EPS = 1e-10;
03281     double ex = fabs((double)dsize.width / _src.cols()  - inv_scale_x) / inv_scale_x;
03282     double ey = fabs((double)dsize.height / _src.rows() - inv_scale_y) / inv_scale_y;
03283 #endif
03284     CV_IPP_RUN(IPP_VERSION_X100 >= 710 && ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
03285         (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
03286         !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U) &&
03287         mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
03288         (depth == CV_64F && mode == INTER_LINEAR)), ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation))
03289 
03290 
03291     if( interpolation == INTER_NEAREST )
03292     {
03293         resizeNN( src, dst, inv_scale_x, inv_scale_y );
03294         return;
03295     }
03296 
03297     int k, sx, sy, dx, dy;
03298 
03299 
03300     {
03301         // when scale_x and scale_y are both equal to 2,
03302         // INTER_AREA (fast) is equivalent to INTER_LINEAR
03303         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
03304             interpolation = INTER_AREA;
03305 
03306         // true "area" interpolation is only implemented when shrinking (scale_x >= 1 && scale_y >= 1),
03307         // matching the condition below; in other cases it is emulated using some variant of bilinear interpolation
03308         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
03309         {
03310             if( is_area_fast )
03311             {
03312                 int area = iscale_x*iscale_y;
03313                 size_t srcstep = src.step / src.elemSize1();
03314                 AutoBuffer<int>  _ofs(area + dsize.width*cn);
03315                 int* ofs = _ofs;
03316                 int* xofs = ofs + area;
03317                 ResizeAreaFastFunc func = areafast_tab[depth];
03318                 CV_Assert( func != 0 );
03319 
03320                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
03321                     for( sx = 0; sx < iscale_x; sx++ )
03322                         ofs[k++] = (int)(sy*srcstep + sx*cn);
03323 
03324                 for( dx = 0; dx < dsize.width; dx++ )
03325                 {
03326                     int j = dx * cn;
03327                     sx = iscale_x * j;
03328                     for( k = 0; k < cn; k++ )
03329                         xofs[j + k] = sx + k;
03330                 }
03331 
03332                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
03333                 return;
03334             }
03335 
03336             ResizeAreaFunc func = area_tab[depth];
03337             CV_Assert( func != 0 && cn <= 4 );
03338 
03339             AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
03340             DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
03341 
03342             int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
03343             int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
03344 
03345             AutoBuffer<int>  _tabofs(dsize.height + 1);
03346             int* tabofs = _tabofs;
03347             for( k = 0, dy = 0; k < ytab_size; k++ )
03348             {
03349                 if( k == 0 || ytab[k].di != ytab[k-1].di )
03350                 {
03351                     assert( ytab[k].di == dy );
03352                     tabofs[dy++] = k;
03353                 }
03354             }
03355             tabofs[dy] = ytab_size;
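            // Editor's note: tabofs[dy] now holds the index of destination row
            // dy's first entry in ytab, with tabofs[dsize.height] as a sentinel;
            // this is what lets ResizeArea_Invoker above split the work into
            // independent destination-row ranges under parallel_for_.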
03356 
03357             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
03358             return;
03359         }
03360     }
03361 
03362     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
03363     bool area_mode = interpolation == INTER_AREA;
03364     bool fixpt = depth == CV_8U;
03365     float fx, fy;
03366     ResizeFunc func=0;
03367     int ksize=0, ksize2;
03368     if( interpolation == INTER_CUBIC )
03369         ksize = 4, func = cubic_tab[depth];
03370     else if( interpolation == INTER_LANCZOS4 )
03371         ksize = 8, func = lanczos4_tab[depth];
03372     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
03373         ksize = 2, func = linear_tab[depth];
03374     else
03375         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
03376     ksize2 = ksize/2;
03377 
03378     CV_Assert( func != 0 );
03379 
03380     AutoBuffer<uchar>  _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
03381     int* xofs = (int*)(uchar*)_buffer;
03382     int* yofs = xofs + width;
03383     float* alpha = (float*)(yofs + dsize.height);
03384     short* ialpha = (short*)alpha;
03385     float* beta = alpha + width*ksize;
03386     short* ibeta = ialpha + width*ksize;
03387     float cbuf[MAX_ESIZE];
03388 
03389     for( dx = 0; dx < dsize.width; dx++ )
03390     {
03391         if( !area_mode )
03392         {
03393             fx = (float)((dx+0.5)*scale_x - 0.5);
03394             sx = cvFloor(fx);
03395             fx -= sx;
03396         }
03397         else
03398         {
03399             sx = cvFloor(dx*scale_x);
03400             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
03401             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
03402         }
03403 
03404         if( sx < ksize2-1 )
03405         {
03406             xmin = dx+1;
03407             if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
03408                 fx = 0, sx = 0;
03409         }
03410 
03411         if( sx + ksize2 >= ssize.width )
03412         {
03413             xmax = std::min( xmax, dx );
03414             if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
03415                 fx = 0, sx = ssize.width-1;
03416         }
03417 
03418         for( k = 0, sx *= cn; k < cn; k++ )
03419             xofs[dx*cn + k] = sx + k;
03420 
03421         if( interpolation == INTER_CUBIC )
03422             interpolateCubic( fx, cbuf );
03423         else if( interpolation == INTER_LANCZOS4 )
03424             interpolateLanczos4( fx, cbuf );
03425         else
03426         {
03427             cbuf[0] = 1.f - fx;
03428             cbuf[1] = fx;
03429         }
03430         if( fixpt )
03431         {
03432             for( k = 0; k < ksize; k++ )
03433                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
03434             for( ; k < cn*ksize; k++ )
03435                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
03436         }
03437         else
03438         {
03439             for( k = 0; k < ksize; k++ )
03440                 alpha[dx*cn*ksize + k] = cbuf[k];
03441             for( ; k < cn*ksize; k++ )
03442                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
03443         }
03444     }
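    // Editor's gloss, a worked fixed-point example (illustrative): with
    // INTER_RESIZE_COEF_BITS = 11, INTER_RESIZE_COEF_SCALE = 2048, so the
    // weight pair for fx = 0.25 is stored as ialpha = (0.75*2048, 0.25*2048)
    // = (1536, 512); the vertical pass later shifts the accumulated products
    // back down by 2*INTER_RESIZE_COEF_BITS (see FixedPtCast above).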
03445 
03446     for( dy = 0; dy < dsize.height; dy++ )
03447     {
03448         if( !area_mode )
03449         {
03450             fy = (float)((dy+0.5)*scale_y - 0.5);
03451             sy = cvFloor(fy);
03452             fy -= sy;
03453         }
03454         else
03455         {
03456             sy = cvFloor(dy*scale_y);
03457             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
03458             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
03459         }
03460 
03461         yofs[dy] = sy;
03462         if( interpolation == INTER_CUBIC )
03463             interpolateCubic( fy, cbuf );
03464         else if( interpolation == INTER_LANCZOS4 )
03465             interpolateLanczos4( fy, cbuf );
03466         else
03467         {
03468             cbuf[0] = 1.f - fy;
03469             cbuf[1] = fy;
03470         }
03471 
03472         if( fixpt )
03473         {
03474             for( k = 0; k < ksize; k++ )
03475                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
03476         }
03477         else
03478         {
03479             for( k = 0; k < ksize; k++ )
03480                 beta[dy*ksize + k] = cbuf[k];
03481         }
03482     }
03483 
03484     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
03485           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
03486 }
03487 
03488 
03489 /****************************************************************************************\
03490 *                       General warping (affine, perspective, remap)                     *
03491 \****************************************************************************************/
03492 
03493 namespace cv
03494 {
03495 
03496 template<typename T>
03497 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
03498                           int borderType, const Scalar& _borderValue )
03499 {
03500     Size ssize = _src.size(), dsize = _dst.size();
03501     int cn = _src.channels();
03502     const T* S0 = _src.ptr<T>();
03503     size_t sstep = _src.step/sizeof(S0[0]);
03504     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
03505         saturate_cast<T>(_borderValue[1]),
03506         saturate_cast<T>(_borderValue[2]),
03507         saturate_cast<T>(_borderValue[3]));
03508     int dx, dy;
03509 
03510     unsigned width1 = ssize.width, height1 = ssize.height;
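    // Editor's note: width1/height1 are unsigned so that a single comparison
    // checks both bounds -- a negative coordinate wraps to a huge unsigned
    // value, so e.g. sx = -1 fails (unsigned)sx < width1 exactly as
    // sx = ssize.width does.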
03511 
03512     if( _dst.isContinuous() && _xy.isContinuous() )
03513     {
03514         dsize.width *= dsize.height;
03515         dsize.height = 1;
03516     }
03517 
03518     for( dy = 0; dy < dsize.height; dy++ )
03519     {
03520         T* D = _dst.ptr<T>(dy);
03521         const short* XY = _xy.ptr<short>(dy);
03522 
03523         if( cn == 1 )
03524         {
03525             for( dx = 0; dx < dsize.width; dx++ )
03526             {
03527                 int sx = XY[dx*2], sy = XY[dx*2+1];
03528                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
03529                     D[dx] = S0[sy*sstep + sx];
03530                 else
03531                 {
03532                     if( borderType == BORDER_REPLICATE )
03533                     {
03534                         sx = clip(sx, 0, ssize.width);
03535                         sy = clip(sy, 0, ssize.height);
03536                         D[dx] = S0[sy*sstep + sx];
03537                     }
03538                     else if( borderType == BORDER_CONSTANT )
03539                         D[dx] = cval[0];
03540                     else if( borderType != BORDER_TRANSPARENT )
03541                     {
03542                         sx = borderInterpolate(sx, ssize.width, borderType);
03543                         sy = borderInterpolate(sy, ssize.height, borderType);
03544                         D[dx] = S0[sy*sstep + sx];
03545                     }
03546                 }
03547             }
03548         }
03549         else
03550         {
03551             for( dx = 0; dx < dsize.width; dx++, D += cn )
03552             {
03553                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
03554                 const T *S;
03555                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
03556                 {
03557                     if( cn == 3 )
03558                     {
03559                         S = S0 + sy*sstep + sx*3;
03560                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
03561                     }
03562                     else if( cn == 4 )
03563                     {
03564                         S = S0 + sy*sstep + sx*4;
03565                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
03566                     }
03567                     else
03568                     {
03569                         S = S0 + sy*sstep + sx*cn;
03570                         for( k = 0; k < cn; k++ )
03571                             D[k] = S[k];
03572                     }
03573                 }
03574                 else if( borderType != BORDER_TRANSPARENT )
03575                 {
03576                     if( borderType == BORDER_REPLICATE )
03577                     {
03578                         sx = clip(sx, 0, ssize.width);
03579                         sy = clip(sy, 0, ssize.height);
03580                         S = S0 + sy*sstep + sx*cn;
03581                     }
03582                     else if( borderType == BORDER_CONSTANT )
03583                         S = &cval[0];
03584                     else
03585                     {
03586                         sx = borderInterpolate(sx, ssize.width, borderType);
03587                         sy = borderInterpolate(sy, ssize.height, borderType);
03588                         S = S0 + sy*sstep + sx*cn;
03589                     }
03590                     for( k = 0; k < cn; k++ )
03591                         D[k] = S[k];
03592                 }
03593             }
03594         }
03595     }
03596 }
03597 
03598 
03599 struct RemapNoVec
03600 {
03601     int operator()( const Mat&, void*, const short*, const ushort*,
03602                     const void*, int ) const { return 0; }
03603 };
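// Editor's note: the vectorized remap operators below follow the same protocol
// as this no-op fallback -- they return how many destination pixels they
// handled, and remapBilinear() finishes the remaining tail of each span with
// scalar code.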
03604 
03605 #if CV_SSE2
03606 
03607 struct RemapVec_8u
03608 {
03609     int operator()( const Mat& _src, void* _dst, const short* XY,
03610                     const ushort* FXY, const void* _wtab, int width ) const
03611     {
03612         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
03613 
03614         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
03615             sstep > 0x8000 )
03616             return 0;
03617 
03618         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
03619         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
03620         uchar* D = (uchar*)_dst;
03621         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
03622         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
03623         __m128i z = _mm_setzero_si128();
03624         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
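        // Editor's gloss (illustrative): xy2ofs packs (cn, sstep) into each
        // 32-bit lane so _mm_madd_epi16 turns an interleaved (sx, sy) pair
        // directly into the byte offset sx*cn + sy*sstep; e.g. with cn = 1,
        // sstep = 640, (sx, sy) = (3, 2) gives 3*1 + 2*640 = 1283, stored in
        // iofs0/iofs1 for the scalar gathers below.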
03625 
03626         if( cn == 1 )
03627         {
03628             for( ; x <= width - 8; x += 8 )
03629             {
03630                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
03631                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
03632                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
03633                 unsigned i0, i1;
03634 
03635                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
03636                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
03637                 _mm_store_si128( (__m128i*)iofs0, xy0 );
03638                 _mm_store_si128( (__m128i*)iofs1, xy1 );
03639 
03640                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
03641                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
03642                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03643                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
03644                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
03645                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03646                 v0 = _mm_unpacklo_epi8(v0, z);
03647                 v1 = _mm_unpacklo_epi8(v1, z);
03648 
03649                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
03650                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
03651                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
03652                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
03653                 b0 = _mm_unpacklo_epi64(a0, a1);
03654                 b1 = _mm_unpackhi_epi64(a0, a1);
03655                 v0 = _mm_madd_epi16(v0, b0);
03656                 v1 = _mm_madd_epi16(v1, b1);
03657                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
03658 
03659                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
03660                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
03661                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03662                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
03663                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
03664                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
03665                 v2 = _mm_unpacklo_epi8(v2, z);
03666                 v3 = _mm_unpacklo_epi8(v3, z);
03667 
03668                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
03669                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
03670                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
03671                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
03672                 b0 = _mm_unpacklo_epi64(a0, a1);
03673                 b1 = _mm_unpackhi_epi64(a0, a1);
03674                 v2 = _mm_madd_epi16(v2, b0);
03675                 v3 = _mm_madd_epi16(v3, b1);
03676                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
03677 
03678                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
03679                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
03680                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
03681                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
03682             }
03683         }
03684         else if( cn == 3 )
03685         {
03686             for( ; x <= width - 5; x += 4, D += 12 )
03687             {
03688                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
03689                 __m128i u0, v0, u1, v1;
03690 
03691                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
03692                 _mm_store_si128( (__m128i*)iofs0, xy0 );
03693                 const __m128i *w0, *w1;
03694                 w0 = (const __m128i*)(wtab + FXY[x]*16);
03695                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
03696 
03697                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
03698                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
03699                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
03700                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
03701                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
03702                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
03703                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
03704                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
03705                 u0 = _mm_unpacklo_epi8(u0, z);
03706                 v0 = _mm_unpacklo_epi8(v0, z);
03707                 u1 = _mm_unpacklo_epi8(u1, z);
03708                 v1 = _mm_unpacklo_epi8(v1, z);
03709                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03710                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03711                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03712                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03713                 u0 = _mm_slli_si128(u0, 4);
03714                 u0 = _mm_packs_epi32(u0, u1);
03715                 u0 = _mm_packus_epi16(u0, u0);
03716                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
03717 
03718                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
03719                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
03720 
03721                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
03722                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
03723                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
03724                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
03725                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
03726                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
03727                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
03728                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
03729                 u0 = _mm_unpacklo_epi8(u0, z);
03730                 v0 = _mm_unpacklo_epi8(v0, z);
03731                 u1 = _mm_unpacklo_epi8(u1, z);
03732                 v1 = _mm_unpacklo_epi8(v1, z);
03733                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03734                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03735                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03736                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03737                 u0 = _mm_slli_si128(u0, 4);
03738                 u0 = _mm_packs_epi32(u0, u1);
03739                 u0 = _mm_packus_epi16(u0, u0);
03740                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
03741             }
03742         }
03743         else if( cn == 4 )
03744         {
03745             for( ; x <= width - 4; x += 4, D += 16 )
03746             {
03747                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
03748                 __m128i u0, v0, u1, v1;
03749 
03750                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
03751                 _mm_store_si128( (__m128i*)iofs0, xy0 );
03752                 const __m128i *w0, *w1;
03753                 w0 = (const __m128i*)(wtab + FXY[x]*16);
03754                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
03755 
03756                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
03757                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
03758                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
03759                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
03760                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
03761                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
03762                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
03763                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
03764                 u0 = _mm_unpacklo_epi8(u0, z);
03765                 v0 = _mm_unpacklo_epi8(v0, z);
03766                 u1 = _mm_unpacklo_epi8(u1, z);
03767                 v1 = _mm_unpacklo_epi8(v1, z);
03768                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03769                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03770                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03771                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03772                 u0 = _mm_packs_epi32(u0, u1);
03773                 u0 = _mm_packus_epi16(u0, u0);
03774                 _mm_storel_epi64((__m128i*)D, u0);
03775 
03776                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
03777                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
03778 
03779                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
03780                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
03781                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
03782                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
03783                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
03784                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
03785                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
03786                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
03787                 u0 = _mm_unpacklo_epi8(u0, z);
03788                 v0 = _mm_unpacklo_epi8(v0, z);
03789                 u1 = _mm_unpacklo_epi8(u1, z);
03790                 v1 = _mm_unpacklo_epi8(v1, z);
03791                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
03792                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
03793                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
03794                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
03795                 u0 = _mm_packs_epi32(u0, u1);
03796                 u0 = _mm_packus_epi16(u0, u0);
03797                 _mm_storel_epi64((__m128i*)(D + 8), u0);
03798             }
03799         }
03800 
03801         return x;
03802     }
03803 };
03804 
03805 #else
03806 
03807 typedef RemapNoVec RemapVec_8u;
03808 
03809 #endif
03810 
03811 
03812 template<class CastOp, class VecOp, typename AT>
03813 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
03814                            const Mat& _fxy, const void* _wtab,
03815                            int borderType, const Scalar& _borderValue )
03816 {
03817     typedef typename CastOp::rtype T;
03818     typedef typename CastOp::type1 WT;
03819     Size ssize = _src.size(), dsize = _dst.size();
03820     int k, cn = _src.channels();
03821     const AT* wtab = (const AT*)_wtab;
03822     const T* S0 = _src.ptr<T>();
03823     size_t sstep = _src.step/sizeof(S0[0]);
03824     T cval[CV_CN_MAX];
03825     int dx, dy;
03826     CastOp castOp;
03827     VecOp vecOp;
03828 
03829     for( k = 0; k < cn; k++ )
03830         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
03831 
03832     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
03833     CV_Assert( ssize.area() > 0 );
03834 #if CV_SSE2
03835     if( _src.type() == CV_8UC3 )
03836         width1 = std::max(ssize.width-2, 0);
03837 #endif
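    // Editor's note (an inference, not stated in the source): CV_8UC3 trims
    // width1 by one extra pixel because the SSE2 RemapVec_8u above fetches
    // 4 bytes at a time from 3-byte pixels, so the rightmost "inlier" pixel
    // must leave one spare byte at the end of the row.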
03838 
03839     for( dy = 0; dy < dsize.height; dy++ )
03840     {
03841         T* D = _dst.ptr<T>(dy);
03842         const short* XY = _xy.ptr<short>(dy);
03843         const ushort* FXY = _fxy.ptr<ushort>(dy);
03844         int X0 = 0;
03845         bool prevInlier = false;
03846 
03847         for( dx = 0; dx <= dsize.width; dx++ )
03848         {
03849             bool curInlier = dx < dsize.width ?
03850                 (unsigned)XY[dx*2] < width1 &&
03851                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
03852             if( curInlier == prevInlier )
03853                 continue;
03854 
03855             int X1 = dx;
03856             dx = X0;
03857             X0 = X1;
03858             prevInlier = curInlier;
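            // [Editor's note] This scan splits the row into maximal runs of
            // in-bounds vs. out-of-bounds pixels: after the swap, [dx, X1) is
            // the run that just ended; !curInlier means it was an interior run
            // (fast path below), otherwise it needs per-pixel border handling.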
03859 
03860             if( !curInlier )
03861             {
03862                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
03863                 D += len*cn;
03864                 dx += len;
03865 
03866                 if( cn == 1 )
03867                 {
03868                     for( ; dx < X1; dx++, D++ )
03869                     {
03870                         int sx = XY[dx*2], sy = XY[dx*2+1];
03871                         const AT* w = wtab + FXY[dx]*4;
03872                         const T* S = S0 + sy*sstep + sx;
03873                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
03874                     }
03875                 }
03876                 else if( cn == 2 )
03877                     for( ; dx < X1; dx++, D += 2 )
03878                     {
03879                         int sx = XY[dx*2], sy = XY[dx*2+1];
03880                         const AT* w = wtab + FXY[dx]*4;
03881                         const T* S = S0 + sy*sstep + sx*2;
03882                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
03883                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
03884                         D[0] = castOp(t0); D[1] = castOp(t1);
03885                     }
03886                 else if( cn == 3 )
03887                     for( ; dx < X1; dx++, D += 3 )
03888                     {
03889                         int sx = XY[dx*2], sy = XY[dx*2+1];
03890                         const AT* w = wtab + FXY[dx]*4;
03891                         const T* S = S0 + sy*sstep + sx*3;
03892                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
03893                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
03894                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
03895                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
03896                     }
03897                 else if( cn == 4 )
03898                     for( ; dx < X1; dx++, D += 4 )
03899                     {
03900                         int sx = XY[dx*2], sy = XY[dx*2+1];
03901                         const AT* w = wtab + FXY[dx]*4;
03902                         const T* S = S0 + sy*sstep + sx*4;
03903                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
03904                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
03905                         D[0] = castOp(t0); D[1] = castOp(t1);
03906                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
03907                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
03908                         D[2] = castOp(t0); D[3] = castOp(t1);
03909                     }
03910                 else
03911                     for( ; dx < X1; dx++, D += cn )
03912                     {
03913                         int sx = XY[dx*2], sy = XY[dx*2+1];
03914                         const AT* w = wtab + FXY[dx]*4;
03915                         const T* S = S0 + sy*sstep + sx*cn;
03916                         for( k = 0; k < cn; k++ )
03917                         {
03918                             WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
03919                             D[k] = castOp(t0);
03920                         }
03921                     }
03922             }
03923             else
03924             {
03925                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
03926                 {
03927                     D += (X1 - dx)*cn;
03928                     dx = X1;
03929                     continue;
03930                 }
03931 
03932                 if( cn == 1 )
03933                     for( ; dx < X1; dx++, D++ )
03934                     {
03935                         int sx = XY[dx*2], sy = XY[dx*2+1];
03936                         if( borderType == BORDER_CONSTANT &&
03937                             (sx >= ssize.width || sx+1 < 0 ||
03938                              sy >= ssize.height || sy+1 < 0) )
03939                         {
03940                             D[0] = cval[0];
03941                         }
03942                         else
03943                         {
03944                             int sx0, sx1, sy0, sy1;
03945                             T v0, v1, v2, v3;
03946                             const AT* w = wtab + FXY[dx]*4;
03947                             if( borderType == BORDER_REPLICATE )
03948                             {
03949                                 sx0 = clip(sx, 0, ssize.width);
03950                                 sx1 = clip(sx+1, 0, ssize.width);
03951                                 sy0 = clip(sy, 0, ssize.height);
03952                                 sy1 = clip(sy+1, 0, ssize.height);
03953                                 v0 = S0[sy0*sstep + sx0];
03954                                 v1 = S0[sy0*sstep + sx1];
03955                                 v2 = S0[sy1*sstep + sx0];
03956                                 v3 = S0[sy1*sstep + sx1];
03957                             }
03958                             else
03959                             {
03960                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
03961                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
03962                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
03963                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
03964                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
03965                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
03966                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
03967                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
03968                             }
03969                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
03970                         }
03971                     }
03972                 else
03973                     for( ; dx < X1; dx++, D += cn )
03974                     {
03975                         int sx = XY[dx*2], sy = XY[dx*2+1];
03976                         if( borderType == BORDER_CONSTANT &&
03977                             (sx >= ssize.width || sx+1 < 0 ||
03978                              sy >= ssize.height || sy+1 < 0) )
03979                         {
03980                             for( k = 0; k < cn; k++ )
03981                                 D[k] = cval[k];
03982                         }
03983                         else
03984                         {
03985                             int sx0, sx1, sy0, sy1;
03986                             const T *v0, *v1, *v2, *v3;
03987                             const AT* w = wtab + FXY[dx]*4;
03988                             if( borderType == BORDER_REPLICATE )
03989                             {
03990                                 sx0 = clip(sx, 0, ssize.width);
03991                                 sx1 = clip(sx+1, 0, ssize.width);
03992                                 sy0 = clip(sy, 0, ssize.height);
03993                                 sy1 = clip(sy+1, 0, ssize.height);
03994                                 v0 = S0 + sy0*sstep + sx0*cn;
03995                                 v1 = S0 + sy0*sstep + sx1*cn;
03996                                 v2 = S0 + sy1*sstep + sx0*cn;
03997                                 v3 = S0 + sy1*sstep + sx1*cn;
03998                             }
03999                             else if( borderType == BORDER_TRANSPARENT &&
04000                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
04001                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
04002                                 continue;
04003                             else
04004                             {
04005                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
04006                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
04007                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
04008                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
04009                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
04010                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
04011                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
04012                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
04013                             }
04014                             for( k = 0; k < cn; k++ )
04015                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
04016                         }
04017                     }
04018             }
04019         }
04020     }
04021 }
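// [Editor's note] A minimal scalar sketch of the per-pixel arithmetic used by
// remapBilinear() above, assuming a single-channel float image; the four
// weights come from the table entry indexed by the fractional code FXY[dx]*4
// and sum to 1 (or to INTER_REMAP_COEF_SCALE in the fixed-point 8-bit path).
#if 0   // illustrative only, not part of the library
static float bilinearAt( const float* S0, size_t sstep, int sx, int sy,
                         const float w[4] )
{
    const float* S = S0 + sy*sstep + sx;        // top-left of the 2x2 block
    return S[0]*w[0] + S[1]*w[1]                // top row
         + S[sstep]*w[2] + S[sstep+1]*w[3];     // bottom row
}
#endif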
04022 
04023 
04024 template<class CastOp, typename AT, int ONE>
04025 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
04026                           const Mat& _fxy, const void* _wtab,
04027                           int borderType, const Scalar& _borderValue )
04028 {
04029     typedef typename CastOp::rtype T;
04030     typedef typename CastOp::type1 WT;
04031     Size ssize = _src.size(), dsize = _dst.size();
04032     int cn = _src.channels();
04033     const AT* wtab = (const AT*)_wtab;
04034     const T* S0 = _src.ptr<T>();
04035     size_t sstep = _src.step/sizeof(S0[0]);
04036     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
04037         saturate_cast<T>(_borderValue[1]),
04038         saturate_cast<T>(_borderValue[2]),
04039         saturate_cast<T>(_borderValue[3]));
04040     int dx, dy;
04041     CastOp castOp;
04042     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
04043 
04044     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
04045 
04046     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
04047     {
04048         dsize.width *= dsize.height;
04049         dsize.height = 1;
04050     }
04051 
04052     for( dy = 0; dy < dsize.height; dy++ )
04053     {
04054         T* D = _dst.ptr<T>(dy);
04055         const short* XY = _xy.ptr<short>(dy);
04056         const ushort* FXY = _fxy.ptr<ushort>(dy);
04057 
04058         for( dx = 0; dx < dsize.width; dx++, D += cn )
04059         {
04060             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
04061             const AT* w = wtab + FXY[dx]*16;
04062             int i, k;
04063             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
04064             {
04065                 const T* S = S0 + sy*sstep + sx*cn;
04066                 for( k = 0; k < cn; k++ )
04067                 {
04068                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
04069                     S += sstep;
04070                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
04071                     S += sstep;
04072                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
04073                     S += sstep;
04074                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
04075                     S += 1 - sstep*3;
04076                     D[k] = castOp(sum);
04077                 }
04078             }
04079             else
04080             {
04081                 int x[4], y[4];
04082                 if( borderType == BORDER_TRANSPARENT &&
04083                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
04084                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
04085                     continue;
04086 
04087                 if( borderType1 == BORDER_CONSTANT &&
04088                     (sx >= ssize.width || sx+4 <= 0 ||
04089                     sy >= ssize.height || sy+4 <= 0))
04090                 {
04091                     for( k = 0; k < cn; k++ )
04092                         D[k] = cval[k];
04093                     continue;
04094                 }
04095 
04096                 for( i = 0; i < 4; i++ )
04097                 {
04098                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
04099                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
04100                 }
04101 
04102                 for( k = 0; k < cn; k++, S0++, w -= 16 )
04103                 {
04104                     WT cv = cval[k], sum = cv*ONE;
04105                     for( i = 0; i < 4; i++, w += 4 )
04106                     {
04107                         int yi = y[i];
04108                         const T* S = S0 + yi*sstep;
04109                         if( yi < 0 )
04110                             continue;
04111                         if( x[0] >= 0 )
04112                             sum += (S[x[0]] - cv)*w[0];
04113                         if( x[1] >= 0 )
04114                             sum += (S[x[1]] - cv)*w[1];
04115                         if( x[2] >= 0 )
04116                             sum += (S[x[2]] - cv)*w[2];
04117                         if( x[3] >= 0 )
04118                             sum += (S[x[3]] - cv)*w[3];
04119                     }
04120                     D[k] = castOp(sum);
04121                 }
04122                 S0 -= cn;
04123             }
04124         }
04125     }
04126 }
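// [Editor's note] In the border branch of remapBicubic() above, the sum is
// seeded with cv*ONE and each available tap adds (S[x]-cv)*w. Because the 16
// weights sum to ONE, this equals sum(w_i*S_i) when every tap exists, while
// taps skipped at the border implicitly contribute the constant value cv --
// no per-tap bookkeeping of the accumulated weight is needed.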
04127 
04128 
04129 template<class CastOp, typename AT, int ONE>
04130 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
04131                            const Mat& _fxy, const void* _wtab,
04132                            int borderType, const Scalar& _borderValue )
04133 {
04134     typedef typename CastOp::rtype T;
04135     typedef typename CastOp::type1 WT;
04136     Size ssize = _src.size(), dsize = _dst.size();
04137     int cn = _src.channels();
04138     const AT* wtab = (const AT*)_wtab;
04139     const T* S0 = _src.ptr<T>();
04140     size_t sstep = _src.step/sizeof(S0[0]);
04141     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
04142         saturate_cast<T>(_borderValue[1]),
04143         saturate_cast<T>(_borderValue[2]),
04144         saturate_cast<T>(_borderValue[3]));
04145     int dx, dy;
04146     CastOp castOp;
04147     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
04148 
04149     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
04150 
04151     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
04152     {
04153         dsize.width *= dsize.height;
04154         dsize.height = 1;
04155     }
04156 
04157     for( dy = 0; dy < dsize.height; dy++ )
04158     {
04159         T* D = _dst.ptr<T>(dy);
04160         const short* XY = _xy.ptr<short>(dy);
04161         const ushort* FXY = _fxy.ptr<ushort>(dy);
04162 
04163         for( dx = 0; dx < dsize.width; dx++, D += cn )
04164         {
04165             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
04166             const AT* w = wtab + FXY[dx]*64;
04167             const T* S = S0 + sy*sstep + sx*cn;
04168             int i, k;
04169             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
04170             {
04171                 for( k = 0; k < cn; k++ )
04172                 {
04173                     WT sum = 0;
04174                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
04175                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
04176                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
04177                     w -= 64;
04178                     S -= sstep*8 - 1;
04179                     D[k] = castOp(sum);
04180                 }
04181             }
04182             else
04183             {
04184                 int x[8], y[8];
04185                 if( borderType == BORDER_TRANSPARENT &&
04186                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
04187                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
04188                     continue;
04189 
04190                 if( borderType1 == BORDER_CONSTANT &&
04191                     (sx >= ssize.width || sx+8 <= 0 ||
04192                     sy >= ssize.height || sy+8 <= 0))
04193                 {
04194                     for( k = 0; k < cn; k++ )
04195                         D[k] = cval[k];
04196                     continue;
04197                 }
04198 
04199                 for( i = 0; i < 8; i++ )
04200                 {
04201                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
04202                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
04203                 }
04204 
04205                 for( k = 0; k < cn; k++, S0++, w -= 64 )
04206                 {
04207                     WT cv = cval[k], sum = cv*ONE;
04208                     for( i = 0; i < 8; i++, w += 8 )
04209                     {
04210                         int yi = y[i];
04211                         const T* S1 = S0 + yi*sstep;
04212                         if( yi < 0 )
04213                             continue;
04214                         if( x[0] >= 0 )
04215                             sum += (S1[x[0]] - cv)*w[0];
04216                         if( x[1] >= 0 )
04217                             sum += (S1[x[1]] - cv)*w[1];
04218                         if( x[2] >= 0 )
04219                             sum += (S1[x[2]] - cv)*w[2];
04220                         if( x[3] >= 0 )
04221                             sum += (S1[x[3]] - cv)*w[3];
04222                         if( x[4] >= 0 )
04223                             sum += (S1[x[4]] - cv)*w[4];
04224                         if( x[5] >= 0 )
04225                             sum += (S1[x[5]] - cv)*w[5];
04226                         if( x[6] >= 0 )
04227                             sum += (S1[x[6]] - cv)*w[6];
04228                         if( x[7] >= 0 )
04229                             sum += (S1[x[7]] - cv)*w[7];
04230                     }
04231                     D[k] = castOp(sum);
04232                 }
04233                 S0 -= cn;
04234             }
04235         }
04236     }
04237 }
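// [Editor's note] remapLanczos4() is the 8x8 analogue of remapBicubic(): the
// window origin is (XY[0]-3, XY[1]-3), each fractional position owns 64
// weights (FXY[dx]*64), and the same (S-cv)*w seeding trick handles borders.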
04238 
04239 
04240 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
04241                             int borderType, const Scalar& _borderValue );
04242 
04243 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
04244                           const Mat& _fxy, const void* _wtab,
04245                           int borderType, const Scalar& _borderValue);
04246 
04247 class RemapInvoker :
04248     public ParallelLoopBody
04249 {
04250 public:
04251     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
04252                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
04253                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
04254         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
04255         borderType(_borderType), borderValue(_borderValue),
04256         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
04257     {
04258     }
04259 
04260     virtual void operator() (const Range& range) const
04261     {
04262         int x, y, x1, y1;
04263         const int buf_size = 1 << 14;
04264         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
04265         int bcols0 = std::min(buf_size/brows0, dst->cols);
04266         brows0 = std::min(buf_size/bcols0, dst->rows);
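        // [Editor's note] The tile is sized so that brows0*bcols0 stays near
        // buf_size (1 << 14) map entries, bounding the per-invocation scratch
        // buffers _bufxy/_bufa independently of the destination size.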
04267     #if CV_SSE2
04268         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
04269     #endif
04270 
04271         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
04272         if( !nnfunc )
04273             _bufa.create(brows0, bcols0, CV_16UC1);
04274 
04275         for( y = range.start; y < range.end; y += brows0 )
04276         {
04277             for( x = 0; x < dst->cols; x += bcols0 )
04278             {
04279                 int brows = std::min(brows0, range.end - y);
04280                 int bcols = std::min(bcols0, dst->cols - x);
04281                 Mat dpart(*dst, Rect(x, y, bcols, brows));
04282                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
04283 
04284                 if( nnfunc )
04285                 {
04286                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
04287                         bufxy = (*m1)(Rect(x, y, bcols, brows));
04288                     else if( map_depth != CV_32F )
04289                     {
04290                         for( y1 = 0; y1 < brows; y1++ )
04291                         {
04292                             short* XY = bufxy.ptr<short>(y1);
04293                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
04294                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
04295 
04296                             for( x1 = 0; x1 < bcols; x1++ )
04297                             {
04298                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
04299                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
04300                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
04301                             }
04302                         }
04303                     }
04304                     else if( !planar_input )
04305                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
04306                     else
04307                     {
04308                         for( y1 = 0; y1 < brows; y1++ )
04309                         {
04310                             short* XY = bufxy.ptr<short>(y1);
04311                             const float* sX = m1->ptr<float>(y+y1) + x;
04312                             const float* sY = m2->ptr<float>(y+y1) + x;
04313                             x1 = 0;
04314 
04315                         #if CV_SSE2
04316                             if( useSIMD )
04317                             {
04318                                 for( ; x1 <= bcols - 8; x1 += 8 )
04319                                 {
04320                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
04321                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
04322                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
04323                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
04324                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
04325                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
04326                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
04327                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
04328                                     ix0 = _mm_packs_epi32(ix0, ix1);
04329                                     iy0 = _mm_packs_epi32(iy0, iy1);
04330                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
04331                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
04332                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
04333                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
04334                                 }
04335                             }
04336                         #endif
04337 
04338                             for( ; x1 < bcols; x1++ )
04339                             {
04340                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
04341                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
04342                             }
04343                         }
04344                     }
04345                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
04346                     continue;
04347                 }
04348 
04349                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
04350                 for( y1 = 0; y1 < brows; y1++ )
04351                 {
04352                     short* XY = bufxy.ptr<short>(y1);
04353                     ushort* A = bufa.ptr<ushort>(y1);
04354 
04355                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
04356                     {
04357                         bufxy = (*m1)(Rect(x, y, bcols, brows));
04358 
04359                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
04360                         x1 = 0;
04361 
04362                     #if CV_NEON
04363                         uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
04364                         for ( ; x1 <= bcols - 8; x1 += 8)
04365                             vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
04366                     #elif CV_SSE2
04367                         __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
04368                         for ( ; x1 <= bcols - 8; x1 += 8)
04369                             _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
04370                     #endif
04371 
04372                         for( ; x1 < bcols; x1++ )
04373                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
04374                     }
04375                     else if( planar_input )
04376                     {
04377                         const float* sX = m1->ptr<float>(y+y1) + x;
04378                         const float* sY = m2->ptr<float>(y+y1) + x;
04379 
04380                         x1 = 0;
04381                     #if CV_SSE2
04382                         if( useSIMD )
04383                         {
04384                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
04385                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
04386                             for( ; x1 <= bcols - 8; x1 += 8 )
04387                             {
04388                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
04389                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
04390                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
04391                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
04392                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
04393                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
04394                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
04395                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
04396                                 __m128i mx0 = _mm_and_si128(ix0, mask);
04397                                 __m128i mx1 = _mm_and_si128(ix1, mask);
04398                                 __m128i my0 = _mm_and_si128(iy0, mask);
04399                                 __m128i my1 = _mm_and_si128(iy1, mask);
04400                                 mx0 = _mm_packs_epi32(mx0, mx1);
04401                                 my0 = _mm_packs_epi32(my0, my1);
04402                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
04403                                 mx0 = _mm_or_si128(mx0, my0);
04404                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
04405                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
04406                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
04407                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
04408                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
04409                                 ix0 = _mm_packs_epi32(ix0, ix1);
04410                                 iy0 = _mm_packs_epi32(iy0, iy1);
04411                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
04412                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
04413                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
04414                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
04415                             }
04416                         }
04417                     #elif CV_NEON
04418                         float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
04419                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
04420 
04421                         for( ; x1 <= bcols - 4; x1 += 4 )
04422                         {
04423                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
04424                                       v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
04425                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
04426                                                       vandq_s32(v_sy, v_scale2));
04427                             vst1_u16(A + x1, vqmovun_s32(v_v));
04428 
04429                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
04430                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
04431                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
04432                         }
04433                     #endif
04434 
04435                         for( ; x1 < bcols; x1++ )
04436                         {
04437                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
04438                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
04439                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
04440                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
04441                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
04442                             A[x1] = (ushort)v;
04443                         }
04444                     }
04445                     else
04446                     {
04447                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
04448                         x1 = 0;
04449 
04450                     #if CV_NEON
04451                         float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
04452                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
04453 
04454                         for( ; x1 <= bcols - 4; x1 += 4 )
04455                         {
04456                             float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
04457                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
04458                             int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
04459                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
04460                                                       vandq_s32(v_sy, v_scale2));
04461                             vst1_u16(A + x1, vqmovun_s32(v_v));
04462 
04463                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
04464                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
04465                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
04466                         }
04467                     #endif
04468 
04469                         for( ; x1 < bcols; x1++ )
04470                         {
04471                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
04472                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
04473                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
04474                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
04475                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
04476                             A[x1] = (ushort)v;
04477                         }
04478                     }
04479                 }
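                // [Editor's note] At this point bufxy holds the integer
                // top-left source coordinates (CV_16SC2) and bufa the
                // per-pixel weight-table index (fy*INTER_TAB_SIZE + fx)
                // consumed by the interpolation function below.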
04480                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
04481             }
04482         }
04483     }
04484 
04485 private:
04486     const Mat* src;
04487     Mat* dst;
04488     const Mat *m1, *m2;
04489     int borderType;
04490     Scalar borderValue;
04491     int planar_input;
04492     RemapNNFunc nnfunc;
04493     RemapFunc ifunc;
04494     const void *ctab;
04495 };
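// [Editor's note] A sketch of how RemapInvoker is driven, mirroring the call
// in cv::remap() further down: destination rows are split across threads and
// each stripe is remapped independently.
#if 0   // illustrative only
RemapInvoker invoker(src, dst, m1, m2, borderType, borderValue,
                     planar_input, nnfunc, ifunc, ctab);
parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1 << 16));
#endif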
04496 
04497 #ifdef HAVE_OPENCL
04498 
04499 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
04500                       int interpolation, int borderType, const Scalar& borderValue)
04501 {
04502     const ocl::Device & dev = ocl::Device::getDefault();
04503     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
04504             rowsPerWI = dev.isIntel() ? 4 : 1;
04505 
04506     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
04507             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
04508         return false;
04509 
04510     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
04511 
04512     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
04513         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
04514     {
04515         if (map1.type() != CV_16SC2)
04516             std::swap(map1, map2);
04517     }
04518     else
04519         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
04520 
04521     _dst.create(map1.size(), type);
04522     UMat dst = _dst.getUMat();
04523 
04524     String kernelName = "remap";
04525     if (map1.type() == CV_32FC2 && map2.empty())
04526         kernelName += "_32FC2";
04527     else if (map1.type() == CV_16SC2)
04528     {
04529         kernelName += "_16SC2";
04530         if (!map2.empty())
04531             kernelName += "_16UC1";
04532     }
04533     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
04534         kernelName += "_2_32FC1";
04535     else
04536         CV_Error(Error::StsBadArg, "Unsupported map types");
04537 
04538     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
04539     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
04540                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
04541     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
04542                                  interMap[interpolation], borderMap[borderType],
04543                                  ocl::typeToStr(type), rowsPerWI);
04544 
04545     if (interpolation != INTER_NEAREST)
04546     {
04547         char cvt[3][40];
04548         int wdepth = std::max(CV_32F, depth);
04549         buildOptions = buildOptions
04550                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
04551                                " -D convertToWT2=%s -D WT2=%s",
04552                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
04553                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
04554                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
04555                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
04556                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
04557     }
04558     int scalarcn = cn == 3 ? 4 : cn;
04559     int sctype = CV_MAKETYPE(depth, scalarcn);
04560     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
04561                            ocl::typeToStr(type), ocl::typeToStr(depth),
04562                            cn, ocl::typeToStr(sctype), depth);
04563 
04564     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
04565 
04566     Mat scalar(1, 1, sctype, borderValue);
04567     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
04568             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
04569             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
04570 
04571     if (map2.empty())
04572         k.args(srcarg, dstarg, map1arg, scalararg);
04573     else
04574         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
04575 
04576     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
04577     return k.run(2, globalThreads, NULL, false);
04578 }
04579 
04580 #endif
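// [Editor's note] The OpenCL branch above is only attempted from cv::remap()
// when the output is a UMat (see the CV_OCL_RUN guard below); a hedged usage
// sketch, assuming src/map1/map2 are valid Mat objects defined elsewhere:
#if 0   // illustrative only
cv::UMat usrc = src.getUMat(cv::ACCESS_READ), udst;
cv::remap(usrc, udst, map1, map2, cv::INTER_LINEAR,
          cv::BORDER_CONSTANT, cv::Scalar());   // may dispatch to ocl_remap
#endif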
04581 
04582 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && IPP_DISABLE_BLOCK
04583 
04584 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
04585                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
04586                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
04587 
04588 class IPPRemapInvoker :
04589         public ParallelLoopBody
04590 {
04591 public:
04592     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
04593                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
04594         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
04595         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
04596     {
04597         *ok = true;
04598     }
04599 
04600     virtual void operator() (const Range & range) const
04601     {
04602         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
04603         Mat dstRoi = dst.rowRange(range);
04604         IppiSize dstRoiSize = ippiSize(dstRoi.size());
04605         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
04606 
04607         if (borderType == BORDER_CONSTANT &&
04608                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
04609         {
04610             *ok = false;
04611             return;
04612         }
04613 
04614         if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
04615                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
04616                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
04617             *ok = false;
04618         else
04619         {
04620             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
04621         }
04622     }
04623 
04624 private:
04625     Mat & src, & dst, & map1, & map2;
04626     ippiRemap ippFunc;
04627     int ippInterpolation, borderType;
04628     Scalar borderValue;
04629     bool * ok;
04630 };
04631 
04632 #endif
04633 
04634 }
04635 
04636 void cv::remap( InputArray _src, OutputArray _dst,
04637                 InputArray _map1, InputArray _map2,
04638                 int interpolation, int borderType, const Scalar & borderValue )
04639 {
04640     static RemapNNFunc nn_tab[] =
04641     {
04642         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
04643         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
04644     };
04645 
04646     static RemapFunc linear_tab[] =
04647     {
04648         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
04649         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
04650         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
04651         remapBilinear<Cast<float, float>, RemapNoVec, float>,
04652         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
04653     };
04654 
04655     static RemapFunc cubic_tab[] =
04656     {
04657         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
04658         remapBicubic<Cast<float, ushort>, float, 1>,
04659         remapBicubic<Cast<float, short>, float, 1>, 0,
04660         remapBicubic<Cast<float, float>, float, 1>,
04661         remapBicubic<Cast<double, double>, float, 1>, 0
04662     };
04663 
04664     static RemapFunc lanczos4_tab[] =
04665     {
04666         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
04667         remapLanczos4<Cast<float, ushort>, float, 1>,
04668         remapLanczos4<Cast<float, short>, float, 1>, 0,
04669         remapLanczos4<Cast<float, float>, float, 1>,
04670         remapLanczos4<Cast<double, double>, float, 1>, 0
04671     };
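    // [Editor's note] Each table above is indexed by image depth (CV_8U,
    // CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, ...); null entries mark
    // unsupported depths, which the CV_Assert checks further down reject.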
04672 
04673     CV_Assert( _map1.size().area() > 0 );
04674     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
04675 
04676 #ifdef HAVE_OPENCL
04677     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
04678                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
04679 #endif
04680 
04681     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
04682     _dst.create( map1.size(), src.type() );
04683     Mat dst = _dst.getMat();
04684     if( dst.data == src.data )
04685         src = src.clone();
04686 
04687     if( interpolation == INTER_AREA )
04688         interpolation = INTER_LINEAR;
04689 
04690     int type = src.type(), depth = CV_MAT_DEPTH(type);
04691 
04692 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && IPP_DISABLE_BLOCK
04693     CV_IPP_CHECK()
04694     {
04695         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
04696                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
04697                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
04698         {
04699             int ippInterpolation =
04700                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
04701                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
04702 
04703             ippiRemap ippFunc =
04704                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
04705                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
04706                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
04707                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
04708                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
04709                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
04710                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
04711                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
04712                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
04713 
04714             if (ippFunc)
04715             {
04716                 bool ok;
04717                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
04718                                         borderType, borderValue, &ok);
04719                 Range range(0, dst.rows);
04720                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
04721 
04722                 if (ok)
04723                 {
04724                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
04725                     return;
04726                 }
04727                 setIppErrorStatus();
04728             }
04729         }
04730     }
04731 #endif
04732 
04733     RemapNNFunc nnfunc = 0;
04734     RemapFunc ifunc = 0;
04735     const void* ctab = 0;
04736     bool fixpt = depth == CV_8U;
04737     bool planar_input = false;
04738 
04739     if( interpolation == INTER_NEAREST )
04740     {
04741         nnfunc = nn_tab[depth];
04742         CV_Assert( nnfunc != 0 );
04743     }
04744     else
04745     {
04746         if( interpolation == INTER_LINEAR )
04747             ifunc = linear_tab[depth];
04748         else if( interpolation == INTER_CUBIC )
04749             ifunc = cubic_tab[depth];
04750         else if( interpolation == INTER_LANCZOS4 )
04751             ifunc = lanczos4_tab[depth];
04752         else
04753             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
04754         CV_Assert( ifunc != 0 );
04755         ctab = initInterTab2D( interpolation, fixpt );
04756     }
04757 
04758     const Mat *m1 = &map1, *m2 = &map2;
04759 
04760     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
04761         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
04762     {
04763         if( map1.type() != CV_16SC2 )
04764             std::swap(m1, m2);
04765     }
04766     else
04767     {
04768         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
04769             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
04770         planar_input = map1.channels() == 1;
04771     }
04772 
04773     RemapInvoker invoker(src, dst, m1, m2,
04774                          borderType, borderValue, planar_input, nnfunc, ifunc,
04775                          ctab);
04776     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
04777 }
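// [Editor's note] A self-contained usage sketch of the function above; the
// helper name and maps are hypothetical. It builds a horizontal-flip map in
// the planar CV_32FC1 format and remaps with bilinear interpolation.
#if 0   // illustrative only
#include <opencv2/imgproc.hpp>

void flipWithRemap( const cv::Mat& src, cv::Mat& dst )
{
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for( int y = 0; y < src.rows; y++ )
        for( int x = 0; x < src.cols; x++ )
        {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x);  // mirror x
            mapy.at<float>(y, x) = (float)y;                   // keep y
        }
    cv::remap(src, dst, mapx, mapy, cv::INTER_LINEAR,
              cv::BORDER_CONSTANT, cv::Scalar());
}
#endif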
04778 
04779 
04780 void cv::convertMaps( InputArray _map1, InputArray _map2,
04781                       OutputArray _dstmap1, OutputArray _dstmap2,
04782                       int dstm1type, bool nninterpolate )
04783 {
04784     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
04785     Size size = map1.size();
04786     const Mat *m1 = &map1, *m2 = &map2;
04787     int m1type = m1->type(), m2type = m2->type();
04788 
04789     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
04790                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
04791                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
04792                (m1type == CV_32FC2 && m2->empty()) );
04793 
04794     if( m2type == CV_16SC2 )
04795     {
04796         std::swap( m1, m2 );
04797         std::swap( m1type, m2type );
04798     }
04799 
04800     if( dstm1type <= 0 )
04801         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
04802     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
04803     _dstmap1.create( size, dstm1type );
04804     dstmap1 = _dstmap1.getMat();
04805 
04806     if( !nninterpolate && dstm1type != CV_32FC2 )
04807     {
04808         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
04809         dstmap2 = _dstmap2.getMat();
04810     }
04811     else
04812         _dstmap2.release();
04813 
04814     if( m1type == dstm1type || (nninterpolate &&
04815         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
04816         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
04817     {
04818         m1->convertTo( dstmap1, dstmap1.type() );
04819         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
04820             m2->copyTo( dstmap2 );
04821         return;
04822     }
04823 
04824     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
04825     {
04826         Mat vdata[] = { *m1, *m2 };
04827         merge( vdata, 2, dstmap1 );
04828         return;
04829     }
04830 
04831     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
04832     {
04833         Mat mv[] = { dstmap1, dstmap2 };
04834         split( *m1, mv );
04835         return;
04836     }
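    // [Editor's note] The loops below handle the remaining conversions from
    // floating-point maps to the packed fixed-point form remap() prefers:
    // dstmap1 (CV_16SC2) receives the integer parts (coordinate >> INTER_BITS)
    // and dstmap2 (CV_16UC1) the combined fractional index
    // (iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)).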
04837 
04838     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
04839         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
04840     {
04841         size.width *= size.height;
04842         size.height = 1;
04843     }
04844 
04845 #if CV_SSE2
04846     bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
04847 #endif
04848 #if CV_SSE4_1
04849     bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
04850 #endif
04851 
04852     const float scale = 1.f/INTER_TAB_SIZE;
04853     int x, y;
04854     for( y = 0; y < size.height; y++ )
04855     {
04856         const float* src1f = m1->ptr<float>(y);
04857         const float* src2f = m2->ptr<float>(y);
04858         const short* src1 = (const short*)src1f;
04859         const ushort* src2 = (const ushort*)src2f;
04860 
04861         float* dst1f = dstmap1.ptr<float>(y);
04862         float* dst2f = dstmap2.ptr<float>(y);
04863         short* dst1 = (short*)dst1f;
04864         ushort* dst2 = (ushort*)dst2f;
04865         x = 0;
04866 
04867         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
04868         {
04869             if( nninterpolate )
04870             {
04871                 #if CV_NEON
04872                 for( ; x <= size.width - 8; x += 8 )
04873                 {
04874                     int16x8x2_t v_dst;
04875                     v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
04876                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
04877                     v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
04878                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
04879 
04880                     vst2q_s16(dst1 + (x << 1), v_dst);
04881                 }
04882                 #elif CV_SSE4_1
04883                 if (useSSE4_1)
04884                 {
04885                     for( ; x <= size.width - 16; x += 16 )
04886                     {
04887                         __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
04888                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
04889                         __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
04890                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
04891 
04892                         __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
04893                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
04894                         __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
04895                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
04896 
04897                         _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
04898 
04899                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
04900                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
04901                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
04902                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
04903                     }
04904                 }
04905                 #endif
04906                 for( ; x < size.width; x++ )
04907                 {
04908                     dst1[x*2] = saturate_cast<short>(src1f[x]);
04909                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
04910                 }
04911             }
04912             else
04913             {
04914                 #if CV_NEON
04915                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
04916                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
04917 
04918                 for( ; x <= size.width - 8; x += 8 )
04919                 {
04920                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
04921                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
04922                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
04923                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
04924 
04925                     int16x8x2_t v_dst;
04926                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
04927                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
04928                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
04929                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
04930 
04931                     vst2q_s16(dst1 + (x << 1), v_dst);
04932 
04933                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
04934                                                               vandq_s32(v_ix0, v_mask)));
04935                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
04936                                                               vandq_s32(v_ix1, v_mask)));
04937                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
04938                 }
04939                 #elif CV_SSE4_1
04940                 if (useSSE4_1)
04941                 {
04942                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
04943                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
04944 
04945                     for( ; x <= size.width - 16; x += 16 )
04946                     {
04947                         __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
04948                         __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
04949                         __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
04950                         __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
04951 
04952                         __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
04953                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
04954                         __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
04955                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
04956                         __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
04957                                                         _mm_and_si128(v_ix0, v_its1));
04958                         __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
04959                                                         _mm_and_si128(v_ix1, v_its1));
04960                         _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
04961 
04962                         v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
04963                         v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
04964                         v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
04965                         v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
04966 
04967                         __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
04968                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
04969                         __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
04970                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
04971                         v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
04972                                                 _mm_and_si128(v_ix0, v_its1));
04973                         v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
04974                                                 _mm_and_si128(v_ix1, v_its1));
04975                         _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
04976 
04977                         _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
04978 
04979                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
04980                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
04981                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
04982                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
04983                     }
04984                 }
04985                 #endif
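                // scalar tail: dst1 receives the integer parts of the coordinates,
                // dst2 the packed fractional index (fy*INTER_TAB_SIZE + fx) that
                // remap() uses to select an interpolation table entry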
04986                 for( ; x < size.width; x++ )
04987                 {
04988                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
04989                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
04990                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
04991                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
04992                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
04993                 }
04994             }
04995         }
04996         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
04997         {
04998             if( nninterpolate )
04999             {
05000                 #if CV_NEON
05001                 for( ; x <= (size.width << 1) - 8; x += 8 )
05002                     vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
05003                                                      vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
05004                 #elif CV_SSE2
05005                 for( ; x <= (size.width << 1) - 8; x += 8 )
05006                 {
05007                     _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
05008                                                                             _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
05009                 }
05010                 #endif
05011                 for( ; x < size.width; x++ )
05012                 {
05013                     dst1[x*2] = saturate_cast<short>(src1f[x*2]);
05014                     dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
05015                 }
05016             }
05017             else
05018             {
05019                 #if CV_NEON
05020                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
05021                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
05022 
05023                 for( ; x <= size.width - 8; x += 8 )
05024                 {
05025                     float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
05026                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
05027                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
05028                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
05029                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
05030 
05031                     int16x8x2_t v_dst;
05032                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
05033                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
05034                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
05035                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
05036 
05037                     vst2q_s16(dst1 + (x << 1), v_dst);
05038 
05039                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
05040                                                               vandq_s32(v_ix0, v_mask)));
05041                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
05042                                                               vandq_s32(v_ix1, v_mask)));
05043                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
05044                 }
05045                 #elif CV_SSE4_1
05046                 if (useSSE4_1)
05047                 {
05048                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
05049                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
05050                     __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
05051 
05052                     for( ; x <= size.width - 4; x += 4 )
05053                     {
05054                         __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
05055                         __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
05056 
05057                         __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
05058                                                          _mm_srai_epi32(v_src1, INTER_BITS));
05059                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
05060 
05061                         // x0 y0 x1 y1 . . .
05062                         v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
05063                                                  _mm_and_si128(v_src1, v_its1));
05064                         __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
05065                                                       _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
05066                         _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
05067                     }
05068                 }
05069                 #endif
05070                 for( ; x < size.width; x++ )
05071                 {
05072                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
05073                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
05074                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
05075                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
05076                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
05077                 }
05078             }
05079         }
05080         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
05081         {
05082             #if CV_NEON
05083             uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
05084             uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
05085             float32x4_t v_scale = vdupq_n_f32(scale);
05086 
05087             for( ; x <= size.width - 8; x += 8)
05088             {
05089                 uint32x4_t v_fxy1, v_fxy2;
05090                 if (src2)
05091                 {
05092                     uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
05093                     v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
05094                     v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
05095                 }
05096                 else
05097                     v_fxy1 = v_fxy2 = v_zero;
05098 
05099                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
05100                 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
05101                                                v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
05102                 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
05103                                                v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
05104                 vst1q_f32(dst1f + x, v_dst1);
05105                 vst1q_f32(dst2f + x, v_dst2);
05106 
05107                 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
05108                                    v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
05109                 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
05110                                    v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
05111                 vst1q_f32(dst1f + x + 4, v_dst1);
05112                 vst1q_f32(dst2f + x + 4, v_dst2);
05113             }
05114             #elif CV_SSE2
05115             __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
05116             __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
05117             __m128 v_scale = _mm_set1_ps(scale);
05118 
05119             for( ; x <= size.width - 16; x += 16)
05120             {
05121                 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
05122                 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
05123                 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
05124                 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
05125 
05126                 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
05127 
05128                 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
05129                 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
05130                 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
05131                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05132                 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
05133                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05134                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
05135                 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
05136                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05137                 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
05138                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05139 
05140                 v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
05141                 v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); // low half first: offsets for pixels x+8 .. x+11
05142                 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
05143                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05144                 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
05145                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05146                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
05147                 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
05148                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
05149                 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
05150                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
05151             }
05152             #endif
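            // scalar tail: unpack the fixed-point map back to floats - integer
            // coordinate plus the fractional part from the dst2 index, rescaled by 'scale'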
05153             for( ; x < size.width; x++ )
05154             {
05155                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
05156                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
05157                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
05158             }
05159         }
05160         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
05161         {
05162             #if CV_NEON
05163             int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
05164             int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
05165             float32x4_t v_scale = vdupq_n_f32(scale);
05166 
05167             for( ; x <= size.width - 8; x += 8)
05168             {
05169                 int32x4_t v_fxy1, v_fxy2;
05170                 if (src2)
05171                 {
05172                     int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
05173                     v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
05174                     v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
05175                 }
05176                 else
05177                     v_fxy1 = v_fxy2 = v_zero;
05178 
05179                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
05180                 float32x4x2_t v_dst;
05181                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
05182                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
05183                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
05184                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
05185                 vst2q_f32(dst1f + (x << 1), v_dst);
05186 
05187                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
05188                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
05189                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
05190                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
05191                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
05192             }
05193             #elif CV_SSE2
05194             if (useSSE2)
05195             {
05196                 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
05197                 __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi16(INTER_TAB_SIZE-1);
05198                 __m128 v_scale = _mm_set1_ps(scale);
05199 
05200                 for ( ; x <= size.width - 4; x += 4)
05201                 {
05202                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
05203                     __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadl_epi64((__m128i const *)(src2 + x)), v_mask2) : v_zero;
05204                     // interleave x (low INTER_BITS) and y parts of the packed offset: fx0 fy0 fx1 fy1 ...
05205                     __m128i v_fxy_i = _mm_unpacklo_epi16(_mm_and_si128(v_fxy, v_mask), _mm_srli_epi16(v_fxy, INTER_BITS));
05206 
05207                     __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy_i, v_zero)), v_scale);
05208                     _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)), v_add));
05209 
05210                     v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy_i, v_zero)), v_scale);
05211                     _mm_storeu_ps(dst1f + x * 2 + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)), v_add));
05212                 }
05213             }
05214             #endif
05215             for( ; x < size.width; x++ )
05216             {
05217                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
05218                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
05219                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
05220             }
05221         }
05222         else
05223             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
05224     }
05225 }
05226 
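/* Usage sketch (illustrative, not part of the library source): convertMaps(),
   whose type-pair branches end above, repacks float maps into the fixed-point
   CV_16SC2 + CV_16UC1 form that remap() consumes fastest. The file name below
   is hypothetical.

   @code
   #include <opencv2/imgproc.hpp>
   #include <opencv2/imgcodecs.hpp>

   cv::Mat src = cv::imread("input.png");
   cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
   for( int y = 0; y < src.rows; y++ )        // example map: horizontal flip
       for( int x = 0; x < src.cols; x++ )
       {
           mapx.at<float>(y, x) = (float)(src.cols - 1 - x);
           mapy.at<float>(y, x) = (float)y;
       }

   cv::Mat fixedXY, fixedFrac, dst;
   cv::convertMaps(mapx, mapy, fixedXY, fixedFrac, CV_16SC2, false);
   cv::remap(src, dst, fixedXY, fixedFrac, cv::INTER_LINEAR);
   @endcode
*/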
05227 
05228 namespace cv
05229 {
05230 
05231 class WarpAffineInvoker :
05232     public ParallelLoopBody
05233 {
05234 public:
05235     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
05236                       const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
05237         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
05238         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
05239         M(_M)
05240     {
05241     }
05242 
05243     virtual void operator() (const Range& range) const
05244     {
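        // The destination is split into tiles of at most BLOCK_SZ*BLOCK_SZ pixels;
        // per tile the coordinate map is built in AB_BITS fixed point (adelta/bdelta
        // hold the precomputed x-dependent terms M[0]*x and M[3]*x), then remap()
        // performs the actual sampling.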
05245         const int BLOCK_SZ = 64;
05246         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
05247         const int AB_BITS = MAX(10, (int)INTER_BITS);
05248         const int AB_SCALE = 1 << AB_BITS;
05249         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
05250     #if CV_SSE2
05251         bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
05252     #endif
05253     #if CV_SSE4_1
05254         bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
05255     #endif
05256 
05257         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
05258         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
05259         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
05260 
05261         for( y = range.start; y < range.end; y += bh0 )
05262         {
05263             for( x = 0; x < dst.cols; x += bw0 )
05264             {
05265                 int bw = std::min( bw0, dst.cols - x);
05266                 int bh = std::min( bh0, range.end - y);
05267 
05268                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
05269                 Mat dpart(dst, Rect(x, y, bw, bh));
05270 
05271                 for( y1 = 0; y1 < bh; y1++ )
05272                 {
05273                     short* xy = XY + y1*bw*2;
05274                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
05275                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
05276 
05277                     if( interpolation == INTER_NEAREST )
05278                     {
05279                         x1 = 0;
05280                         #if CV_NEON
05281                         int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
05282                         for( ; x1 <= bw - 8; x1 += 8 )
05283                         {
05284                             int16x8x2_t v_dst;
05285                             v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
05286                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
05287                             v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
05288                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
05289 
05290                             vst2q_s16(xy + (x1 << 1), v_dst);
05291                         }
05292                         #elif CV_SSE4_1
05293                         if (useSSE4_1)
05294                         {
05295                             __m128i v_X0 = _mm_set1_epi32(X0);
05296                             __m128i v_Y0 = _mm_set1_epi32(Y0);
05297                             for ( ; x1 <= bw - 16; x1 += 16)
05298                             {
05299                                 __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
05300                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
05301                                 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
05302                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
05303 
05304                                 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
05305                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
05306                                 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
05307                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
05308 
05309                                 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
05310 
05311                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
05312                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
05313                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
05314                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
05315                             }
05316                         }
05317                         #endif
05318                         for( ; x1 < bw; x1++ )
05319                         {
05320                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
05321                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
05322                             xy[x1*2] = saturate_cast<short>(X);
05323                             xy[x1*2+1] = saturate_cast<short>(Y);
05324                         }
05325                     }
05326                     else
05327                     {
05328                         short* alpha = A + y1*bw;
05329                         x1 = 0;
05330                     #if CV_SSE2
05331                         if( useSSE2 )
05332                         {
05333                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
05334                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
05335                             for( ; x1 <= bw - 8; x1 += 8 )
05336                             {
05337                                 __m128i tx0, tx1, ty0, ty1;
05338                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
05339                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
05340                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
05341                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
05342 
05343                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
05344                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
05345                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
05346                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
05347 
05348                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
05349                                                             _mm_and_si128(tx1, fxy_mask));
05350                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
05351                                                             _mm_and_si128(ty1, fxy_mask));
05352                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
05353                                                             _mm_srai_epi32(tx1, INTER_BITS));
05354                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
05355                                                     _mm_srai_epi32(ty1, INTER_BITS));
05356                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
05357 
05358                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
05359                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
05360                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
05361                             }
05362                         }
05363                     #elif CV_NEON
05364                         int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
05365                         for( ; x1 <= bw - 8; x1 += 8 )
05366                         {
05367                             int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
05368                             int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
05369                             int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
05370                             int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
05371 
05372                             int16x8x2_t v_xy;
05373                             v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
05374                             v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
05375 
05376                             vst2q_s16(xy + (x1 << 1), v_xy);
05377 
05378                             int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
05379                                                                      vandq_s32(v_X0, v_mask)));
05380                             int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
05381                                                                      vandq_s32(v_X1, v_mask)));
05382                             vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
05383                         }
05384                     #endif
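                        // scalar tail: alpha packs the fractional parts as
                        // (fy*INTER_TAB_SIZE + fx), the same layout convertMaps() produces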
05385                         for( ; x1 < bw; x1++ )
05386                         {
05387                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
05388                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
05389                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
05390                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
05391                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
05392                                     (X & (INTER_TAB_SIZE-1)));
05393                         }
05394                     }
05395                 }
05396 
05397                 if( interpolation == INTER_NEAREST )
05398                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
05399                 else
05400                 {
05401                     Mat _matA(bh, bw, CV_16U, A);
05402                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
05403                 }
05404             }
05405         }
05406     }
05407 
05408 private:
05409     Mat src;
05410     Mat dst;
05411     int interpolation, borderType;
05412     Scalar borderValue;
05413     int *adelta, *bdelta;
05414     double *M;
05415 };
05416 
05417 
05418 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
05419 class IPPWarpAffineInvoker :
05420     public ParallelLoopBody
05421 {
05422 public:
05423     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
05424                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
05425         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
05426         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
05427     {
05428         *ok = true;
05429     }
05430 
05431     virtual void operator() (const Range& range) const
05432     {
05433         IppiSize srcsize = { src.cols, src.rows };
05434         IppiRect srcroi = { 0, 0, src.cols, src.rows };
05435         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
05436         int cnn = src.channels();
05437         if( borderType == BORDER_CONSTANT )
05438         {
05439             IppiSize setSize = { dst.cols, range.end - range.start };
05440             void *dataPointer = dst.ptr(range.start);
05441             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
05442             {
05443                 *ok = false;
05444                 return;
05445             }
05446         }
05447 
05448         // Aug 2013: problem in IPP 7.1 and 8.0: the function sometimes returns ippStsCoeffErr
05449         IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
05450                                 (int)dst.step[0], dstroi, coeffs, mode );
05451         if( status < 0)
05452             *ok = false;
05453         else
05454         {
05455             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05456         }
05457     }
05458 private:
05459     Mat &src;
05460     Mat &dst;
05461     int mode;
05462     double (&coeffs)[2][3];
05463     int borderType;
05464     Scalar borderValue;
05465     ippiWarpAffineBackFunc func;
05466     bool *ok;
05467     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
05468 };
05469 #endif
05470 
05471 #ifdef HAVE_OPENCL
05472 
05473 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
05474 
05475 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
05476                               Size dsize, int flags, int borderType, const Scalar& borderValue,
05477                               int op_type)
05478 {
05479     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
05480     const ocl::Device & dev = ocl::Device::getDefault();
05481 
05482     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
05483     const bool doubleSupport = dev.doubleFPConfig() > 0;
05484 
05485     int interpolation = flags & INTER_MAX;
05486     if( interpolation == INTER_AREA )
05487         interpolation = INTER_LINEAR;
05488     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
05489 
05490     if ( !(borderType == cv::BORDER_CONSTANT &&
05491            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
05492          (!doubleSupport && depth == CV_64F) || cn > 4)
05493         return false;
05494 
05495     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
05496     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
05497                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
05498     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
05499 
05500     int scalarcn = cn == 3 ? 4 : cn;
05501     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
05502     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
05503     int sctype = CV_MAKETYPE(wdepth, scalarcn);
05504 
05505     ocl::Kernel k;
05506     String opts;
05507     if (interpolation == INTER_NEAREST)
05508     {
05509         opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
05510                       ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
05511                       ocl::typeToStr(CV_MAT_DEPTH(type)),
05512                       ocl::typeToStr(sctype), cn, rowsPerWI);
05513     }
05514     else
05515     {
05516         char cvt[2][50];
05517         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
05518                       " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
05519                       interpolationMap[interpolation], ocl::typeToStr(type),
05520                       ocl::typeToStr(CV_MAT_DEPTH(type)),
05521                       ocl::typeToStr(sctype),
05522                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
05523                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
05524                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
05525                       doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
05526     }
05527 
05528     k.create(kernelName, program, opts);
05529     if (k.empty())
05530         return false;
05531 
05532     double borderBuf[] = { 0, 0, 0, 0 };
05533     scalarToRawData(borderValue, borderBuf, sctype);
05534 
05535     UMat src = _src.getUMat(), M0;
05536     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
05537     UMat dst = _dst.getUMat();
05538 
05539     double M[9];
05540     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
05541     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
05542     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
05543                M1.rows == matRows && M1.cols == 3 );
05544     M1.convertTo(matM, matM.type());
05545 
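    // same convention as the CPU path below: unless WARP_INVERSE_MAP is set,
    // invert the matrix so the kernel can map destination pixels back to the source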
05546     if( !(flags & WARP_INVERSE_MAP) )
05547     {
05548         if (op_type == OCL_OP_PERSPECTIVE)
05549             invert(matM, matM);
05550         else
05551         {
05552             double D = M[0]*M[4] - M[1]*M[3];
05553             D = D != 0 ? 1./D : 0;
05554             double A11 = M[4]*D, A22=M[0]*D;
05555             M[0] = A11; M[1] *= -D;
05556             M[3] *= -D; M[4] = A22;
05557             double b1 = -M[0]*M[2] - M[1]*M[5];
05558             double b2 = -M[3]*M[2] - M[4]*M[5];
05559             M[2] = b1; M[5] = b2;
05560         }
05561     }
05562     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
05563 
05564     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
05565            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
05566 
05567     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
05568     return k.run(2, globalThreads, NULL, false);
05569 }
05570 
05571 #endif
05572 
05573 }
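/* Usage sketch (illustrative): ocl_warpTransform() above is reached through the
   transparent API whenever the caller passes UMat data, e.g.:

   @code
   #include <opencv2/core.hpp>
   #include <opencv2/imgproc.hpp>

   cv::UMat usrc, udst;
   src.copyTo(usrc);                                    // src is an existing cv::Mat
   cv::Mat M = cv::getRotationMatrix2D(
       cv::Point2f(src.cols * 0.5f, src.rows * 0.5f), 30.0, 1.0);
   cv::warpAffine(usrc, udst, M, src.size());           // dispatched via CV_OCL_RUN
   @endcode
*/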
05574 
05575 
05576 void cv::warpAffine( InputArray _src, OutputArray _dst,
05577                      InputArray _M0, Size dsize,
05578                      int flags, int borderType, const Scalar & borderValue )
05579 {
05580 #ifdef HAVE_OPENCL
05581     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
05582                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
05583                                  borderValue, OCL_OP_AFFINE))
05584 #endif
05585 
05586     Mat src = _src.getMat(), M0 = _M0.getMat();
05587     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
05588     Mat dst = _dst.getMat();
05589     CV_Assert( src.cols > 0 && src.rows > 0 );
05590     if( dst.data == src.data )
05591         src = src.clone();
05592 
05593     double M[6];
05594     Mat matM(2, 3, CV_64F, M);
05595     int interpolation = flags & INTER_MAX;
05596     if( interpolation == INTER_AREA )
05597         interpolation = INTER_LINEAR;
05598 
05599     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
05600     M0.convertTo(matM, matM.type());
05601 
05602 #ifdef HAVE_TEGRA_OPTIMIZATION
05603     if( tegra::useTegra() && tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
05604         return;
05605 #endif
05606 
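    // warpAffine() maps destination pixels back to source coordinates, so unless
    // WARP_INVERSE_MAP was passed, invert the 2x3 matrix [A|b] in place:
    // A' = A^-1, b' = -A^-1 * b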
05607     if( !(flags & WARP_INVERSE_MAP) )
05608     {
05609         double D = M[0]*M[4] - M[1]*M[3];
05610         D = D != 0 ? 1./D : 0;
05611         double A11 = M[4]*D, A22=M[0]*D;
05612         M[0] = A11; M[1] *= -D;
05613         M[3] *= -D; M[4] = A22;
05614         double b1 = -M[0]*M[2] - M[1]*M[5];
05615         double b2 = -M[3]*M[2] - M[4]*M[5];
05616         M[2] = b1; M[5] = b2;
05617     }
05618 
05619     int x;
05620     AutoBuffer<int>  _abdelta(dst.cols*2);
05621     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
05622     const int AB_BITS = MAX(10, (int)INTER_BITS);
05623     const int AB_SCALE = 1 << AB_BITS;
05624 
05625 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
05626     CV_IPP_CHECK()
05627     {
05628         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
05629         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
05630            ( cn == 1 || cn == 3 || cn == 4 ) &&
05631            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
05632            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
05633         {
05634             ippiWarpAffineBackFunc ippFunc = 0;
05635             if ((flags & WARP_INVERSE_MAP) != 0)
05636             {
05637                 ippFunc =
05638                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
05639                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
05640                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
05641                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
05642                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
05643                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
05644                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
05645                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
05646                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
05647                 0;
05648             }
05649             else
05650             {
05651                 ippFunc =
05652                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
05653                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
05654                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
05655                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
05656                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
05657                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
05658                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
05659                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
05660                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
05661                 0;
05662             }
05663             int mode =
05664             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
05665             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
05666             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
05667             0;
05668             CV_Assert(mode && ippFunc);
05669 
05670             double coeffs[2][3];
05671             for( int i = 0; i < 2; i++ )
05672                 for( int j = 0; j < 3; j++ )
05673                     coeffs[i][j] = matM.at<double>(i, j);
05674 
05675             bool ok;
05676             Range range(0, dst.rows);
05677             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
05678             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
05679             if( ok )
05680             {
05681                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05682                 return;
05683             }
05684             setIppErrorStatus();
05685         }
05686     }
05687 #endif
05688 
05689     for( x = 0; x < dst.cols; x++ )
05690     {
05691         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
05692         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
05693     }
05694 
05695     Range range(0, dst.rows);
05696     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
05697                               borderValue, adelta, bdelta, M);
05698     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
05699 }
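/* Usage sketch (illustrative, not part of the library source): the forward form
   of cv::warpAffine() defined above, with an explicit 2x3 matrix. The file name
   and matrix values are arbitrary.

   @code
   #include <opencv2/imgproc.hpp>
   #include <opencv2/imgcodecs.hpp>

   cv::Mat src = cv::imread("input.png"), dst;
   cv::Mat M = (cv::Mat_<double>(2, 3) << 0.8, 0.0, 20.0,
                                          0.0, 0.8, 10.0); // scale 0.8, shift (20,10)
   cv::warpAffine(src, dst, M, src.size(), cv::INTER_LINEAR,
                  cv::BORDER_CONSTANT, cv::Scalar());
   @endcode
*/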
05700 
05701 
05702 namespace cv
05703 {
05704 
05705 class WarpPerspectiveInvoker :
05706     public ParallelLoopBody
05707 {
05708 public:
05709     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
05710                            int _borderType, const Scalar &_borderValue) :
05711         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
05712         borderType(_borderType), borderValue(_borderValue)
05713     {
05714     }
05715 
05716     virtual void operator() (const Range& range) const
05717     {
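        // Per destination pixel the homography gives X = (M[0]*x + M[1]*y + M[2])/W,
        // Y = (M[3]*x + M[4]*y + M[5])/W with W = M[6]*x + M[7]*y + M[8] (W == 0 maps
        // to 0); as in the affine case, the image is processed in blocks fed to remap().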
05718         const int BLOCK_SZ = 32;
05719         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
05720         int x, y, x1, y1, width = dst.cols, height = dst.rows;
05721 
05722         int bh0 = std::min(BLOCK_SZ/2, height);
05723         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
05724         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
05725 
05726         #if CV_SSE4_1
05727         bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
05728         __m128d v_M0 = _mm_set1_pd(M[0]);
05729         __m128d v_M3 = _mm_set1_pd(M[3]);
05730         __m128d v_M6 = _mm_set1_pd(M[6]);
05731         __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
05732         __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
05733         __m128d v_2 = _mm_set1_pd(2),
05734                 v_zero = _mm_setzero_pd(),
05735                 v_1 = _mm_set1_pd(1),
05736                 v_its = _mm_set1_pd(INTER_TAB_SIZE);
05737         __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
05738         #endif
05739 
05740         for( y = range.start; y < range.end; y += bh0 )
05741         {
05742             for( x = 0; x < width; x += bw0 )
05743             {
05744                 int bw = std::min( bw0, width - x);
05745                 int bh = std::min( bh0, range.end - y); // height
05746 
05747                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
05748                 Mat dpart(dst, Rect(x, y, bw, bh));
05749 
05750                 for( y1 = 0; y1 < bh; y1++ )
05751                 {
05752                     short* xy = XY + y1*bw*2;
05753                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
05754                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
05755                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
05756 
05757                     if( interpolation == INTER_NEAREST )
05758                     {
05759                         x1 = 0;
05760 
05761                         #if CV_SSE4_1
05762                         if (haveSSE4_1)
05763                         {
05764                             __m128d v_X0d = _mm_set1_pd(X0);
05765                             __m128d v_Y0d = _mm_set1_pd(Y0);
05766                             __m128d v_W0 = _mm_set1_pd(W0);
05767                             __m128d v_x1 = _mm_set_pd(1, 0);
05768 
05769                             for( ; x1 <= bw - 16; x1 += 16 )
05770                             {
05771                                 // 0-3
05772                                 __m128i v_X0, v_Y0;
05773                                 {
05774                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05775                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05776                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05777                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05778                                     v_x1 = _mm_add_pd(v_x1, v_2);
05779 
05780                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05781                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05782                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05783                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05784                                     v_x1 = _mm_add_pd(v_x1, v_2);
05785 
05786                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05787                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05788                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05789                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05790                                 }
05791 
05792                                 // 4-7
05793                                 __m128i v_X1, v_Y1;
05794                                 {
05795                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05796                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05797                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05798                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05799                                     v_x1 = _mm_add_pd(v_x1, v_2);
05800 
05801                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05802                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05803                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05804                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05805                                     v_x1 = _mm_add_pd(v_x1, v_2);
05806 
05807                                     v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05808                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05809                                     v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05810                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05811                                 }
05812 
05813                                 // 8-11
05814                                 __m128i v_X2, v_Y2;
05815                                 {
05816                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05817                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05818                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05819                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05820                                     v_x1 = _mm_add_pd(v_x1, v_2);
05821 
05822                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05823                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05824                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05825                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05826                                     v_x1 = _mm_add_pd(v_x1, v_2);
05827 
05828                                     v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05829                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05830                                     v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05831                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05832                                 }
05833 
05834                                 // 12-15
05835                                 __m128i v_X3, v_Y3;
05836                                 {
05837                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05838                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05839                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05840                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05841                                     v_x1 = _mm_add_pd(v_x1, v_2);
05842 
05843                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05844                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
05845                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05846                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05847                                     v_x1 = _mm_add_pd(v_x1, v_2);
05848 
05849                                     v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05850                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05851                                     v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05852                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05853                                 }
05854 
05855                                 // convert to 16s
05856                                 v_X0 = _mm_packs_epi32(v_X0, v_X1);
05857                                 v_X1 = _mm_packs_epi32(v_X2, v_X3);
05858                                 v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
05859                                 v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
05860 
05861                                 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
05862 
05863                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
05864                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
05865                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
05866                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
05867                             }
05868                         }
05869                         #endif
05870 
05871                         for( ; x1 < bw; x1++ )
05872                         {
05873                             double W = W0 + M[6]*x1;
05874                             W = W ? 1./W : 0;
05875                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
05876                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
05877                             int X = saturate_cast<int>(fX);
05878                             int Y = saturate_cast<int>(fY);
05879 
05880                             xy[x1*2] = saturate_cast<short>(X);
05881                             xy[x1*2+1] = saturate_cast<short>(Y);
05882                         }
05883                     }
05884                     else
05885                     {
05886                         short* alpha = A + y1*bw;
05887                         x1 = 0;
05888 
05889                         #if CV_SSE4_1
05890                         if (haveSSE4_1)
05891                         {
05892                             __m128d v_X0d = _mm_set1_pd(X0);
05893                             __m128d v_Y0d = _mm_set1_pd(Y0);
05894                             __m128d v_W0 = _mm_set1_pd(W0);
05895                             __m128d v_x1 = _mm_set_pd(1, 0);
05896 
05897                             for( ; x1 <= bw - 16; x1 += 16 )
05898                             {
05899                                 // 0-3
05900                                 __m128i v_X0, v_Y0;
05901                                 {
05902                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05903                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05904                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05905                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05906                                     v_x1 = _mm_add_pd(v_x1, v_2);
05907 
05908                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05909                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05910                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05911                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05912                                     v_x1 = _mm_add_pd(v_x1, v_2);
05913 
05914                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05915                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05916                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05917                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05918                                 }
05919 
05920                                 // 4-7
05921                                 __m128i v_X1, v_Y1;
05922                                 {
05923                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05924                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05925                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05926                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05927                                     v_x1 = _mm_add_pd(v_x1, v_2);
05928 
05929                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05930                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05931                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05932                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05933                                     v_x1 = _mm_add_pd(v_x1, v_2);
05934 
05935                                     v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05936                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05937                                     v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05938                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05939                                 }
05940 
05941                                 // 8-11
05942                                 __m128i v_X2, v_Y2;
05943                                 {
05944                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05945                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05946                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05947                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05948                                     v_x1 = _mm_add_pd(v_x1, v_2);
05949 
05950                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05951                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05952                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05953                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05954                                     v_x1 = _mm_add_pd(v_x1, v_2);
05955 
05956                                     v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05957                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05958                                     v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05959                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05960                                 }
05961 
05962                                 // 12-15
05963                                 __m128i v_X3, v_Y3;
05964                                 {
05965                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05966                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05967                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05968                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05969                                     v_x1 = _mm_add_pd(v_x1, v_2);
05970 
05971                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
05972                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
05973                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
05974                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
05975                                     v_x1 = _mm_add_pd(v_x1, v_2);
05976 
05977                                     v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
05978                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
05979                                     v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
05980                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
05981                                 }
05982 
05983                                 // store alpha
05984                                 __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
05985                                                                  _mm_and_si128(v_X0, v_itsi1));
05986                                 __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
05987                                                                  _mm_and_si128(v_X1, v_itsi1));
05988                                 _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
05989 
05990                                 v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
05991                                                          _mm_and_si128(v_X2, v_itsi1));
05992                                 v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
05993                                                          _mm_and_si128(v_X3, v_itsi1));
05994                                 _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
05995 
05996                                 // convert to 16s
05997                                 v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
05998                                 v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
05999                                 v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
06000                                 v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
06001 
06002                                 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
06003 
06004                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
06005                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
06006                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
06007                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
06008                             }
06009                         }
06010                         #endif
06011 
06012                         for( ; x1 < bw; x1++ )
06013                         {
06014                             double W = W0 + M[6]*x1;
06015                             W = W ? INTER_TAB_SIZE/W : 0;
06016                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
06017                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
06018                             int X = saturate_cast<int>(fX);
06019                             int Y = saturate_cast<int>(fY);
06020 
06021                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
06022                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
06023                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
06024                                                 (X & (INTER_TAB_SIZE-1)));
06025                         }
06026                     }
06027                 }
06028 
06029                 if( interpolation == INTER_NEAREST )
06030                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
06031                 else
06032                 {
06033                     Mat _matA(bh, bw, CV_16U, A);
06034                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
06035                 }
06036             }
06037         }
06038     }
06039 
06040 private:
06041     Mat src;
06042     Mat dst;
06043     double* M;
06044     int interpolation, borderType;
06045     Scalar borderValue;
06046 };
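
#if 0
// Illustrative sketch (not compiled): how the scalar tail loop above encodes
// one destination pixel (x, y) into the fixed-point maps consumed by remap().
// INTER_BITS is 5, so INTER_TAB_SIZE == 32: "xy" holds the integer source
// coordinate and "alpha" indexes the 32x32 interpolation-weight table. The
// real loop additionally clamps the scaled values to [INT_MIN, INT_MAX]
// before rounding.
static void encodePixelSketch( const double M[9], int x, int y,
                               short xy[2], short& alpha )
{
    double W = M[6]*x + M[7]*y + M[8];
    W = W ? INTER_TAB_SIZE/W : 0;                    // perspective divide, pre-scaled
    int X = saturate_cast<int>((M[0]*x + M[1]*y + M[2])*W);
    int Y = saturate_cast<int>((M[3]*x + M[4]*y + M[5])*W);
    xy[0] = saturate_cast<short>(X >> INTER_BITS);   // integer source column
    xy[1] = saturate_cast<short>(Y >> INTER_BITS);   // integer source row
    alpha = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                    (X & (INTER_TAB_SIZE-1)));       // fractional-part table index
}
#endif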
06047 
06048 
06049 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
06050 class IPPWarpPerspectiveInvoker :
06051     public ParallelLoopBody
06052 {
06053 public:
06054     IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
06055                               int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
06056         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
06057         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
06058     {
06059         *ok = true;
06060     }
06061 
06062     virtual void operator() (const Range& range) const
06063     {
06064         IppiSize srcsize = {src.cols, src.rows};
06065         IppiRect srcroi = {0, 0, src.cols, src.rows};
06066         IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
06067         int cnn = src.channels();
06068 
06069         if( borderType == BORDER_CONSTANT )
06070         {
06071             IppiSize setSize = {dst.cols, range.end - range.start};
06072             void *dataPointer = dst.ptr(range.start);
06073             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
06074             {
06075                 *ok = false;
06076                 return;
06077             }
06078         }
06079 
06080         IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
06081         if (status != ippStsNoErr)
06082             *ok = false;
06083         else
06084         {
06085             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
06086         }
06087     }
06088 private:
06089     Mat &src;
06090     Mat &dst;
06091     int mode;
06092     double (&coeffs)[3][3];
06093     int borderType;
06094     const Scalar borderValue;
06095     ippiWarpPerspectiveFunc func;
06096     bool *ok;
06097 
06098     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
06099 };
06100 #endif
06101 }
06102 
06103 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
06104                           Size dsize, int flags, int borderType, const Scalar & borderValue )
06105 {
06106     CV_Assert( _src.total() > 0 );
06107 
06108 #ifdef HAVE_OPENCL
06109     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
06110                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
06111                               OCL_OP_PERSPECTIVE))
06112 #endif
06113 
06114     Mat src = _src.getMat(), M0 = _M0.getMat();
06115     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
06116     Mat dst = _dst.getMat();
06117 
06118     if( dst.data == src.data )
06119         src = src.clone();
06120 
06121     double M[9];
06122     Mat matM(3, 3, CV_64F, M);
06123     int interpolation = flags & INTER_MAX;
06124     if( interpolation == INTER_AREA )
06125         interpolation = INTER_LINEAR;
06126 
06127     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
06128     M0.convertTo(matM, matM.type());
06129 
06130 #ifdef HAVE_TEGRA_OPTIMIZATION
06131     if( tegra::useTegra() && tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
06132         return;
06133 #endif
06134 
06135 
06136 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
06137     CV_IPP_CHECK()
06138     {
06139         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
06140         if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
06141            (cn == 1 || cn == 3 || cn == 4) &&
06142            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
06143            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
06144         {
06145             ippiWarpPerspectiveFunc ippFunc = 0;
06146             if ((flags & WARP_INVERSE_MAP) != 0)
06147             {
06148                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
06149                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
06150                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
06151                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
06152                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
06153                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
06154                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
06155                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
06156                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
06157             }
06158             else
06159             {
06160                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
06161                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
06162                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
06163                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
06164                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
06165                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
06166                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
06167                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
06168                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
06169             }
06170             int mode =
06171             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
06172             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
06173             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
06174             CV_Assert(mode && ippFunc);
06175 
06176             double coeffs[3][3];
06177             for( int i = 0; i < 3; i++ )
06178                 for( int j = 0; j < 3; j++ )
06179                     coeffs[i][j] = matM.at<double>(i, j);
06180 
06181             bool ok;
06182             Range range(0, dst.rows);
06183             IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
06184             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
06185             if( ok )
06186             {
06187                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
06188                 return;
06189             }
06190             setIppErrorStatus();
06191         }
06192     }
06193 #endif
06194 
06195     if( !(flags & WARP_INVERSE_MAP) )
06196         invert(matM, matM);
06197 
06198     Range range(0, dst.rows);
06199     WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
06200     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
06201 }
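
#if 0
// Minimal usage sketch (illustrative only; the point values are hypothetical):
// warp a quadrilateral region of the source onto a 300x300 destination.
static void warpPerspectiveUsage( const cv::Mat& src, cv::Mat& dst )
{
    cv::Point2f from[4] = { cv::Point2f(56,65),   cv::Point2f(368,52),
                            cv::Point2f(389,390), cv::Point2f(28,387) };
    cv::Point2f to[4]   = { cv::Point2f(0,0),     cv::Point2f(300,0),
                            cv::Point2f(300,300), cv::Point2f(0,300) };
    cv::Mat H = cv::getPerspectiveTransform(from, to);    // 3x3, CV_64F
    cv::warpPerspective(src, dst, H, cv::Size(300,300),
                        cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar());
}
#endif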
06202 
06203 
06204 cv::Mat cv::getRotationMatrix2D( Point2f  center, double angle, double scale )
06205 {
06206     angle *= CV_PI/180;
06207     double alpha = cos(angle)*scale;
06208     double beta = sin(angle)*scale;
06209 
06210     Mat M(2, 3, CV_64F);
06211     double* m = M.ptr<double>();
06212 
06213     m[0] = alpha;
06214     m[1] = beta;
06215     m[2] = (1-alpha)*center.x - beta*center.y;
06216     m[3] = -beta;
06217     m[4] = alpha;
06218     m[5] = beta*center.x + (1-alpha)*center.y;
06219 
06220     return M;
06221 }
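
#if 0
// Sanity-check sketch (illustrative): by construction the matrix above maps
// the rotation center onto itself, M*(cx, cy, 1)^T == (cx, cy)^T, which is
// what makes it directly usable with warpAffine.
static void rotationMatrixUsage( const cv::Mat& src, cv::Mat& dst )
{
    cv::Point2f c(src.cols*0.5f, src.rows*0.5f);
    cv::Mat M = cv::getRotationMatrix2D(c, 30.0, 1.0);
    const double* m = M.ptr<double>();
    CV_Assert(std::abs(m[0]*c.x + m[1]*c.y + m[2] - c.x) < 1e-9 &&
              std::abs(m[3]*c.x + m[4]*c.y + m[5] - c.y) < 1e-9);
    cv::warpAffine(src, dst, M, src.size());
}
#endif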
06222 
06223 /* Calculates coefficients of perspective transformation
06224  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
06225  *
06226  *      c00*xi + c01*yi + c02
06227  * ui = ---------------------
06228  *      c20*xi + c21*yi + c22
06229  *
06230  *      c10*xi + c11*yi + c12
06231  * vi = ---------------------
06232  *      c20*xi + c21*yi + c22
06233  *
06234  * Coefficients are calculated by solving the linear system:
06235  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
06236  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
06237  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
06238  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
06239  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
06240  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
06241  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
06242  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
06243  *
06244  * where:
06245  *   cij - matrix coefficients, c22 = 1
06246  */
06247 cv::Mat cv::getPerspectiveTransform( const Point2f  src[], const Point2f  dst[] )
06248 {
06249     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
06250     double a[8][8], b[8];
06251     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
06252 
06253     for( int i = 0; i < 4; ++i )
06254     {
06255         a[i][0] = a[i+4][3] = src[i].x;
06256         a[i][1] = a[i+4][4] = src[i].y;
06257         a[i][2] = a[i+4][5] = 1;
06258         a[i][3] = a[i][4] = a[i][5] =
06259         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
06260         a[i][6] = -src[i].x*dst[i].x;
06261         a[i][7] = -src[i].y*dst[i].x;
06262         a[i+4][6] = -src[i].x*dst[i].y;
06263         a[i+4][7] = -src[i].y*dst[i].y;
06264         b[i] = dst[i].x;
06265         b[i+4] = dst[i].y;
06266     }
06267 
06268     solve( A, B, X, DECOMP_SVD );
06269     M.ptr<double>()[8] = 1.;
06270 
06271     return M;
06272 }
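
#if 0
// Worked example (illustrative): solve the 8x8 system above for the unit
// square mapped to a hypothetical quadrilateral, then verify one point by
// applying the homography with its perspective divide.
static void perspectiveFromFourPoints()
{
    cv::Point2f s[4] = { cv::Point2f(0,0),   cv::Point2f(1,0),
                         cv::Point2f(1,1),   cv::Point2f(0,1) };
    cv::Point2f d[4] = { cv::Point2f(10,10), cv::Point2f(90,20),
                         cv::Point2f(80,80), cv::Point2f(5,95) };
    cv::Mat H = cv::getPerspectiveTransform(s, d);   // 3x3 CV_64F, H(2,2) == 1
    const double* h = H.ptr<double>();
    double w = h[6]*s[2].x + h[7]*s[2].y + h[8];
    double u = (h[0]*s[2].x + h[1]*s[2].y + h[2]) / w;   // ~= d[2].x == 80
    double v = (h[3]*s[2].x + h[4]*s[2].y + h[5]) / w;   // ~= d[2].y == 80
    (void)u; (void)v;
}
#endif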
06273 
06274 /* Calculates coefficients of affine transformation
06275  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
06276  *
06277  * ui = c00*xi + c01*yi + c02
06278  *
06279  * vi = c10*xi + c11*yi + c12
06280  *
06281  * Coefficients are calculated by solving the linear system:
06282  * / x0 y0  1  0  0  0 \ /c00\ /u0\
06283  * | x1 y1  1  0  0  0 | |c01| |u1|
06284  * | x2 y2  1  0  0  0 | |c02| |u2|
06285  * |  0  0  0 x0 y0  1 | |c10| |v0|
06286  * |  0  0  0 x1 y1  1 | |c11| |v1|
06287  * \  0  0  0 x2 y2  1 / |c12| |v2|
06288  *
06289  * where:
06290  *   cij - matrix coefficients
06291  */
06292 
06293 cv::Mat cv::getAffineTransform( const Point2f  src[], const Point2f  dst[] )
06294 {
06295     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
06296     double a[6*6], b[6];
06297     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
06298 
06299     for( int i = 0; i < 3; i++ )
06300     {
06301         int j = i*12;
06302         int k = i*12+6;
06303         a[j] = a[k+3] = src[i].x;
06304         a[j+1] = a[k+4] = src[i].y;
06305         a[j+2] = a[k+5] = 1;
06306         a[j+3] = a[j+4] = a[j+5] = 0;
06307         a[k] = a[k+1] = a[k+2] = 0;
06308         b[i*2] = dst[i].x;
06309         b[i*2+1] = dst[i].y;
06310     }
06311 
06312     solve( A, B, X );
06313     return M;
06314 }
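
#if 0
// Illustrative sketch: three point pairs determine the six affine
// coefficients exactly. Mapping the origin and the two unit axis points
// makes the solution easy to read off: the last column is dst[0] and the
// first two columns are the images of the axis directions.
static void affineFromThreePoints()
{
    cv::Point2f s[3] = { cv::Point2f(0,0), cv::Point2f(1,0), cv::Point2f(0,1) };
    cv::Point2f d[3] = { cv::Point2f(2,3), cv::Point2f(4,3), cv::Point2f(2,6) };
    cv::Mat A = cv::getAffineTransform(s, d);   // 2x3 CV_64F
    // Here A == [2 0 2; 0 3 3], i.e. x' = 2x + 2 and y' = 3y + 3.
}
#endif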
06315 
06316 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
06317 {
06318     Mat matM = _matM.getMat();
06319     CV_Assert(matM.rows == 2 && matM.cols == 3);
06320     __iM.create(2, 3, matM.type());
06321     Mat _iM = __iM.getMat();
06322 
06323     if( matM.type() == CV_32F )
06324     {
06325         const float* M = matM.ptr<float>();
06326         float* iM = _iM.ptr<float>();
06327         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
06328 
06329         double D = M[0]*M[step+1] - M[1]*M[step];
06330         D = D != 0 ? 1./D : 0;
06331         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
06332         double b1 = -A11*M[2] - A12*M[step+2];
06333         double b2 = -A21*M[2] - A22*M[step+2];
06334 
06335         iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
06336         iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
06337     }
06338     else if( matM.type() == CV_64F )
06339     {
06340         const double* M = matM.ptr<double>();
06341         double* iM = _iM.ptr<double>();
06342         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
06343 
06344         double D = M[0]*M[step+1] - M[1]*M[step];
06345         D = D != 0 ? 1./D : 0;
06346         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
06347         double b1 = -A11*M[2] - A12*M[step+2];
06348         double b2 = -A21*M[2] - A22*M[step+2];
06349 
06350         iM[0] = A11; iM[1] = A12; iM[2] = b1;
06351         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
06352     }
06353     else
06354         CV_Error( CV_StsUnsupportedFormat, "" );
06355 }
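
#if 0
// Illustrative round-trip check: for M = [A | b] the code above returns
// iM = [A^-1 | -A^-1*b], so applying M and then iM reproduces the input
// point up to rounding.
static void invertAffineRoundTrip()
{
    cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(10,20), 45.0, 2.0), iM;
    cv::invertAffineTransform(M, iM);
    const double *m = M.ptr<double>(), *im = iM.ptr<double>();
    double x = 7, y = -3;
    double u = m[0]*x + m[1]*y + m[2],  v = m[3]*x + m[4]*y + m[5];
    double xr = im[0]*u + im[1]*v + im[2];   // ~= x
    double yr = im[3]*u + im[4]*v + im[5];   // ~= y
    (void)xr; (void)yr;
}
#endif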
06356 
06357 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
06358 {
06359     Mat src = _src.getMat(), dst = _dst.getMat();
06360     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
06361     return getPerspectiveTransform((const Point2f *)src.data, (const Point2f *)dst.data);
06362 }
06363 
06364 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
06365 {
06366     Mat src = _src.getMat(), dst = _dst.getMat();
06367     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
06368     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
06369 }
06370 
06371 CV_IMPL void
06372 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
06373 {
06374     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
06375     CV_Assert( src.type() == dst.type() );
06376     cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
06377         (double)dst.rows/src.rows, method );
06378 }
06379 
06380 
06381 CV_IMPL void
06382 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
06383               int flags, CvScalar  fillval )
06384 {
06385     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
06386     cv::Mat matrix = cv::cvarrToMat(marr);
06387     CV_Assert( src.type() == dst.type() );
06388     cv::warpAffine( src, dst, matrix, dst.size(), flags,
06389         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
06390         fillval );
06391 }
06392 
06393 CV_IMPL void
06394 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
06395                    int flags, CvScalar  fillval )
06396 {
06397     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
06398     cv::Mat matrix = cv::cvarrToMat(marr);
06399     CV_Assert( src.type() == dst.type() );
06400     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
06401         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
06402         fillval );
06403 }
06404 
06405 CV_IMPL void
06406 cvRemap( const CvArr* srcarr, CvArr* dstarr,
06407          const CvArr* _mapx, const CvArr* _mapy,
06408          int flags, CvScalar  fillval )
06409 {
06410     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
06411     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
06412     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
06413     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
06414         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
06415         fillval );
06416     CV_Assert( dst0.data == dst.data );
06417 }
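
#if 0
// Illustrative sketch: cvRemap above is a thin wrapper over cv::remap; with
// explicit CV_32F maps the same API expresses arbitrary warps. Here the maps
// encode a horizontal flip, dst(x,y) = src(cols-1-x, y).
static void remapFlipExample( const cv::Mat& src, cv::Mat& dst )
{
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for( int y = 0; y < src.rows; y++ )
        for( int x = 0; x < src.cols; x++ )
        {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x);
            mapy.at<float>(y, x) = (float)y;
        }
    cv::remap(src, dst, mapx, mapy, cv::INTER_LINEAR);
}
#endif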
06418 
06419 
06420 CV_IMPL CvMat*
06421 cv2DRotationMatrix( CvPoint2D32f center, double angle,
06422                     double scale, CvMat* matrix )
06423 {
06424     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
06425     CV_Assert( M.size() == M0.size() );
06426     M.convertTo(M0, M0.type());
06427     return matrix;
06428 }
06429 
06430 
06431 CV_IMPL CvMat*
06432 cvGetPerspectiveTransform( const CvPoint2D32f* src,
06433                           const CvPoint2D32f* dst,
06434                           CvMat* matrix )
06435 {
06436     cv::Mat M0 = cv::cvarrToMat(matrix),
06437         M = cv::getPerspectiveTransform((const cv::Point2f *)src, (const cv::Point2f *)dst);
06438     CV_Assert( M.size() == M0.size() );
06439     M.convertTo(M0, M0.type());
06440     return matrix;
06441 }
06442 
06443 
06444 CV_IMPL CvMat*
06445 cvGetAffineTransform( const CvPoint2D32f* src,
06446                           const CvPoint2D32f* dst,
06447                           CvMat* matrix )
06448 {
06449     cv::Mat M0 = cv::cvarrToMat(matrix),
06450         M = cv::getAffineTransform((const cv::Point2f *)src, (const cv::Point2f *)dst);
06451     CV_Assert( M.size() == M0.size() );
06452     M.convertTo(M0, M0.type());
06453     return matrix;
06454 }
06455 
06456 
06457 CV_IMPL void
06458 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
06459 {
06460     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
06461     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
06462 
06463     if( arr2 )
06464         map2 = cv::cvarrToMat(arr2);
06465     if( dstarr2 )
06466     {
06467         dstmap2 = cv::cvarrToMat(dstarr2);
06468         if( dstmap2.type() == CV_16SC1 )
06469             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
06470     }
06471 
06472     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
06473 }
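
#if 0
// Illustrative sketch: converting a CV_32F map pair into the packed
// fixed-point form (CV_16SC2 integer coordinates plus CV_16UC1 table
// indices) that remap() processes fastest; this is the same INTER_BITS
// encoding produced by the warp invokers earlier in this file.
static void convertMapsExample( const cv::Mat& mapx, const cv::Mat& mapy )
{
    cv::Mat map1, map2;
    cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2, false);
}
#endif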
06474 
06475 /****************************************************************************************\
06476 *                                   Log-Polar Transform                                  *
06477 \****************************************************************************************/
06478 
06479 /* now it is done via remap; a more correct implementation would use
06480    some super-sampling technique outside of the "fovea" circle */
06481 CV_IMPL void
06482 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
06483             CvPoint2D32f center, double M, int flags )
06484 {
06485     cv::Ptr<CvMat> mapx, mapy;
06486 
06487     CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
06488     CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
06489     CvSize ssize, dsize;
06490 
06491     if( !CV_ARE_TYPES_EQ( src, dst ))
06492         CV_Error( CV_StsUnmatchedFormats, "" );
06493 
06494     if( M <= 0 )
06495         CV_Error( CV_StsOutOfRange, "M should be >0" );
06496 
06497     ssize = cvGetMatSize(src);
06498     dsize = cvGetMatSize(dst);
06499 
06500     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06501     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06502 
06503     if( !(flags & CV_WARP_INVERSE_MAP) )
06504     {
06505         int phi, rho;
06506         cv::AutoBuffer<double> _exp_tab(dsize.width);
06507         double* exp_tab = _exp_tab;
06508 
06509         for( rho = 0; rho < dst->width; rho++ )
06510             exp_tab[rho] = std::exp(rho/M);
06511 
06512         for( phi = 0; phi < dsize.height; phi++ )
06513         {
06514             double cp = cos(phi*2*CV_PI/dsize.height);
06515             double sp = sin(phi*2*CV_PI/dsize.height);
06516             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
06517             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
06518 
06519             for( rho = 0; rho < dsize.width; rho++ )
06520             {
06521                 double r = exp_tab[rho];
06522                 double x = r*cp + center.x;
06523                 double y = r*sp + center.y;
06524 
06525                 mx[rho] = (float)x;
06526                 my[rho] = (float)y;
06527             }
06528         }
06529     }
06530     else
06531     {
06532         int x, y;
06533         CvMat bufx, bufy, bufp, bufa;
06534         double ascale = ssize.height/(2*CV_PI);
06535         cv::AutoBuffer<float> _buf(4*dsize.width);
06536         float* buf = _buf;
06537 
06538         bufx = cvMat( 1, dsize.width, CV_32F, buf );
06539         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
06540         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
06541         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
06542 
06543         for( x = 0; x < dsize.width; x++ )
06544             bufx.data.fl[x] = (float)x - center.x;
06545 
06546         for( y = 0; y < dsize.height; y++ )
06547         {
06548             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
06549             float* my = (float*)(mapy->data.ptr + y*mapy->step);
06550 
06551             for( x = 0; x < dsize.width; x++ )
06552                 bufy.data.fl[x] = (float)y - center.y;
06553 
06554 #if 1
06555             cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
06556 
06557             for( x = 0; x < dsize.width; x++ )
06558                 bufp.data.fl[x] += 1.f;
06559 
06560             cvLog( &bufp, &bufp );
06561 
06562             for( x = 0; x < dsize.width; x++ )
06563             {
06564                 double rho = bufp.data.fl[x]*M;
06565                 double phi = bufa.data.fl[x]*ascale;
06566 
06567                 mx[x] = (float)rho;
06568                 my[x] = (float)phi;
06569             }
06570 #else
06571             for( x = 0; x < dsize.width; x++ )
06572             {
06573                 double xx = bufx.data.fl[x];
06574                 double yy = bufy.data.fl[x];
06575 
06576                 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
06577                 double a = atan2(yy,xx);
06578                 if( a < 0 )
06579                     a = 2*CV_PI + a;
06580                 a *= ascale;
06581 
06582                 mx[x] = (float)p;
06583                 my[x] = (float)a;
06584             }
06585 #endif
06586         }
06587     }
06588 
06589     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
06590 }
06591 
06592 void cv::logPolar( InputArray _src, OutputArray _dst,
06593                    Point2f  center, double M, int flags )
06594 {
06595     Mat src = _src.getMat();
06596     _dst.create( src.size(), src.type() );
06597     CvMat c_src = src, c_dst = _dst.getMat();
06598     cvLogPolar( &c_src, &c_dst, center, M, flags );
06599 }
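
#if 0
// Illustrative sketch of the forward log-polar mapping built above: output
// column rho samples the source at an exponentially growing radius
// exp(rho/M), while output row phi sweeps one full turn of angle.
static void logPolarSourcePoint( cv::Point2f center, double M,
                                 int rho, int phi, int dstHeight,
                                 double& sx, double& sy )
{
    double r = std::exp(rho / M);
    double a = phi * 2 * CV_PI / dstHeight;
    sx = r * std::cos(a) + center.x;   // source x for dst(phi, rho)
    sy = r * std::sin(a) + center.y;   // source y
}
#endif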
06600 
06601 /****************************************************************************************
06602                                    Linear-Polar Transform
06603   J.L. Blanco, Apr 2009
06604  ****************************************************************************************/
06605 CV_IMPL
06606 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
06607             CvPoint2D32f center, double maxRadius, int flags )
06608 {
06609     cv::Ptr<CvMat> mapx, mapy;
06610 
06611     CvMat srcstub, *src = (CvMat*)srcarr;
06612     CvMat dststub, *dst = (CvMat*)dstarr;
06613     CvSize ssize, dsize;
06614 
06615     src = cvGetMat( srcarr, &srcstub,0,0 );
06616     dst = cvGetMat( dstarr, &dststub,0,0 );
06617 
06618     if( !CV_ARE_TYPES_EQ( src, dst ))
06619         CV_Error( CV_StsUnmatchedFormats, "" );
06620 
06621     ssize.width = src->cols;
06622     ssize.height = src->rows;
06623     dsize.width = dst->cols;
06624     dsize.height = dst->rows;
06625 
06626     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06627     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
06628 
06629     if( !(flags & CV_WARP_INVERSE_MAP) )
06630     {
06631         int phi, rho;
06632 
06633         for( phi = 0; phi < dsize.height; phi++ )
06634         {
06635             double cp = cos(phi*2*CV_PI/dsize.height);
06636             double sp = sin(phi*2*CV_PI/dsize.height);
06637             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
06638             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
06639 
06640             for( rho = 0; rho < dsize.width; rho++ )
06641             {
06642                 double r = maxRadius*(rho+1)/dsize.width;
06643                 double x = r*cp + center.x;
06644                 double y = r*sp + center.y;
06645 
06646                 mx[rho] = (float)x;
06647                 my[rho] = (float)y;
06648             }
06649         }
06650     }
06651     else
06652     {
06653         int x, y;
06654         CvMat bufx, bufy, bufp, bufa;
06655         const double ascale = ssize.height/(2*CV_PI);
06656         const double pscale = ssize.width/maxRadius;
06657 
06658         cv::AutoBuffer<float> _buf(4*dsize.width);
06659         float* buf = _buf;
06660 
06661         bufx = cvMat( 1, dsize.width, CV_32F, buf );
06662         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
06663         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
06664         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
06665 
06666         for( x = 0; x < dsize.width; x++ )
06667             bufx.data.fl[x] = (float)x - center.x;
06668 
06669         for( y = 0; y < dsize.height; y++ )
06670         {
06671             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
06672             float* my = (float*)(mapy->data.ptr + y*mapy->step);
06673 
06674             for( x = 0; x < dsize.width; x++ )
06675                 bufy.data.fl[x] = (float)y - center.y;
06676 
06677             cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
06678 
06679             for( x = 0; x < dsize.width; x++ )
06680                 bufp.data.fl[x] += 1.f;
06681 
06682             for( x = 0; x < dsize.width; x++ )
06683             {
06684                 double rho = bufp.data.fl[x]*pscale;
06685                 double phi = bufa.data.fl[x]*ascale;
06686                 mx[x] = (float)rho;
06687                 my[x] = (float)phi;
06688             }
06689         }
06690     }
06691 
06692     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
06693 }
06694 
06695 void cv::linearPolar( InputArray _src, OutputArray _dst,
06696                       Point2f  center, double maxRadius, int flags )
06697 {
06698     Mat src = _src.getMat();
06699     _dst.create( src.size(), src.type() );
06700     CvMat c_src = src, c_dst = _dst.getMat();
06701     cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );
06702 }
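
#if 0
// Minimal usage sketch (illustrative; the radius choice is hypothetical):
// unwrap an image around its center so that rows correspond to angle and
// columns to linear radius.
static void linearPolarUsage( const cv::Mat& src, cv::Mat& dst )
{
    cv::Point2f center(src.cols*0.5f, src.rows*0.5f);
    double maxRadius = 0.5*std::min(src.cols, src.rows);
    cv::linearPolar(src, dst, center, maxRadius,
                    cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
}
#endif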
06703 
06704 /* End of file. */
06705