Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do



convert.cpp Source File

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 
00046 #include "opencl_kernels_core.hpp"
00047 
00048 #ifdef __APPLE__
00049 #undef CV_NEON
00050 #define CV_NEON 0
00051 #endif
00052 
00053 
00054 /****************************************************************************************\
00055 *                                       split & merge                                    *
00056 \****************************************************************************************/
00057 
00058 typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
00059 
00060 static SplitFunc getSplitFunc(int depth)
00061 {
00062     static SplitFunc splitTab[] =
00063     {
00064         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
00065         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
00066     };
00067 
00068     return splitTab[depth];
00069 }
00070 
00071 typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
00072 
00073 static MergeFunc getMergeFunc(int depth)
00074 {
00075     static MergeFunc mergeTab[] =
00076     {
00077         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
00078         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
00079     };
00080 
00081     return mergeTab[depth];
00082 }
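
// A short sketch of the dispatch convention shared by both tables above,
// assuming the standard depth codes: the index is CV_MAT_DEPTH (CV_8U=0
// .. CV_64F=6), same-sized signed/unsigned depths share one routine since
// split/merge only move bytes, and the trailing 0 entry (depth 7) is
// caught by the callers' CV_Assert(func != 0). depthDispatchSketch is a
// hypothetical helper:

static void depthDispatchSketch()
{
    CV_Assert( getSplitFunc(CV_8S)  == getSplitFunc(CV_8U)  );  // 1-byte elements
    CV_Assert( getSplitFunc(CV_32F) == getSplitFunc(CV_32S) );  // 4-byte elements
}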
00083 
00084 void cv::split(const Mat& src, Mat* mv)
00085 {
00086     int k, depth = src.depth(), cn = src.channels();
00087     if( cn == 1 )
00088     {
00089         src.copyTo(mv[0]);
00090         return;
00091     }
00092 
00093     SplitFunc func = getSplitFunc(depth);
00094     CV_Assert( func != 0 );
00095 
00096     int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
00097     int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
00098     AutoBuffer<uchar>  _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
00099     const Mat** arrays = (const Mat**)(uchar*)_buf;
00100     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
00101 
00102     arrays[0] = &src;
00103     for( k = 0; k < cn; k++ )
00104     {
00105         mv[k].create(src.dims, src.size, depth);
00106         arrays[k+1] = &mv[k];
00107     }
00108 
00109     NAryMatIterator it(arrays, ptrs, cn+1);
00110     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
00111 
00112     for( size_t i = 0; i < it.nplanes; i++, ++it )
00113     {
00114         for( int j = 0; j < total; j += blocksize )
00115         {
00116             int bsz = std::min(total - j, blocksize);
00117             func( ptrs[0], &ptrs[1], bsz, cn );
00118 
00119             if( j + blocksize < total )
00120             {
00121                 ptrs[0] += bsz*esz;
00122                 for( k = 0; k < cn; k++ )
00123                     ptrs[k+1] += bsz*esz1;
00124             }
00125         }
00126     }
00127 }
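
// A minimal usage sketch for this overload, assuming a CV_8UC3 input: the
// caller supplies an array of cn Mat headers and split() allocates each
// single-channel plane (splitPlanesSketch is a hypothetical name):

static void splitPlanesSketch(const cv::Mat& bgr)
{
    cv::Mat planes[3];
    cv::split(bgr, planes);        // each planes[k] becomes CV_8UC1
    // planes[0]=B, planes[1]=G, planes[2]=R in OpenCV's channel order
}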
00128 
00129 #ifdef HAVE_OPENCL
00130 
00131 namespace cv {
00132 
00133 static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
00134 {
00135     int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
00136             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
00137 
00138     String dstargs, processelem, indexdecl;
00139     for (int i = 0; i < cn; ++i)
00140     {
00141         dstargs += format("DECLARE_DST_PARAM(%d)", i);
00142         indexdecl += format("DECLARE_INDEX(%d)", i);
00143         processelem += format("PROCESS_ELEM(%d)", i);
00144     }
00145 
00146     ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
00147                   format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
00148                          " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
00149                          ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
00150                          processelem.c_str(), indexdecl.c_str()));
00151     if (k.empty())
00152         return false;
00153 
00154     Size size = _m.size();
00155     _mv.create(cn, 1, depth);
00156     for (int i = 0; i < cn; ++i)
00157         _mv.create(size, depth, i);
00158 
00159     std::vector<UMat> dst;
00160     _mv.getUMatVector(dst);
00161 
00162     int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
00163     for (int i = 0; i < cn; ++i)
00164         argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
00165     k.set(argidx, rowsPerWI);
00166 
00167     size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
00168     return k.run(2, globalsize, NULL, false);
00169 }
00170 
00171 }
00172 
00173 #endif
00174 
00175 void cv::split(InputArray _m, OutputArrayOfArrays _mv)
00176 {
00177 #ifdef HAVE_OPENCL
00178     CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
00179                ocl_split(_m, _mv))
00180 #endif
00181 
00182     Mat m = _m.getMat();
00183     if( m.empty() )
00184     {
00185         _mv.release();
00186         return;
00187     }
00188 
00189     CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );
00190 
00191     int depth = m.depth(), cn = m.channels();
00192     _mv.create(cn, 1, depth);
00193     for (int i = 0; i < cn; ++i)
00194         _mv.create(m.dims, m.size.p, depth, i);
00195 
00196     std::vector<Mat> dst;
00197     _mv.getMatVector(dst);
00198 
00199     split(m, &dst[0]);
00200 }
00201 
00202 void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
00203 {
00204     CV_Assert( mv && n > 0 );
00205 
00206     int depth = mv[0].depth();
00207     bool allch1 = true;
00208     int k, cn = 0;
00209     size_t i;
00210 
00211     for( i = 0; i < n; i++ )
00212     {
00213         CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
00214         allch1 = allch1 && mv[i].channels() == 1;
00215         cn += mv[i].channels();
00216     }
00217 
00218     CV_Assert( 0 < cn && cn <= CV_CN_MAX );
00219     _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
00220     Mat dst = _dst.getMat();
00221 
00222     if( n == 1 )
00223     {
00224         mv[0].copyTo(dst);
00225         return;
00226     }
00227 
00228     if( !allch1 )
00229     {
00230         AutoBuffer<int>  pairs(cn*2);
00231         int j, ni=0;
00232 
00233         for( i = 0, j = 0; i < n; i++, j += ni )
00234         {
00235             ni = mv[i].channels();
00236             for( k = 0; k < ni; k++ )
00237             {
00238                 pairs[(j+k)*2] = j + k;
00239                 pairs[(j+k)*2+1] = j + k;
00240             }
00241         }
00242         mixChannels( mv, n, &dst, 1, &pairs[0], cn );
00243         return;
00244     }
00245 
00246     size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
00247     int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
00248     AutoBuffer<uchar>  _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
00249     const Mat** arrays = (const Mat**)(uchar*)_buf;
00250     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
00251 
00252     arrays[0] = &dst;
00253     for( k = 0; k < cn; k++ )
00254         arrays[k+1] = &mv[k];
00255 
00256     NAryMatIterator it(arrays, ptrs, cn+1);
00257     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
00258     MergeFunc func = getMergeFunc(depth);
00259 
00260     for( i = 0; i < it.nplanes; i++, ++it )
00261     {
00262         for( int j = 0; j < total; j += blocksize )
00263         {
00264             int bsz = std::min(total - j, blocksize);
00265             func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );
00266 
00267             if( j + blocksize < total )
00268             {
00269                 ptrs[0] += bsz*esz;
00270                 for( int t = 0; t < cn; t++ )
00271                     ptrs[t+1] += bsz*esz1;
00272             }
00273         }
00274     }
00275 }
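
// A minimal sketch of this plain-array overload, assuming three CV_8UC1
// planes of identical size; inputs may also be multi-channel, in which
// case their channel counts are summed into the output type
// (mergePlanesSketch is a hypothetical name):

static void mergePlanesSketch(const cv::Mat planes[3], cv::Mat& bgr)
{
    cv::merge(planes, 3, bgr);     // bgr becomes CV_8UC3
}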
00276 
00277 #ifdef HAVE_OPENCL
00278 
00279 namespace cv {
00280 
00281 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
00282 {
00283     std::vector<UMat> src, ksrc;
00284     _mv.getUMatVector(src);
00285     CV_Assert(!src.empty());
00286 
00287     int type = src[0].type(), depth = CV_MAT_DEPTH(type),
00288             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
00289     Size size = src[0].size();
00290 
00291     for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
00292     {
00293         int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
00294                 esz1 = CV_ELEM_SIZE1(idepth);
00295         if (src[i].dims > 2)
00296             return false;
00297 
00298         CV_Assert(size == src[i].size() && depth == idepth);
00299 
00300         for (int cn = 0; cn < icn; ++cn)
00301         {
00302             UMat tsrc = src[i];
00303             tsrc.offset += cn * esz1;
00304             ksrc.push_back(tsrc);
00305         }
00306     }
00307     int dcn = (int)ksrc.size();
00308 
00309     String srcargs, processelem, cndecl, indexdecl;
00310     for (int i = 0; i < dcn; ++i)
00311     {
00312         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
00313         processelem += format("PROCESS_ELEM(%d)", i);
00314         indexdecl += format("DECLARE_INDEX(%d)", i);
00315         cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
00316     }
00317 
00318     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
00319                   format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
00320                          " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
00321                          dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
00322                          indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
00323     if (k.empty())
00324         return false;
00325 
00326     _dst.create(size, CV_MAKE_TYPE(depth, dcn));
00327     UMat dst = _dst.getUMat();
00328 
00329     int argidx = 0;
00330     for (int i = 0; i < dcn; ++i)
00331         argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
00332     argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
00333     k.set(argidx, rowsPerWI);
00334 
00335     size_t globalsize[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
00336     return k.run(2, globalsize, NULL, false);
00337 }
00338 
00339 }
00340 
00341 #endif
00342 
00343 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
00344 {
00345 #ifdef HAVE_OPENCL
00346     CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
00347                ocl_merge(_mv, _dst))
00348 #endif
00349 
00350     std::vector<Mat> mv;
00351     _mv.getMatVector(mv);
00352     merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
00353 }
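
// A round-trip sketch using the InputArrayOfArrays overloads, assuming an
// arbitrary multi-channel source (roundTripSketch is a hypothetical name):

static void roundTripSketch(const cv::Mat& src, cv::Mat& dst)
{
    std::vector<cv::Mat> mv;
    cv::split(src, mv);            // mv.size() == src.channels()
    cv::merge(mv, dst);            // dst gets the same type and data as src
}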
00354 
00355 /****************************************************************************************\
00356 *                       Generalized split/merge: mixing channels                         *
00357 \****************************************************************************************/
00358 
00359 namespace cv
00360 {
00361 
00362 template<typename T> static void
00363 mixChannels_( const T** src, const int* sdelta,
00364               T** dst, const int* ddelta,
00365               int len, int npairs )
00366 {
00367     int i, k;
00368     for( k = 0; k < npairs; k++ )
00369     {
00370         const T* s = src[k];
00371         T* d = dst[k];
00372         int ds = sdelta[k], dd = ddelta[k];
00373         if( s )
00374         {
00375             for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
00376             {
00377                 T t0 = s[0], t1 = s[ds];
00378                 d[0] = t0; d[dd] = t1;
00379             }
00380             if( i < len )
00381                 d[0] = s[0];
00382         }
00383         else
00384         {
00385             for( i = 0; i <= len - 2; i += 2, d += dd*2 )
00386                 d[0] = d[dd] = 0;
00387             if( i < len )
00388                 d[0] = 0;
00389         }
00390     }
00391 }
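
// Here sdelta[k] and ddelta[k] are per-pair strides in elements: a pair
// reading channel sc of an interleaved scn-channel row advances by scn per
// pixel, and likewise for the destination. A scalar sketch of one pair
// under those assumptions (copyOneChannelSketch is a hypothetical name):

static void copyOneChannelSketch(const uchar* srcRow, int scn, int sc,
                                 uchar* dstRow, int dcn, int dc, int len)
{
    const uchar* s = srcRow + sc;  // like src[k]: row base + channel offset
    uchar* d = dstRow + dc;
    for( int i = 0; i < len; i++, s += scn, d += dcn )  // sdelta=scn, ddelta=dcn
        *d = *s;
}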
00392 
00393 
00394 static void mixChannels8u( const uchar** src, const int* sdelta,
00395                            uchar** dst, const int* ddelta,
00396                            int len, int npairs )
00397 {
00398     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00399 }
00400 
00401 static void mixChannels16u( const ushort** src, const int* sdelta,
00402                             ushort** dst, const int* ddelta,
00403                             int len, int npairs )
00404 {
00405     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00406 }
00407 
00408 static void mixChannels32s( const int** src, const int* sdelta,
00409                             int** dst, const int* ddelta,
00410                             int len, int npairs )
00411 {
00412     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00413 }
00414 
00415 static void mixChannels64s( const int64** src, const int* sdelta,
00416                             int64** dst, const int* ddelta,
00417                             int len, int npairs )
00418 {
00419     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00420 }
00421 
00422 typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
00423         uchar** dst, const int* ddelta, int len, int npairs );
00424 
00425 static MixChannelsFunc getMixchFunc(int depth)
00426 {
00427     static MixChannelsFunc mixchTab[] =
00428     {
00429         (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
00430         (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
00431         (MixChannelsFunc)mixChannels64s, 0
00432     };
00433 
00434     return mixchTab[depth];
00435 }
00436 
00437 }
00438 
00439 void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
00440 {
00441     if( npairs == 0 )
00442         return;
00443     CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );
00444 
00445     size_t i, j, k, esz1 = dst[0].elemSize1();
00446     int depth = dst[0].depth();
00447 
00448     AutoBuffer<uchar>  buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
00449     const Mat** arrays = (const Mat**)(uchar*)buf;
00450     uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
00451     const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
00452     uchar** dsts = (uchar**)(srcs + npairs);
00453     int* tab = (int*)(dsts + npairs);
00454     int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;
00455 
00456     for( i = 0; i < nsrcs; i++ )
00457         arrays[i] = &src[i];
00458     for( i = 0; i < ndsts; i++ )
00459         arrays[i + nsrcs] = &dst[i];
00460     ptrs[nsrcs + ndsts] = 0;
00461 
00462     for( i = 0; i < npairs; i++ )
00463     {
00464         int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
00465         if( i0 >= 0 )
00466         {
00467             for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
00468                 if( i0 < src[j].channels() )
00469                     break;
00470             CV_Assert(j < nsrcs && src[j].depth() == depth);
00471             tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
00472             sdelta[i] = src[j].channels();
00473         }
00474         else
00475         {
00476             tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
00477             sdelta[i] = 0;
00478         }
00479 
00480         for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
00481             if( i1 < dst[j].channels() )
00482                 break;
00483         CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
00484         tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
00485         ddelta[i] = dst[j].channels();
00486     }
00487 
00488     NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
00489     int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
00490     MixChannelsFunc func = getMixchFunc(depth);
00491 
00492     for( i = 0; i < it.nplanes; i++, ++it )
00493     {
00494         for( k = 0; k < npairs; k++ )
00495         {
00496             srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
00497             dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
00498         }
00499 
00500         for( int t = 0; t < total; t += blocksize )
00501         {
00502             int bsz = std::min(total - t, blocksize);
00503             func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );
00504 
00505             if( t + blocksize < total )
00506                 for( k = 0; k < npairs; k++ )
00507                 {
00508                     srcs[k] += blocksize*sdelta[k]*esz1;
00509                     dsts[k] += blocksize*ddelta[k]*esz1;
00510                 }
00511         }
00512     }
00513 }
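
// A usage sketch adapted from the OpenCV reference example for this
// function: fromTo holds (input, output) channel index pairs, with input
// channels numbered consecutively across all source arrays and output
// channels across all destination arrays. Note that the destinations must
// be preallocated for this overload (bgraToPlanesSketch is hypothetical):

static void bgraToPlanesSketch(const cv::Mat& bgra)   // bgra assumed CV_8UC4
{
    cv::Mat bgr( bgra.rows, bgra.cols, CV_8UC3 );
    cv::Mat alpha( bgra.rows, bgra.cols, CV_8UC1 );
    cv::Mat out[] = { bgr, alpha };                   // headers only, no copy
    int fromTo[] = { 0,0, 1,1, 2,2, 3,3 };            // B,G,R -> bgr; A -> alpha
    cv::mixChannels( &bgra, 1, out, 2, fromTo, 4 );
}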
00514 
00515 #ifdef HAVE_OPENCL
00516 
00517 namespace cv {
00518 
00519 static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
00520 {
00521     int totalChannels = 0;
00522     for (size_t i = 0, size = um.size(); i < size; ++i)
00523     {
00524         int ccn = um[i].channels();
00525         totalChannels += ccn;
00526 
00527         if (totalChannels == cn)
00528         {
00529             idx = (int)(i + 1);
00530             cnidx = 0;
00531             return;
00532         }
00533         else if (totalChannels > cn)
00534         {
00535             idx = (int)i;
00536             cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
00537             return;
00538         }
00539     }
00540 
00541     idx = cnidx = -1;
00542 }
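
// A worked example of this index helper, assuming two source images with
// channel counts {3, 2}, i.e. 5 global channels in total:
//   cn=2  ->  idx=0, cnidx=2   (third channel of um[0])
//   cn=3  ->  idx=1, cnidx=0   (um[0] exhausted exactly; start of um[1])
//   cn=4  ->  idx=1, cnidx=1   (cn - totalChannels + ccn = 4 - 5 + 2)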
00543 
00544 static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
00545                             const int* fromTo, size_t npairs)
00546 {
00547     std::vector<UMat> src, dst;
00548     _src.getUMatVector(src);
00549     _dst.getUMatVector(dst);
00550 
00551     size_t nsrc = src.size(), ndst = dst.size();
00552     CV_Assert(nsrc > 0 && ndst > 0);
00553 
00554     Size size = src[0].size();
00555     int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
00556             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
00557 
00558     for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
00559         CV_Assert(src[i].size() == size && src[i].depth() == depth);
00560     for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
00561         CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
00562 
00563     String declsrc, decldst, declproc, declcn, indexdecl;
00564     std::vector<UMat> srcargs(npairs), dstargs(npairs);
00565 
00566     for (size_t i = 0; i < npairs; ++i)
00567     {
00568         int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
00569         int src_idx, src_cnidx, dst_idx, dst_cnidx;
00570 
00571         getUMatIndex(src, scn, src_idx, src_cnidx);
00572         getUMatIndex(dst, dcn, dst_idx, dst_cnidx);
00573 
00574         CV_Assert(dst_idx >= 0 && src_idx >= 0);
00575 
00576         srcargs[i] = src[src_idx];
00577         srcargs[i].offset += src_cnidx * esz;
00578 
00579         dstargs[i] = dst[dst_idx];
00580         dstargs[i].offset += dst_cnidx * esz;
00581 
00582         declsrc += format("DECLARE_INPUT_MAT(%d)", i);
00583         decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
00584         indexdecl += format("DECLARE_INDEX(%d)", i);
00585         declproc += format("PROCESS_ELEM(%d)", i);
00586         declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
00587     }
00588 
00589     ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
00590                   format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
00591                          " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
00592                          ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
00593                          declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
00594     if (k.empty())
00595         return false;
00596 
00597     int argindex = 0;
00598     for (size_t i = 0; i < npairs; ++i)
00599         argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
00600     for (size_t i = 0; i < npairs; ++i)
00601         argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
00602     argindex = k.set(argindex, size.height);
00603     argindex = k.set(argindex, size.width);
00604     k.set(argindex, rowsPerWI);
00605 
00606     size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
00607     return k.run(2, globalsize, NULL, false);
00608 }
00609 
00610 }
00611 
00612 #endif
00613 
00614 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
00615                  const int* fromTo, size_t npairs)
00616 {
00617     if (npairs == 0 || fromTo == NULL)
00618         return;
00619 
00620 #ifdef HAVE_OPENCL
00621     CV_OCL_RUN(dst.isUMatVector(),
00622                ocl_mixChannels(src, dst, fromTo, npairs))
00623 #endif
00624 
00625     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
00626             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
00627             src.kind() != _InputArray::STD_VECTOR_UMAT;
00628     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
00629             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
00630             dst.kind() != _InputArray::STD_VECTOR_UMAT;
00631     int i;
00632     int nsrc = src_is_mat ? 1 : (int)src.total();
00633     int ndst = dst_is_mat ? 1 : (int)dst.total();
00634 
00635     CV_Assert(nsrc > 0 && ndst > 0);
00636     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
00637     Mat* buf = _buf;
00638     for( i = 0; i < nsrc; i++ )
00639         buf[i] = src.getMat(src_is_mat ? -1 : i);
00640     for( i = 0; i < ndst; i++ )
00641         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
00642     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
00643 }
00644 
00645 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
00646                      const std::vector<int>& fromTo)
00647 {
00648     if (fromTo.empty())
00649         return;
00650 
00651 #ifdef HAVE_OPENCL
00652     CV_OCL_RUN(dst.isUMatVector(),
00653                ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
00654 #endif
00655 
00656     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
00657             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
00658             src.kind() != _InputArray::STD_VECTOR_UMAT;
00659     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
00660             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
00661             dst.kind() != _InputArray::STD_VECTOR_UMAT;
00662     int i;
00663     int nsrc = src_is_mat ? 1 : (int)src.total();
00664     int ndst = dst_is_mat ? 1 : (int)dst.total();
00665 
00666     CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
00667     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
00668     Mat* buf = _buf;
00669     for( i = 0; i < nsrc; i++ )
00670         buf[i] = src.getMat(src_is_mat ? -1 : i);
00671     for( i = 0; i < ndst; i++ )
00672         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
00673     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
00674 }
00675 
00676 void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
00677 {
00678     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
00679     CV_Assert( 0 <= coi && coi < cn );
00680     int ch[] = { coi, 0 };
00681 
00682 #ifdef HAVE_OPENCL
00683     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
00684     {
00685         UMat  src = _src.getUMat();
00686         _dst.create(src.dims, &src.size[0], depth);
00687         UMat  dst = _dst.getUMat();
00688         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
00689         return;
00690     }
00691 #endif
00692 
00693     Mat src = _src.getMat();
00694     _dst.create(src.dims, &src.size[0], depth);
00695     Mat dst = _dst.getMat();
00696     mixChannels(&src, 1, &dst, 1, ch, 1);
00697 }
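
// A minimal sketch for pulling out one plane, assuming a CV_8UC3 input;
// as the code above shows, this is mixChannels() with the single pair
// {coi, 0} (getGreenSketch is a hypothetical name):

static void getGreenSketch(const cv::Mat& bgr, cv::Mat& green)
{
    cv::extractChannel(bgr, green, 1);   // coi=1 selects the G plane
}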
00698 
00699 void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
00700 {
00701     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
00702     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
00703     CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
00704     CV_Assert( 0 <= coi && coi < dcn && scn == 1 );
00705 
00706     int ch[] = { 0, coi };
00707     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
00708     {
00709         UMat  src = _src.getUMat(), dst = _dst.getUMat();
00710         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
00711         return;
00712     }
00713 
00714     Mat src = _src.getMat(), dst = _dst.getMat();
00715     mixChannels(&src, 1, &dst, 1, ch, 1);
00716 }
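
// The inverse sketch, assuming green is CV_8UC1 and bgr an already
// allocated CV_8UC3 of the same size; here the single pair is {0, coi}
// (putGreenSketch is a hypothetical name):

static void putGreenSketch(const cv::Mat& green, cv::Mat& bgr)
{
    cv::insertChannel(green, bgr, 1);    // overwrites the G plane in place
}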
00717 
00718 /****************************************************************************************\
00719 *                                convertScale[Abs]                                       *
00720 \****************************************************************************************/
00721 
00722 namespace cv
00723 {
00724 
00725 template<typename T, typename DT, typename WT>
00726 struct cvtScaleAbs_SIMD
00727 {
00728     int operator () (const T *, DT *, int, WT, WT) const
00729     {
00730         return 0;
00731     }
00732 };
00733 
00734 #if CV_SSE2
00735 
00736 template <>
00737 struct cvtScaleAbs_SIMD<uchar, uchar, float>
00738 {
00739     int operator () (const uchar * src, uchar * dst, int width,
00740                      float scale, float shift) const
00741     {
00742         int x = 0;
00743 
00744         if (USE_SSE2)
00745         {
00746             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00747                 v_zero_f = _mm_setzero_ps();
00748             __m128i v_zero_i = _mm_setzero_si128();
00749 
00750             for ( ; x <= width - 16; x += 16)
00751             {
00752                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00753                 __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
00754                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
00755                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00756                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
00757                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00758                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
00759                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
00760                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
00761                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
00762 
00763                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
00764                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
00765                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
00766             }
00767         }
00768 
00769         return x;
00770     }
00771 };
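
// One detail worth noting in the kernel above: SSE2 has no packed float
// absolute value, so |v| is computed lane-wise as max(0 - v, v) via the
// _mm_sub_ps/_mm_max_ps pair. A scalar sketch of the same identity
// (absViaMaxSketch is a hypothetical name):

static inline float absViaMaxSketch(float v)
{
    return std::max(0.0f - v, v);        // equals |v| for finite v
}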
00772 
00773 template <>
00774 struct cvtScaleAbs_SIMD<schar, uchar, float>
00775 {
00776     int operator () (const schar * src, uchar * dst, int width,
00777                      float scale, float shift) const
00778     {
00779         int x = 0;
00780 
00781         if (USE_SSE2)
00782         {
00783             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00784                 v_zero_f = _mm_setzero_ps();
00785             __m128i v_zero_i = _mm_setzero_si128();
00786 
00787             for ( ; x <= width - 16; x += 16)
00788             {
00789                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00790                 __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
00791                         v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
00792                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00793                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
00794                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00795                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00796                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
00797                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00798                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00799                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
00800                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
00801                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00802                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
00803                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
00804 
00805                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
00806                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
00807                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
00808             }
00809         }
00810 
00811         return x;
00812     }
00813 };
00814 
00815 template <>
00816 struct cvtScaleAbs_SIMD<ushort, uchar, float>
00817 {
00818     int operator () (const ushort * src, uchar * dst, int width,
00819                      float scale, float shift) const
00820     {
00821         int x = 0;
00822 
00823         if (USE_SSE2)
00824         {
00825             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00826                 v_zero_f = _mm_setzero_ps();
00827             __m128i v_zero_i = _mm_setzero_si128();
00828 
00829             for ( ; x <= width - 8; x += 8)
00830             {
00831                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00832                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
00833                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00834                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
00835                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00836 
00837                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
00838                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
00839             }
00840         }
00841 
00842         return x;
00843     }
00844 };
00845 
00846 template <>
00847 struct cvtScaleAbs_SIMD<short, uchar, float>
00848 {
00849     int operator () (const short * src, uchar * dst, int width,
00850                      float scale, float shift) const
00851     {
00852         int x = 0;
00853 
00854         if (USE_SSE2)
00855         {
00856             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00857                 v_zero_f = _mm_setzero_ps();
00858             __m128i v_zero_i = _mm_setzero_si128();
00859 
00860             for ( ; x <= width - 8; x += 8)
00861             {
00862                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00863                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
00864                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00865                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
00866                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00867 
00868                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
00869                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
00870             }
00871         }
00872 
00873         return x;
00874     }
00875 };
00876 
00877 template <>
00878 struct cvtScaleAbs_SIMD<int, uchar, float>
00879 {
00880     int operator () (const int * src, uchar * dst, int width,
00881                      float scale, float shift) const
00882     {
00883         int x = 0;
00884 
00885         if (USE_SSE2)
00886         {
00887             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00888                 v_zero_f = _mm_setzero_ps();
00889             __m128i v_zero_i = _mm_setzero_si128();
00890 
00891             for ( ; x <= width - 8; x += 4)
00892             {
00893                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00894                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
00895                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00896 
00897                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
00898                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
00899             }
00900         }
00901 
00902         return x;
00903     }
00904 };
00905 
00906 template <>
00907 struct cvtScaleAbs_SIMD<float, uchar, float>
00908 {
00909     int operator () (const float * src, uchar * dst, int width,
00910                      float scale, float shift) const
00911     {
00912         int x = 0;
00913 
00914         if (USE_SSE2)
00915         {
00916             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00917                 v_zero_f = _mm_setzero_ps();
00918             __m128i v_zero_i = _mm_setzero_si128();
00919 
00920             for ( ; x <= width - 8; x += 4)
00921             {
00922                 __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
00923                 v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
00924 
00925                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
00926                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
00927             }
00928         }
00929 
00930         return x;
00931     }
00932 };
00933 
00934 template <>
00935 struct cvtScaleAbs_SIMD<double, uchar, float>
00936 {
00937     int operator () (const double * src, uchar * dst, int width,
00938                      float scale, float shift) const
00939     {
00940         int x = 0;
00941 
00942         if (USE_SSE2)
00943         {
00944             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00945                 v_zero_f = _mm_setzero_ps();
00946             __m128i v_zero_i = _mm_setzero_si128();
00947 
00948             for ( ; x <= width - 8; x += 8)
00949             {
00950                 __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
00951                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
00952                 __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
00953                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
00954 
00955                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
00956                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00957 
00958                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
00959                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00960 
00961                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
00962                                                   _mm_cvtps_epi32(v_dst2));
00963 
00964                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
00965             }
00966         }
00967 
00968         return x;
00969     }
00970 };
00971 
00972 #elif CV_NEON
00973 
00974 template <>
00975 struct cvtScaleAbs_SIMD<uchar, uchar, float>
00976 {
00977     int operator () (const uchar * src, uchar * dst, int width,
00978                      float scale, float shift) const
00979     {
00980         int x = 0;
00981         float32x4_t v_shift = vdupq_n_f32(shift);
00982 
00983         for ( ; x <= width - 16; x += 16)
00984         {
00985             uint8x16_t v_src = vld1q_u8(src + x);
00986             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
00987 
00988             uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half));
00989             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
00990             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
00991 
00992             v_quat = vmovl_u16(vget_high_u16(v_half));
00993             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
00994             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
00995 
00996             v_half = vmovl_u8(vget_high_u8(v_src));
00997 
00998             v_quat = vmovl_u16(vget_low_u16(v_half));
00999             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
01000             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
01001 
01002             v_quat = vmovl_u16(vget_high_u16(v_half));
01003             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
01004             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
01005 
01006             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01007                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01008             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
01009                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
01010 
01011             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
01012         }
01013 
01014         return x;
01015     }
01016 };
01017 
01018 template <>
01019 struct cvtScaleAbs_SIMD<schar, uchar, float>
01020 {
01021     int operator () (const schar * src, uchar * dst, int width,
01022                      float scale, float shift) const
01023     {
01024         int x = 0;
01025         float32x4_t v_shift = vdupq_n_f32(shift);
01026 
01027         for ( ; x <= width - 16; x += 16)
01028         {
01029             int8x16_t v_src = vld1q_s8(src + x);
01030             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
01031 
01032             int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half));
01033             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01034             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01035 
01036             v_quat = vmovl_s16(vget_high_s16(v_half));
01037             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01038             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01039 
01040             v_half = vmovl_s8(vget_high_s8(v_src));
01041 
01042             v_quat = vmovl_s16(vget_low_s16(v_half));
01043             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01044             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
01045 
01046             v_quat = vmovl_s16(vget_high_s16(v_half));
01047             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01048             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
01049 
01050             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01051                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01052             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
01053                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
01054 
01055             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
01056         }
01057 
01058         return x;
01059     }
01060 };
01061 
01062 template <>
01063 struct cvtScaleAbs_SIMD<ushort, uchar, float>
01064 {
01065     int operator () (const ushort * src, uchar * dst, int width,
01066                      float scale, float shift) const
01067     {
01068         int x = 0;
01069         float32x4_t v_shift = vdupq_n_f32(shift);
01070 
01071         for ( ; x <= width - 8; x += 8)
01072         {
01073             uint16x8_t v_src = vld1q_u16(src + x);
01074 
01075             uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src));
01076             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
01077             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01078 
01079             v_half = vmovl_u16(vget_high_u16(v_src));
01080             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
01081             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01082 
01083             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01084                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01085 
01086             vst1_u8(dst + x, vqmovn_u16(v_dst));
01087         }
01088 
01089         return x;
01090     }
01091 };
01092 
01093 template <>
01094 struct cvtScaleAbs_SIMD<short, uchar, float>
01095 {
01096     int operator () (const short * src, uchar * dst, int width,
01097                      float scale, float shift) const
01098     {
01099         int x = 0;
01100         float32x4_t v_shift = vdupq_n_f32(shift);
01101 
01102         for ( ; x <= width - 8; x += 8)
01103         {
01104             int16x8_t v_src = vld1q_s16(src + x);
01105 
01106             int32x4_t v_half = vmovl_s16(vget_low_s16(v_src));
01107             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
01108             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01109 
01110             v_half = vmovl_s16(vget_high_s16(v_src));
01111             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
01112             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01113 
01114             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01115                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01116 
01117             vst1_u8(dst + x, vqmovn_u16(v_dst));
01118         }
01119 
01120         return x;
01121     }
01122 };
01123 
01124 template <>
01125 struct cvtScaleAbs_SIMD<int, uchar, float>
01126 {
01127     int operator () (const int * src, uchar * dst, int width,
01128                      float scale, float shift) const
01129     {
01130         int x = 0;
01131         float32x4_t v_shift = vdupq_n_f32(shift);
01132 
01133         for ( ; x <= width - 8; x += 8)
01134         {
01135             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale);
01136             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01137             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
01138 
01139             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale);
01140             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01141             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
01142 
01143             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
01144             vst1_u8(dst + x, vqmovn_u16(v_dst));
01145         }
01146 
01147         return x;
01148     }
01149 };
01150 
01151 template <>
01152 struct cvtScaleAbs_SIMD<float, uchar, float>
01153 {
01154     int operator () (const float * src, uchar * dst, int width,
01155                      float scale, float shift) const
01156     {
01157         int x = 0;
01158         float32x4_t v_shift = vdupq_n_f32(shift);
01159 
01160         for ( ; x <= width - 8; x += 8)
01161         {
01162             float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
01163             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01164             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
01165 
01166             float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
01167             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01168             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
01169 
01170             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
01171             vst1_u8(dst + x, vqmovn_u16(v_dst));
01172         }
01173 
01174         return x;
01175     }
01176 };
01177 
01178 #endif
01179 
01180 template<typename T, typename DT, typename WT> static void
01181 cvtScaleAbs_( const T* src, size_t sstep,
01182               DT* dst, size_t dstep, Size size,
01183               WT scale, WT shift )
01184 {
01185     sstep /= sizeof(src[0]);
01186     dstep /= sizeof(dst[0]);
01187     cvtScaleAbs_SIMD<T, DT, WT> vop;
01188 
01189     for( ; size.height--; src += sstep, dst += dstep )
01190     {
01191         int x = vop(src, dst, size.width, scale, shift);
01192 
01193         #if CV_ENABLE_UNROLLED
01194         for( ; x <= size.width - 4; x += 4 )
01195         {
01196             DT t0, t1;
01197             t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift));
01198             t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
01199             dst[x] = t0; dst[x+1] = t1;
01200             t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
01201             t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
01202             dst[x+2] = t0; dst[x+3] = t1;
01203         }
01204         #endif
01205         for( ; x < size.width; x++ )
01206             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
01207     }
01208 }
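
// cvtScaleAbs_ is the per-type worker behind the public convertScaleAbs():
// each element becomes saturate_cast<DT>(|src*scale + shift|), with the
// SIMD functor covering the bulk of each row and the scalar loops the
// tail. A usage sketch, assuming a CV_16S gradient such as a Sobel
// response (toDisplayableSketch is a hypothetical name):

static void toDisplayableSketch(const cv::Mat& grad16s, cv::Mat& grad8u)
{
    cv::convertScaleAbs(grad16s, grad8u, 0.5, 0.0);   // |0.5*src|, saturated to CV_8U
}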
01209 
01210 template <typename T, typename DT, typename WT>
01211 struct cvtScale_SIMD
01212 {
01213     int operator () (const T *, DT *, int, WT, WT) const
01214     {
01215         return 0;
01216     }
01217 };
01218 
01219 #if CV_SSE2
01220 
01221 // from uchar
01222 
01223 template <>
01224 struct cvtScale_SIMD<uchar, uchar, float>
01225 {
01226     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
01227     {
01228         int x = 0;
01229 
01230         if (!USE_SSE2)
01231             return x;
01232 
01233         __m128i v_zero = _mm_setzero_si128();
01234         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01235 
01236         for ( ; x <= width - 8; x += 8)
01237         {
01238             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01239             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01240             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01241 
01242             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01243             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01244 
01245             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01246                                             _mm_cvtps_epi32(v_dst_1));
01247             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01248         }
01249 
01250         return x;
01251     }
01252 };
01253 
01254 template <>
01255 struct cvtScale_SIMD<uchar, schar, float>
01256 {
01257     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
01258     {
01259         int x = 0;
01260 
01261         if (!USE_SSE2)
01262             return x;
01263 
01264         __m128i v_zero = _mm_setzero_si128();
01265         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01266 
01267         for ( ; x <= width - 8; x += 8)
01268         {
01269             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01270             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01271             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01272 
01273             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01274             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01275 
01276             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01277                                             _mm_cvtps_epi32(v_dst_1));
01278             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01279         }
01280 
01281         return x;
01282     }
01283 };
01284 
01285 #if CV_SSE4_1
01286 
01287 template <>
01288 struct cvtScale_SIMD<uchar, ushort, float>
01289 {
01290     cvtScale_SIMD()
01291     {
01292         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01293     }
01294 
01295     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
01296     {
01297         int x = 0;
01298 
01299         if (!haveSSE)
01300             return x;
01301 
01302         __m128i v_zero = _mm_setzero_si128();
01303         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01304 
01305         for ( ; x <= width - 8; x += 8)
01306         {
01307             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01308             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01309             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01310 
01311             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01312             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01313 
01314             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
01315                                              _mm_cvtps_epi32(v_dst_1));
01316             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01317         }
01318 
01319         return x;
01320     }
01321 
01322     bool haveSSE;
01323 };
01324 
01325 #endif
01326 
01327 template <>
01328 struct cvtScale_SIMD<uchar, short, float>
01329 {
01330     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
01331     {
01332         int x = 0;
01333 
01334         if (!USE_SSE2)
01335             return x;
01336 
01337         __m128i v_zero = _mm_setzero_si128();
01338         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01339 
01340         for ( ; x <= width - 8; x += 8)
01341         {
01342             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01343             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01344             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01345 
01346             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01347             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01348 
01349             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01350                                             _mm_cvtps_epi32(v_dst_1));
01351             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01352         }
01353 
01354         return x;
01355     }
01356 };
01357 
01358 template <>
01359 struct cvtScale_SIMD<uchar, int, float>
01360 {
01361     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
01362     {
01363         int x = 0;
01364 
01365         if (!USE_SSE2)
01366             return x;
01367 
01368         __m128i v_zero = _mm_setzero_si128();
01369         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01370 
01371         for ( ; x <= width - 8; x += 8)
01372         {
01373             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01374             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01375             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01376 
01377             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01378             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01379 
01380             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
01381             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
01382         }
01383 
01384         return x;
01385     }
01386 };
01387 
01388 template <>
01389 struct cvtScale_SIMD<uchar, float, float>
01390 {
01391     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
01392     {
01393         int x = 0;
01394 
01395         if (!USE_SSE2)
01396             return x;
01397 
01398         __m128i v_zero = _mm_setzero_si128();
01399         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01400 
01401         for ( ; x <= width - 8; x += 8)
01402         {
01403             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01404             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01405             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01406 
01407             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01408             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01409 
01410             _mm_storeu_ps(dst + x, v_dst_0);
01411             _mm_storeu_ps(dst + x + 4, v_dst_1);
01412         }
01413 
01414         return x;
01415     }
01416 };
01417 
01418 template <>
01419 struct cvtScale_SIMD<uchar, double, double>
01420 {
01421     int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
01422     {
01423         int x = 0;
01424 
01425         if (!USE_SSE2)
01426             return x;
01427 
01428         __m128i v_zero = _mm_setzero_si128();
01429         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
01430 
01431         for ( ; x <= width - 8; x += 8)
01432         {
01433             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01434 
01435             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
01436             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01437             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01438             _mm_storeu_pd(dst + x, v_dst_0);
01439             _mm_storeu_pd(dst + x + 2, v_dst_1);
01440 
01441             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
01442             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01443             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01444             _mm_storeu_pd(dst + x + 4, v_dst_0);
01445             _mm_storeu_pd(dst + x + 6, v_dst_1);
01446         }
01447 
01448         return x;
01449     }
01450 };
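// With a double working type an SSE2 register holds only two lanes, so each
// 8-pixel iteration above performs four 2-lane multiply-adds: _mm_cvtepi32_pd
// converts the low two 32-bit ints of a register, and _mm_srli_si128(v, 8)
// exposes the high pair for the second conversion.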
01451 
01452 // from schar
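// SSE2 lacks a sign-extending byte conversion, so these variants synthesize
// one: _mm_unpacklo_epi8(v_zero, v_src) puts each byte into the high half of a
// 16-bit lane and _mm_srai_epi16(..., 8) shifts it back arithmetically,
// propagating the sign bit. The same trick widens 16 to 32 bits:
// _mm_unpacklo_epi16(v_zero, v_src) followed by _mm_srai_epi32(..., 16).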
01453 
01454 template <>
01455 struct cvtScale_SIMD<schar, uchar, float>
01456 {
01457     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
01458     {
01459         int x = 0;
01460 
01461         if (!USE_SSE2)
01462             return x;
01463 
01464         __m128i v_zero = _mm_setzero_si128();
01465         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01466 
01467         for ( ; x <= width - 8; x += 8)
01468         {
01469             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01470             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01471             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01472 
01473             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01474             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01475 
01476             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01477                                             _mm_cvtps_epi32(v_dst_1));
01478             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01479         }
01480 
01481         return x;
01482     }
01483 };
01484 
01485 template <>
01486 struct cvtScale_SIMD<schar, schar, float>
01487 {
01488     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
01489     {
01490         int x = 0;
01491 
01492         if (!USE_SSE2)
01493             return x;
01494 
01495         __m128i v_zero = _mm_setzero_si128();
01496         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01497 
01498         for ( ; x <= width - 8; x += 8)
01499         {
01500             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01501             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01502             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01503 
01504             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01505             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01506 
01507             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01508                                             _mm_cvtps_epi32(v_dst_1));
01509             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01510         }
01511 
01512         return x;
01513     }
01514 };
01515 
01516 #if CV_SSE4_1
01517 
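// _mm_packus_epi32 (saturating signed 32-bit to unsigned 16-bit pack) only
// exists from SSE4.1 on, so every specialization with a ushort destination is
// guarded: the constructor checks checkHardwareSupport(CV_CPU_SSE4_1) once and
// operator() falls back to the scalar path when the instruction is missing.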
01518 template <>
01519 struct cvtScale_SIMD<schar, ushort, float>
01520 {
01521     cvtScale_SIMD()
01522     {
01523         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01524     }
01525 
01526     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
01527     {
01528         int x = 0;
01529 
01530         if (!haveSSE)
01531             return x;
01532 
01533         __m128i v_zero = _mm_setzero_si128();
01534         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01535 
01536         for ( ; x <= width - 8; x += 8)
01537         {
01538             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01539             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01540             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01541 
01542             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01543             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01544 
01545             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
01546                                              _mm_cvtps_epi32(v_dst_1));
01547             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01548         }
01549 
01550         return x;
01551     }
01552 
01553     bool haveSSE;
01554 };
01555 
01556 #endif
01557 
01558 template <>
01559 struct cvtScale_SIMD<schar, short, float>
01560 {
01561     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
01562     {
01563         int x = 0;
01564 
01565         if (!USE_SSE2)
01566             return x;
01567 
01568         __m128i v_zero = _mm_setzero_si128();
01569         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01570 
01571         for ( ; x <= width - 8; x += 8)
01572         {
01573             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01574             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01575             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01576 
01577             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01578             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01579 
01580             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01581                                             _mm_cvtps_epi32(v_dst_1));
01582             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01583         }
01584 
01585         return x;
01586     }
01587 };
01588 
01589 template <>
01590 struct cvtScale_SIMD<schar, int, float>
01591 {
01592     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
01593     {
01594         int x = 0;
01595 
01596         if (!USE_SSE2)
01597             return x;
01598 
01599         __m128i v_zero = _mm_setzero_si128();
01600         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01601 
01602         for ( ; x <= width - 8; x += 8)
01603         {
01604             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01605             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01606             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01607 
01608             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01609             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01610 
01611             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
01612             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
01613         }
01614 
01615         return x;
01616     }
01617 };
01618 
01619 template <>
01620 struct cvtScale_SIMD<schar, float, float>
01621 {
01622     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
01623     {
01624         int x = 0;
01625 
01626         if (!USE_SSE2)
01627             return x;
01628 
01629         __m128i v_zero = _mm_setzero_si128();
01630         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01631 
01632         for ( ; x <= width - 8; x += 8)
01633         {
01634             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01635             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01636             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01637 
01638             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01639             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01640 
01641             _mm_storeu_ps(dst + x, v_dst_0);
01642             _mm_storeu_ps(dst + x + 4, v_dst_1);
01643         }
01644 
01645         return x;
01646     }
01647 };
01648 
01649 template <>
01650 struct cvtScale_SIMD<schar, double, double>
01651 {
01652     int operator () (const schar * src, double * dst, int width, double scale, double shift) const
01653     {
01654         int x = 0;
01655 
01656         if (!USE_SSE2)
01657             return x;
01658 
01659         __m128i v_zero = _mm_setzero_si128();
01660         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
01661 
01662         for ( ; x <= width - 8; x += 8)
01663         {
01664             __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x)));
01665             v_src = _mm_srai_epi16(v_src, 8);
01666 
01667             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
01668             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01669             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01670             _mm_storeu_pd(dst + x, v_dst_0);
01671             _mm_storeu_pd(dst + x + 2, v_dst_1);
01672 
01673             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
01674             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01675             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01676             _mm_storeu_pd(dst + x + 4, v_dst_0);
01677             _mm_storeu_pd(dst + x + 6, v_dst_1);
01678         }
01679 
01680         return x;
01681     }
01682 };
01683 
01684 // from ushort
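// ushort lanes are zero-extended: interleaving with v_zero through
// _mm_unpacklo_epi16/_mm_unpackhi_epi16 yields non-negative 32-bit ints that
// feed straight into _mm_cvtepi32_ps.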
01685 
01686 template <>
01687 struct cvtScale_SIMD<ushort, uchar, float>
01688 {
01689     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
01690     {
01691         int x = 0;
01692 
01693         if (!USE_SSE2)
01694             return x;
01695 
01696         __m128i v_zero = _mm_setzero_si128();
01697         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01698 
01699         for ( ; x <= width - 8; x += 8)
01700         {
01701             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01702             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01703             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01704 
01705             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01706             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01707 
01708             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01709                                             _mm_cvtps_epi32(v_dst_1));
01710             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01711         }
01712 
01713         return x;
01714     }
01715 };
01716 
01717 template <>
01718 struct cvtScale_SIMD<ushort, schar, float>
01719 {
01720     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
01721     {
01722         int x = 0;
01723 
01724         if (!USE_SSE2)
01725             return x;
01726 
01727         __m128i v_zero = _mm_setzero_si128();
01728         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01729 
01730         for ( ; x <= width - 8; x += 8)
01731         {
01732             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01733             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01734             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01735 
01736             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01737             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01738 
01739             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01740                                             _mm_cvtps_epi32(v_dst_1));
01741             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01742         }
01743 
01744         return x;
01745     }
01746 };
01747 
01748 #if CV_SSE4_1
01749 
01750 template <>
01751 struct cvtScale_SIMD<ushort, ushort, float>
01752 {
01753     cvtScale_SIMD()
01754     {
01755         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01756     }
01757 
01758     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
01759     {
01760         int x = 0;
01761 
01762         if (!haveSSE)
01763             return x;
01764 
01765         __m128i v_zero = _mm_setzero_si128();
01766         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01767 
01768         for ( ; x <= width - 8; x += 8)
01769         {
01770             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01771             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01772             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01773 
01774             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01775             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01776 
01777             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
01778                                              _mm_cvtps_epi32(v_dst_1));
01779             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01780         }
01781 
01782         return x;
01783     }
01784 
01785     bool haveSSE;
01786 };
01787 
01788 #endif
01789 
01790 template <>
01791 struct cvtScale_SIMD<ushort, short, float>
01792 {
01793     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
01794     {
01795         int x = 0;
01796 
01797         if (!USE_SSE2)
01798             return x;
01799 
01800         __m128i v_zero = _mm_setzero_si128();
01801         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01802 
01803         for ( ; x <= width - 8; x += 8)
01804         {
01805             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01806             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01807             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01808 
01809             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01810             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01811 
01812             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01813                                             _mm_cvtps_epi32(v_dst_1));
01814             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01815         }
01816 
01817         return x;
01818     }
01819 };
01820 
01821 template <>
01822 struct cvtScale_SIMD<ushort, int, float>
01823 {
01824     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
01825     {
01826         int x = 0;
01827 
01828         if (!USE_SSE2)
01829             return x;
01830 
01831         __m128i v_zero = _mm_setzero_si128();
01832         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01833 
01834         for ( ; x <= width - 8; x += 8)
01835         {
01836             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01837             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01838             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01839 
01840             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01841             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01842 
01843             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
01844             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
01845         }
01846 
01847         return x;
01848     }
01849 };
01850 
01851 template <>
01852 struct cvtScale_SIMD<ushort, float, float>
01853 {
01854     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
01855     {
01856         int x = 0;
01857 
01858         if (!USE_SSE2)
01859             return x;
01860 
01861         __m128i v_zero = _mm_setzero_si128();
01862         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01863 
01864         for ( ; x <= width - 8; x += 8)
01865         {
01866             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01867             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01868             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01869 
01870             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01871             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01872 
01873             _mm_storeu_ps(dst + x, v_dst_0);
01874             _mm_storeu_ps(dst + x + 4, v_dst_1);
01875         }
01876 
01877         return x;
01878     }
01879 };
01880 
01881 template <>
01882 struct cvtScale_SIMD<ushort, double, double>
01883 {
01884     int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
01885     {
01886         int x = 0;
01887 
01888         if (!USE_SSE2)
01889             return x;
01890 
01891         __m128i v_zero = _mm_setzero_si128();
01892         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
01893 
01894         for ( ; x <= width - 8; x += 8)
01895         {
01896             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01897 
01898             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
01899             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01900             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01901             _mm_storeu_pd(dst + x, v_dst_0);
01902             _mm_storeu_pd(dst + x + 2, v_dst_1);
01903 
01904             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
01905             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01906             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01907             _mm_storeu_pd(dst + x + 4, v_dst_0);
01908             _mm_storeu_pd(dst + x + 6, v_dst_1);
01909         }
01910 
01911         return x;
01912     }
01913 };
01914 
01915 // from short
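// short reuses the arithmetic-shift widening shown for schar above: interleave
// into the high half with _mm_unpacklo_epi16/_mm_unpackhi_epi16(v_zero, v_src),
// then _mm_srai_epi32(..., 16) restores the sign-extended value.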
01916 
01917 template <>
01918 struct cvtScale_SIMD<short, uchar, float>
01919 {
01920     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
01921     {
01922         int x = 0;
01923 
01924         if (!USE_SSE2)
01925             return x;
01926 
01927         __m128i v_zero = _mm_setzero_si128();
01928         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01929 
01930         for ( ; x <= width - 8; x += 8)
01931         {
01932             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01933             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01934             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01935 
01936             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01937             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01938 
01939             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01940                                             _mm_cvtps_epi32(v_dst_1));
01941             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01942         }
01943 
01944         return x;
01945     }
01946 };
01947 
01948 template <>
01949 struct cvtScale_SIMD<short, schar, float>
01950 {
01951     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
01952     {
01953         int x = 0;
01954 
01955         if (!USE_SSE2)
01956             return x;
01957 
01958         __m128i v_zero = _mm_setzero_si128();
01959         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01960 
01961         for ( ; x <= width - 8; x += 8)
01962         {
01963             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01964             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01965             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01966 
01967             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01968             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01969 
01970             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01971                                             _mm_cvtps_epi32(v_dst_1));
01972             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01973         }
01974 
01975         return x;
01976     }
01977 };
01978 
01979 #if CV_SSE4_1
01980 
01981 template <>
01982 struct cvtScale_SIMD<short, ushort, float>
01983 {
01984     cvtScale_SIMD()
01985     {
01986         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01987     }
01988 
01989     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
01990     {
01991         int x = 0;
01992 
01993         if (!haveSSE)
01994             return x;
01995 
01996         __m128i v_zero = _mm_setzero_si128();
01997         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01998 
01999         for ( ; x <= width - 8; x += 8)
02000         {
02001             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02002             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02003             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02004 
02005             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02006             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02007 
02008             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02009                                              _mm_cvtps_epi32(v_dst_1));
02010             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02011         }
02012 
02013         return x;
02014     }
02015 
02016     bool haveSSE;
02017 };
02018 
02019 #endif
02020 
02021 template <>
02022 struct cvtScale_SIMD<short, short, float>
02023 {
02024     int operator () (const short * src, short * dst, int width, float scale, float shift) const
02025     {
02026         int x = 0;
02027 
02028         if (!USE_SSE2)
02029             return x;
02030 
02031         __m128i v_zero = _mm_setzero_si128();
02032         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02033 
02034         for ( ; x <= width - 8; x += 8)
02035         {
02036             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02037             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02038             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02039 
02040             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02041             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02042 
02043             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02044                                             _mm_cvtps_epi32(v_dst_1));
02045             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02046         }
02047 
02048         return x;
02049     }
02050 };
02051 
02052 template <>
02053 struct cvtScale_SIMD<short, int, float>
02054 {
02055     int operator () (const short * src, int * dst, int width, float scale, float shift) const
02056     {
02057         int x = 0;
02058 
02059         if (!USE_SSE2)
02060             return x;
02061 
02062         __m128i v_zero = _mm_setzero_si128();
02063         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02064 
02065         for ( ; x <= width - 8; x += 8)
02066         {
02067             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02068             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02069             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02070 
02071             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02072             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02073 
02074             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
02075             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
02076         }
02077 
02078         return x;
02079     }
02080 };
02081 
02082 template <>
02083 struct cvtScale_SIMD<short, float, float>
02084 {
02085     int operator () (const short * src, float * dst, int width, float scale, float shift) const
02086     {
02087         int x = 0;
02088 
02089         if (!USE_SSE2)
02090             return x;
02091 
02092         __m128i v_zero = _mm_setzero_si128();
02093         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02094 
02095         for ( ; x <= width - 8; x += 8)
02096         {
02097             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02098             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02099             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02100 
02101             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02102             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02103 
02104             _mm_storeu_ps(dst + x, v_dst_0);
02105             _mm_storeu_ps(dst + x + 4, v_dst_1);
02106         }
02107 
02108         return x;
02109     }
02110 };
02111 
02112 template <>
02113 struct cvtScale_SIMD<short, double, double>
02114 {
02115     int operator () (const short * src, double * dst, int width, double scale, double shift) const
02116     {
02117         int x = 0;
02118 
02119         if (!USE_SSE2)
02120             return x;
02121 
02122         __m128i v_zero = _mm_setzero_si128();
02123         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02124 
02125         for ( ; x <= width - 8; x += 8)
02126         {
02127             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02128 
02129             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
02130             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
02131             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
02132             _mm_storeu_pd(dst + x, v_dst_0);
02133             _mm_storeu_pd(dst + x + 2, v_dst_1);
02134 
02135             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
02136             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
02137             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
02138             _mm_storeu_pd(dst + x + 4, v_dst_0);
02139             _mm_storeu_pd(dst + x + 6, v_dst_1);
02140         }
02141 
02142         return x;
02143     }
02144 };
02145 
02146 // from int
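// int sources need no widening; _mm_cvtepi32_ps converts a full register
// directly. The int-to-int, int-to-float and int-to-double variants take
// double scale and shift instead of float, evidently so the working type can
// represent every 32-bit input exactly, at the cost of two lanes per register.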
02147 
02148 template <>
02149 struct cvtScale_SIMD<int, uchar, float>
02150 {
02151     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
02152     {
02153         int x = 0;
02154 
02155         if (!USE_SSE2)
02156             return x;
02157 
02158         __m128i v_zero = _mm_setzero_si128();
02159         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02160 
02161         for ( ; x <= width - 8; x += 8)
02162         {
02163             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02164             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02165 
02166             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02167             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02168 
02169             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02170                                             _mm_cvtps_epi32(v_dst_1));
02171             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
02172         }
02173 
02174         return x;
02175     }
02176 };
02177 
02178 template <>
02179 struct cvtScale_SIMD<int, schar, float>
02180 {
02181     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
02182     {
02183         int x = 0;
02184 
02185         if (!USE_SSE2)
02186             return x;
02187 
02188         __m128i v_zero = _mm_setzero_si128();
02189         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02190 
02191         for ( ; x <= width - 8; x += 8)
02192         {
02193             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02194             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02195 
02196             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02197             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02198 
02199             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02200                                             _mm_cvtps_epi32(v_dst_1));
02201             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
02202         }
02203 
02204         return x;
02205     }
02206 };
02207 
02208 #if CV_SSE4_1
02209 
02210 template <>
02211 struct cvtScale_SIMD<int, ushort, float>
02212 {
02213     cvtScale_SIMD()
02214     {
02215         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
02216     }
02217 
02218     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
02219     {
02220         int x = 0;
02221 
02222         if (!haveSSE)
02223             return x;
02224 
02225         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02226 
02227         for ( ; x <= width - 8; x += 8)
02228         {
02229             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02230             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02231 
02232             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02233             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02234 
02235             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02236                                              _mm_cvtps_epi32(v_dst_1));
02237             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02238         }
02239 
02240         return x;
02241     }
02242 
02243     bool haveSSE;
02244 };
02245 
02246 #endif
02247 
02248 template <>
02249 struct cvtScale_SIMD<int, short, float>
02250 {
02251     int operator () (const int * src, short * dst, int width, float scale, float shift) const
02252     {
02253         int x = 0;
02254 
02255         if (!USE_SSE2)
02256             return x;
02257 
02258         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02259 
02260         for ( ; x <= width - 8; x += 8)
02261         {
02262             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02263             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02264 
02265             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02266             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02267 
02268             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02269                                             _mm_cvtps_epi32(v_dst_1));
02270             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02271         }
02272 
02273         return x;
02274     }
02275 };
02276 
02277 template <>
02278 struct cvtScale_SIMD<int, int, double>
02279 {
02280     int operator () (const int * src, int * dst, int width, double scale, double shift) const
02281     {
02282         int x = 0;
02283 
02284         if (!USE_SSE2)
02285             return x;
02286 
02287         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02288 
02289         for ( ; x <= width - 4; x += 4)
02290         {
02291             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02292             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02293 
02294             v_src = _mm_srli_si128(v_src, 8);
02295             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02296 
02297             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)),
02298                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1)));
02299 
02300             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
02301         }
02302 
02303         return x;
02304     }
02305 };
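// _mm_cvtpd_epi32 leaves its two results in the low 64 bits of the register,
// so the two halves are glued into one full vector with _mm_movelh_ps plus
// bit-preserving casts before the store.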
02306 
02307 template <>
02308 struct cvtScale_SIMD<int, float, double>
02309 {
02310     int operator () (const int * src, float * dst, int width, double scale, double shift) const
02311     {
02312         int x = 0;
02313 
02314         if (!USE_SSE2)
02315             return x;
02316 
02317         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02318 
02319         for ( ; x <= width - 4; x += 4)
02320         {
02321             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02322             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02323 
02324             v_src = _mm_srli_si128(v_src, 8);
02325             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02326 
02327             _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0),
02328                                                  _mm_cvtpd_ps(v_dst_1)));
02329         }
02330 
02331         return x;
02332     }
02333 };
02334 
02335 template <>
02336 struct cvtScale_SIMD<int, double, double>
02337 {
02338     int operator () (const int * src, double * dst, int width, double scale, double shift) const
02339     {
02340         int x = 0;
02341 
02342         if (!USE_SSE2)
02343             return x;
02344 
02345         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02346 
02347         for ( ; x <= width - 4; x += 4)
02348         {
02349             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02350             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02351 
02352             v_src = _mm_srli_si128(v_src, 8);
02353             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02354 
02355             _mm_storeu_pd(dst + x, v_dst_0);
02356             _mm_storeu_pd(dst + x + 2, v_dst_1);
02357         }
02358 
02359         return x;
02360     }
02361 };
02362 
02363 // from float
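// float sources are already in the working type: one unaligned load per four
// elements, a multiply-add, and (where the destination is narrower) a rounding
// convert plus saturating pack.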
02364 
02365 template <>
02366 struct cvtScale_SIMD<float, uchar, float>
02367 {
02368     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
02369     {
02370         int x = 0;
02371 
02372         if (!USE_SSE2)
02373             return x;
02374 
02375         __m128i v_zero = _mm_setzero_si128();
02376         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02377 
02378         for ( ; x <= width - 8; x += 8)
02379         {
02380             __m128 v_src = _mm_loadu_ps(src + x);
02381             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02382 
02383             v_src = _mm_loadu_ps(src + x + 4);
02384             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02385 
02386             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02387                                             _mm_cvtps_epi32(v_dst_1));
02388             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
02389         }
02390 
02391         return x;
02392     }
02393 };
02394 
02395 template <>
02396 struct cvtScale_SIMD<float, schar, float>
02397 {
02398     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
02399     {
02400         int x = 0;
02401 
02402         if (!USE_SSE2)
02403             return x;
02404 
02405         __m128i v_zero = _mm_setzero_si128();
02406         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02407 
02408         for ( ; x <= width - 8; x += 8)
02409         {
02410             __m128 v_src = _mm_loadu_ps(src + x);
02411             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02412 
02413             v_src = _mm_loadu_ps(src + x + 4);
02414             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02415 
02416             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02417                                             _mm_cvtps_epi32(v_dst_1));
02418             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
02419         }
02420 
02421         return x;
02422     }
02423 };
02424 
02425 #if CV_SSE4_1
02426 
02427 template <>
02428 struct cvtScale_SIMD<float, ushort, float>
02429 {
02430     cvtScale_SIMD()
02431     {
02432         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
02433     }
02434 
02435     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
02436     {
02437         int x = 0;
02438 
02439         if (!haveSSE)
02440             return x;
02441 
02442         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02443 
02444         for ( ; x <= width - 8; x += 8)
02445         {
02446             __m128 v_src = _mm_loadu_ps(src + x);
02447             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02448 
02449             v_src = _mm_loadu_ps(src + x + 4);
02450             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02451 
02452             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02453                                              _mm_cvtps_epi32(v_dst_1));
02454             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02455         }
02456 
02457         return x;
02458     }
02459 
02460     bool haveSSE;
02461 };
02462 
02463 #endif
02464 
02465 template <>
02466 struct cvtScale_SIMD<float, short, float>
02467 {
02468     int operator () (const float * src, short * dst, int width, float scale, float shift) const
02469     {
02470         int x = 0;
02471 
02472         if (!USE_SSE2)
02473             return x;
02474 
02475         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02476 
02477         for ( ; x <= width - 8; x += 8)
02478         {
02479             __m128 v_src = _mm_loadu_ps(src + x);
02480             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02481 
02482             v_src = _mm_loadu_ps(src + x + 4);
02483             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02484 
02485             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02486                                             _mm_cvtps_epi32(v_dst_1));
02487             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02488         }
02489 
02490         return x;
02491     }
02492 };
02493 
02494 template <>
02495 struct cvtScale_SIMD<float, int, float>
02496 {
02497     int operator () (const float * src, int * dst, int width, float scale, float shift) const
02498     {
02499         int x = 0;
02500 
02501         if (!USE_SSE2)
02502             return x;
02503 
02504         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02505 
02506         for ( ; x <= width - 8; x += 8)
02507         {
02508             __m128 v_src = _mm_loadu_ps(src + x);
02509             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02510 
02511             v_src = _mm_loadu_ps(src + x + 4);
02512             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02513 
02514             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
02515             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
02516         }
02517 
02518         return x;
02519     }
02520 };
02521 
02522 template <>
02523 struct cvtScale_SIMD<float, float, float>
02524 {
02525     int operator () (const float * src, float * dst, int width, float scale, float shift) const
02526     {
02527         int x = 0;
02528 
02529         if (!USE_SSE2)
02530             return x;
02531 
02532         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02533 
02534         for ( ; x <= width - 4; x += 4)
02535         {
02536             __m128 v_src = _mm_loadu_ps(src + x);
02537             __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02538             _mm_storeu_ps(dst + x, v_dst);
02539         }
02540 
02541         return x;
02542     }
02543 };
02544 
02545 template <>
02546 struct cvtScale_SIMD<float, double, double>
02547 {
02548     int operator () (const float * src, double * dst, int width, double scale, double shift) const
02549     {
02550         int x = 0;
02551 
02552         if (!USE_SSE2)
02553             return x;
02554 
02555         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02556 
02557         for ( ; x <= width - 4; x += 4)
02558         {
02559             __m128 v_src = _mm_loadu_ps(src + x);
02560             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
02561             v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
02562             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
02563 
02564             _mm_storeu_pd(dst + x, v_dst_0);
02565             _mm_storeu_pd(dst + x + 2, v_dst_1);
02566         }
02567 
02568         return x;
02569     }
02570 };
02571 
02572 // from double
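// With a float working type, double inputs are narrowed first: two
// _mm_cvtpd_ps results (two floats each, left in the low halves) are combined
// by _mm_movelh_ps into one four-lane vector before scaling.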
02573 
02574 template <>
02575 struct cvtScale_SIMD<double, uchar, float>
02576 {
02577     int operator () (const double * src, uchar * dst, int width, float scale, float shift) const
02578     {
02579         int x = 0;
02580 
02581         if (!USE_SSE2)
02582             return x;
02583 
02584         __m128i v_zero = _mm_setzero_si128();
02585         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02586 
02587         for ( ; x <= width - 8; x += 8)
02588         {
02589             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02590                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02591             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02592 
02593             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02594                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02595             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02596 
02597             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02598                                             _mm_cvtps_epi32(v_dst_1));
02599             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
02600         }
02601 
02602         return x;
02603     }
02604 };
02605 
02606 template <>
02607 struct cvtScale_SIMD<double, schar, float>
02608 {
02609     int operator () (const double * src, schar * dst, int width, float scale, float shift) const
02610     {
02611         int x = 0;
02612 
02613         if (!USE_SSE2)
02614             return x;
02615 
02616         __m128i v_zero = _mm_setzero_si128();
02617         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02618 
02619         for ( ; x <= width - 8; x += 8)
02620         {
02621             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02622                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02623             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02624 
02625             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02626                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02627             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02628 
02629             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02630                                             _mm_cvtps_epi32(v_dst_1));
02631             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
02632         }
02633 
02634         return x;
02635     }
02636 };
02637 
02638 #if CV_SSE4_1
02639 
02640 template <>
02641 struct cvtScale_SIMD<double, ushort, float>
02642 {
02643     cvtScale_SIMD()
02644     {
02645         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
02646     }
02647 
02648     int operator () (const double * src, ushort * dst, int width, float scale, float shift) const
02649     {
02650         int x = 0;
02651 
02652         if (!haveSSE)
02653             return x;
02654 
02655         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02656 
02657         for ( ; x <= width - 8; x += 8)
02658         {
02659             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02660                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02661             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02662 
02663             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02664                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02665             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02666 
02667             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02668                                              _mm_cvtps_epi32(v_dst_1));
02669             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02670         }
02671 
02672         return x;
02673     }
02674 
02675     bool haveSSE;
02676 };
02677 
02678 #endif
02679 
02680 template <>
02681 struct cvtScale_SIMD<double, short, float>
02682 {
02683     int operator () (const double * src, short * dst, int width, float scale, float shift) const
02684     {
02685         int x = 0;
02686 
02687         if (!USE_SSE2)
02688             return x;
02689 
02690         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02691 
02692         for ( ; x <= width - 8; x += 8)
02693         {
02694             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02695                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02696             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02697 
02698             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02699                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02700             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02701 
02702             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02703                                             _mm_cvtps_epi32(v_dst_1));
02704             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02705         }
02706 
02707         return x;
02708     }
02709 };
02710 
02711 template <>
02712 struct cvtScale_SIMD<double, int, double>
02713 {
02714     int operator () (const double * src, int * dst, int width, double scale, double shift) const
02715     {
02716         int x = 0;
02717 
02718         if (!USE_SSE2)
02719             return x;
02720 
02721         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02722 
02723         for ( ; x <= width - 4; x += 4)
02724         {
02725             __m128d v_src = _mm_loadu_pd(src + x);
02726             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02727 
02728             v_src = _mm_loadu_pd(src + x + 2);
02729             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02730 
02731             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)),
02732                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1)));
02733 
02734             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
02735         }
02736 
02737         return x;
02738     }
02739 };
02740 
02741 template <>
02742 struct cvtScale_SIMD<double, float, double>
02743 {
02744     int operator () (const double * src, float * dst, int width, double scale, double shift) const
02745     {
02746         int x = 0;
02747 
02748         if (!USE_SSE2)
02749             return x;
02750 
02751         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02752 
02753         for ( ; x <= width - 4; x += 4)
02754         {
02755             __m128d v_src = _mm_loadu_pd(src + x);
02756             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02757 
02758             v_src = _mm_loadu_pd(src + x + 2);
02759             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02760 
02761             __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0),
02762                                          _mm_cvtpd_ps(v_dst1));
02763 
02764             _mm_storeu_ps(dst + x, v_dst);
02765         }
02766 
02767         return x;
02768     }
02769 };
02770 
02771 template <>
02772 struct cvtScale_SIMD<double, double, double>
02773 {
02774     int operator () (const double * src, double * dst, int width, double scale, double shift) const
02775     {
02776         int x = 0;
02777 
02778         if (!USE_SSE2)
02779             return x;
02780 
02781         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02782 
02783         for ( ; x <= width - 2; x += 2)
02784         {
02785             __m128d v_src = _mm_loadu_pd(src + x);
02786             __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02787             _mm_storeu_pd(dst + x, v_dst);
02788         }
02789 
02790         return x;
02791     }
02792 };
02793 
02794 #elif CV_NEON
02795 
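// The NEON branch is chosen at compile time, so there is no runtime guard
// comparable to USE_SSE2 above. vmovl_u8/vmovl_u16 and their signed
// counterparts widen the lanes, cv_vrndq_u32_f32/cv_vrndq_s32_f32 are the
// rounding float-to-int helpers used throughout this file (plain vcvtq would
// truncate toward zero), and the vqmovn family narrows with saturation.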
02796 // from uchar
02797 
02798 template <>
02799 struct cvtScale_SIMD<uchar, uchar, float>
02800 {
02801     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
02802     {
02803         int x = 0;
02804         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02805 
02806         for ( ; x <= width - 8; x += 8)
02807         {
02808             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02809             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02810             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02811 
02812             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02813                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02814             vst1_u8(dst + x, vqmovn_u16(v_dst));
02815         }
02816 
02817         return x;
02818     }
02819 };
02820 
02821 template <>
02822 struct cvtScale_SIMD<uchar, schar, float>
02823 {
02824     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
02825     {
02826         int x = 0;
02827         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02828 
02829         for ( ; x <= width - 8; x += 8)
02830         {
02831             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02832             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02833             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02834 
02835             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
02836                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
02837             vst1_s8(dst + x, vqmovn_s16(v_dst));
02838         }
02839 
02840         return x;
02841     }
02842 };
02843 
02844 template <>
02845 struct cvtScale_SIMD<uchar, ushort, float>
02846 {
02847     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
02848     {
02849         int x = 0;
02850         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02851 
02852         for ( ; x <= width - 8; x += 8)
02853         {
02854             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02855             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02856             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02857 
02858             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02859                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02860             vst1q_u16(dst + x, v_dst);
02861         }
02862 
02863         return x;
02864     }
02865 };
02866 
02867 template <>
02868 struct cvtScale_SIMD<uchar, short, float>
02869 {
02870     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
02871     {
02872         int x = 0;
02873         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02874 
02875         for ( ; x <= width - 8; x += 8)
02876         {
02877             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02878             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02879             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02880 
02881             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
02882                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
02883             vst1q_s16(dst + x, v_dst);
02884         }
02885 
02886         return x;
02887     }
02888 };
02889 
02890 template <>
02891 struct cvtScale_SIMD<uchar, int, float>
02892 {
02893     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
02894     {
02895         int x = 0;
02896         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02897 
02898         for ( ; x <= width - 8; x += 8)
02899         {
02900             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02901             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02902             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02903 
02904             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
02905             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
02906         }
02907 
02908         return x;
02909     }
02910 };
02911 
02912 template <>
02913 struct cvtScale_SIMD<uchar, float, float>
02914 {
02915     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
02916     {
02917         int x = 0;
02918         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02919 
02920         for ( ; x <= width - 8; x += 8)
02921         {
02922             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02923             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
02924             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
02925         }
02926 
02927         return x;
02928     }
02929 };
02930 
02931 // from schar
02932 
02933 template <>
02934 struct cvtScale_SIMD<schar, uchar, float>
02935 {
02936     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
02937     {
02938         int x = 0;
02939         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02940 
02941         for ( ; x <= width - 8; x += 8)
02942         {
02943             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
02944             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
02945             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
02946 
02947             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02948                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02949             vst1_u8(dst + x, vqmovn_u16(v_dst));
02950         }
02951 
02952         return x;
02953     }
02954 };
02955 
02956 template <>
02957 struct cvtScale_SIMD<schar, schar, float>
02958 {
02959     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
02960     {
02961         int x = 0;
02962         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02963 
02964         for ( ; x <= width - 8; x += 8)
02965         {
02966             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
02967             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
02968             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
02969 
02970             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
02971                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
02972             vst1_s8(dst + x, vqmovn_s16(v_dst));
02973         }
02974 
02975         return x;
02976     }
02977 };
02978 
02979 template <>
02980 struct cvtScale_SIMD<schar, ushort, float>
02981 {
02982     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
02983     {
02984         int x = 0;
02985         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02986 
02987         for ( ; x <= width - 8; x += 8)
02988         {
02989             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
02990             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
02991             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
02992 
02993             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02994                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02995             vst1q_u16(dst + x, v_dst);
02996         }
02997 
02998         return x;
02999     }
03000 };
03001 
03002 template <>
03003 struct cvtScale_SIMD<schar, short, float>
03004 {
03005     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
03006     {
03007         int x = 0;
03008         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03009 
03010         for ( ; x <= width - 8; x += 8)
03011         {
03012             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03013             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03014             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03015 
03016             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03017                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03018             vst1q_s16(dst + x, v_dst);
03019         }
03020 
03021         return x;
03022     }
03023 };
03024 
03025 template <>
03026 struct cvtScale_SIMD<schar, int, float>
03027 {
03028     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
03029     {
03030         int x = 0;
03031         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03032 
03033         for ( ; x <= width - 8; x += 8)
03034         {
03035             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03036             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03037             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03038 
03039             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
03040             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
03041         }
03042 
03043         return x;
03044     }
03045 };
03046 
03047 template <>
03048 struct cvtScale_SIMD<schar, float, float>
03049 {
03050     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
03051     {
03052         int x = 0;
03053         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03054 
03055         for ( ; x <= width - 8; x += 8)
03056         {
03057             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03058             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
03059             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
03060         }
03061 
03062         return x;
03063     }
03064 };
03065 
03066 // from ushort
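// (Unlike the 8-bit cases above, ushort sources are loaded eight
// lanes at a time with vld1q_u16, so only one widening step,
// u16 -> u32, is needed before the float conversion.)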
03067 
03068 template <>
03069 struct cvtScale_SIMD<ushort, uchar, float>
03070 {
03071     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
03072     {
03073         int x = 0;
03074         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03075 
03076         for ( ; x <= width - 8; x += 8)
03077         {
03078             uint16x8_t v_src = vld1q_u16(src + x);
03079             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03080             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03081 
03082             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03083                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03084             vst1_u8(dst + x, vqmovn_u16(v_dst));
03085         }
03086 
03087         return x;
03088     }
03089 };
03090 
03091 template <>
03092 struct cvtScale_SIMD<ushort, schar, float>
03093 {
03094     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
03095     {
03096         int x = 0;
03097         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03098 
03099         for ( ; x <= width - 8; x += 8)
03100         {
03101             uint16x8_t v_src = vld1q_u16(src + x);
03102             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03103             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03104 
03105             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03106                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03107             vst1_s8(dst + x, vqmovn_s16(v_dst));
03108         }
03109 
03110         return x;
03111     }
03112 };
03113 
03114 template <>
03115 struct cvtScale_SIMD<ushort, ushort, float>
03116 {
03117     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
03118     {
03119         int x = 0;
03120         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03121 
03122         for ( ; x <= width - 8; x += 8)
03123         {
03124             uint16x8_t v_src = vld1q_u16(src + x);
03125             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03126             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03127 
03128             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03129                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03130             vst1q_u16(dst + x, v_dst);
03131         }
03132 
03133         return x;
03134     }
03135 };
03136 
03137 template <>
03138 struct cvtScale_SIMD<ushort, short, float>
03139 {
03140     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
03141     {
03142         int x = 0;
03143         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03144 
03145         for ( ; x <= width - 8; x += 8)
03146         {
03147             uint16x8_t v_src = vld1q_u16(src + x);
03148             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03149             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03150 
03151             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03152                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03153             vst1q_s16(dst + x, v_dst);
03154         }
03155 
03156         return x;
03157     }
03158 };
03159 
03160 template <>
03161 struct cvtScale_SIMD<ushort, int, float>
03162 {
03163     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
03164     {
03165         int x = 0;
03166         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03167 
03168         for ( ; x <= width - 8; x += 8)
03169         {
03170             uint16x8_t v_src = vld1q_u16(src + x);
03171             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03172             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03173 
03174             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
03175             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
03176         }
03177 
03178         return x;
03179     }
03180 };
03181 
03182 template <>
03183 struct cvtScale_SIMD<ushort, float, float>
03184 {
03185     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
03186     {
03187         int x = 0;
03188         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03189 
03190         for ( ; x <= width - 8; x += 8)
03191         {
03192             uint16x8_t v_src = vld1q_u16(src + x);
03193             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
03194             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
03195         }
03196 
03197         return x;
03198     }
03199 };
03200 
03201 // from short
03202 
03203 template <>
03204 struct cvtScale_SIMD<short, uchar, float>
03205 {
03206     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
03207     {
03208         int x = 0;
03209         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03210 
03211         for ( ; x <= width - 8; x += 8)
03212         {
03213             int16x8_t v_src = vld1q_s16(src + x);
03214             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03215             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03216 
03217             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03218                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03219             vst1_u8(dst + x, vqmovn_u16(v_dst));
03220         }
03221 
03222         return x;
03223     }
03224 };
03225 
03226 template <>
03227 struct cvtScale_SIMD<short, schar, float>
03228 {
03229     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
03230     {
03231         int x = 0;
03232         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03233 
03234         for ( ; x <= width - 8; x += 8)
03235         {
03236             int16x8_t v_src = vld1q_s16(src + x);
03237             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03238             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03239 
03240             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03241                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03242             vst1_s8(dst + x, vqmovn_s16(v_dst));
03243         }
03244 
03245         return x;
03246     }
03247 };
03248 
03249 template <>
03250 struct cvtScale_SIMD<short, ushort, float>
03251 {
03252     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
03253     {
03254         int x = 0;
03255         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03256 
03257         for ( ; x <= width - 8; x += 8)
03258         {
03259             int16x8_t v_src = vld1q_s16(src + x);
03260             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03261             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03262 
03263             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03264                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03265             vst1q_u16(dst + x, v_dst);
03266         }
03267 
03268         return x;
03269     }
03270 };
03271 
03272 template <>
03273 struct cvtScale_SIMD<short, float, float>
03274 {
03275     int operator () (const short * src, float * dst, int width, float scale, float shift) const
03276     {
03277         int x = 0;
03278         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03279 
03280         for ( ; x <= width - 8; x += 8)
03281         {
03282             int16x8_t v_src = vld1q_s16(src + x);
03283             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
03284             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
03285         }
03286 
03287         return x;
03288     }
03289 };
03290 
03291 // from int
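// (int lanes already match the 32-bit float width, so each iteration
// simply loads two int32x4 vectors with vld1q_s32 and converts them
// directly via vcvtq_f32_s32; no widening step is required.)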
03292 
03293 template <>
03294 struct cvtScale_SIMD<int, uchar, float>
03295 {
03296     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
03297     {
03298         int x = 0;
03299         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03300 
03301         for ( ; x <= width - 8; x += 8)
03302         {
03303             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03304             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03305 
03306             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03307                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03308             vst1_u8(dst + x, vqmovn_u16(v_dst));
03309         }
03310 
03311         return x;
03312     }
03313 };
03314 
03315 template <>
03316 struct cvtScale_SIMD<int, schar, float>
03317 {
03318     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
03319     {
03320         int x = 0;
03321         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03322 
03323         for ( ; x <= width - 8; x += 8)
03324         {
03325             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03326             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03327 
03328             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03329                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03330             vst1_s8(dst + x, vqmovn_s16(v_dst));
03331         }
03332 
03333         return x;
03334     }
03335 };
03336 
03337 template <>
03338 struct cvtScale_SIMD<int, ushort, float>
03339 {
03340     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
03341     {
03342         int x = 0;
03343         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03344 
03345         for ( ; x <= width - 8; x += 8)
03346         {
03347             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03348             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03349 
03350             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03351                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03352             vst1q_u16(dst + x, v_dst);
03353         }
03354 
03355         return x;
03356     }
03357 };
03358 
03359 template <>
03360 struct cvtScale_SIMD<int, short, float>
03361 {
03362     int operator () (const int * src, short * dst, int width, float scale, float shift) const
03363     {
03364         int x = 0;
03365         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03366 
03367         for ( ; x <= width - 8; x += 8)
03368         {
03369             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03370             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03371 
03372             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03373                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03374             vst1q_s16(dst + x, v_dst);
03375         }
03376 
03377         return x;
03378     }
03379 };
03380 
03381 // from float
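// (float sources need neither widening nor integer conversion; note
// that the float -> int and float -> float variants below step four
// lanes per iteration instead of eight, since no narrowing pass is
// involved.)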
03382 
03383 template <>
03384 struct cvtScale_SIMD<float, uchar, float>
03385 {
03386     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
03387     {
03388         int x = 0;
03389         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03390 
03391         for ( ; x <= width - 8; x += 8)
03392         {
03393             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03394             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03395 
03396             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03397                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03398             vst1_u8(dst + x, vqmovn_u16(v_dst));
03399         }
03400 
03401         return x;
03402     }
03403 };
03404 
03405 template <>
03406 struct cvtScale_SIMD<float, schar, float>
03407 {
03408     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
03409     {
03410         int x = 0;
03411         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03412 
03413         for ( ; x <= width - 8; x += 8)
03414         {
03415             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03416             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03417 
03418             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03419                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03420             vst1_s8(dst + x, vqmovn_s16(v_dst));
03421         }
03422 
03423         return x;
03424     }
03425 };
03426 
03427 template <>
03428 struct cvtScale_SIMD<float, ushort, float>
03429 {
03430     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
03431     {
03432         int x = 0;
03433         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03434 
03435         for ( ; x <= width - 8; x += 8)
03436         {
03437             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03438             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03439 
03440             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03441                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03442             vst1q_u16(dst + x, v_dst);
03443         }
03444 
03445         return x;
03446     }
03447 };
03448 
03449 template <>
03450 struct cvtScale_SIMD<float, short, float>
03451 {
03452     int operator () (const float * src, short * dst, int width, float scale, float shift) const
03453     {
03454         int x = 0;
03455         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03456 
03457         for ( ; x <= width - 8; x += 8)
03458         {
03459             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03460             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03461 
03462             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03463                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03464             vst1q_s16(dst + x, v_dst);
03465         }
03466 
03467         return x;
03468     }
03469 };
03470 
03471 template <>
03472 struct cvtScale_SIMD<float, int, float>
03473 {
03474     int operator () (const float * src, int * dst, int width, float scale, float shift) const
03475     {
03476         int x = 0;
03477         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03478 
03479         for ( ; x <= width - 4; x += 4)
03480             vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)));
03481 
03482         return x;
03483     }
03484 };
03485 
03486 template <>
03487 struct cvtScale_SIMD<float, float, float>
03488 {
03489     int operator () (const float * src, float * dst, int width, float scale, float shift) const
03490     {
03491         int x = 0;
03492         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03493 
03494         for ( ; x <= width - 4; x += 4)
03495             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift));
03496 
03497         return x;
03498     }
03499 };
03500 
03501 #endif
03502 
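// Each cvtScale_SIMD functor processes as many leading elements of a
// row as its vector width allows and returns the index of the first
// element it did NOT handle; cvtScale_ below finishes the row with an
// (optionally unrolled) scalar saturate_cast loop. A minimal usage
// sketch for a single hypothetical 640-pixel row (buffer names and
// sizes are illustrative only):
//
//     uchar src[640];
//     short dst[640];
//     cvtScale_<uchar, short, float>(src, 640 * sizeof(uchar),
//                                    dst, 640 * sizeof(short),
//                                    Size(640, 1), 2.0f, -128.0f);
//     // dst[i] == saturate_cast<short>(src[i] * 2.0f - 128.0f)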
03503 template<typename T, typename DT, typename WT> static void
03504 cvtScale_( const T* src, size_t sstep,
03505            DT* dst, size_t dstep, Size size,
03506            WT scale, WT shift )
03507 {
03508     sstep /= sizeof(src[0]);
03509     dstep /= sizeof(dst[0]);
03510 
03511     cvtScale_SIMD<T, DT, WT> vop;
03512 
03513     for( ; size.height--; src += sstep, dst += dstep )
03514     {
03515         int x = vop(src, dst, size.width, scale, shift);
03516 
03517         #if CV_ENABLE_UNROLLED
03518         for( ; x <= size.width - 4; x += 4 )
03519         {
03520             DT t0, t1;
03521             t0 = saturate_cast<DT>(src[x]*scale + shift);
03522             t1 = saturate_cast<DT>(src[x+1]*scale + shift);
03523             dst[x] = t0; dst[x+1] = t1;
03524             t0 = saturate_cast<DT>(src[x+2]*scale + shift);
03525             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
03526             dst[x+2] = t0; dst[x+3] = t1;
03527         }
03528         #endif
03529 
03530         for( ; x < size.width; x++ )
03531             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
03532     }
03533 }
03534 
03535 // vz: optimized template specialization of cvtScale_ for short -> short
03536 template<> void
03537 cvtScale_<short, short, float>( const short* src, size_t sstep,
03538            short* dst, size_t dstep, Size size,
03539            float scale, float shift )
03540 {
03541     sstep /= sizeof(src[0]);
03542     dstep /= sizeof(dst[0]);
03543 
03544     for( ; size.height--; src += sstep, dst += dstep )
03545     {
03546         int x = 0;
03547         #if CV_SSE2
03548             if(USE_SSE2)
03549             {
03550                 __m128 scale128 = _mm_set1_ps (scale);
03551                 __m128 shift128 = _mm_set1_ps (shift);
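                // Sign-extension trick: _mm_unpacklo_epi16(r, r) duplicates each
                // 16-bit lane into a 32-bit slot, and the arithmetic shift right
                // by 16 then yields the sign-extended int32 value of each lane.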
03552                 for(; x <= size.width - 8; x += 8 )
03553                 {
03554                     __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
03555                     __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
03556                     __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
03557                     __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
03558                     rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
03559                     rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
03560                     r0 = _mm_cvtps_epi32(rf0);
03561                     r1 = _mm_cvtps_epi32(rf1);
03562                     r0 = _mm_packs_epi32(r0, r1);
03563                     _mm_storeu_si128((__m128i*)(dst + x), r0);
03564                 }
03565             }
03566         #elif CV_NEON
03567         float32x4_t v_shift = vdupq_n_f32(shift);
03568         for(; x <= size.width - 8; x += 8 )
03569         {
03570             int16x8_t v_src = vld1q_s16(src + x);
03571             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
03572             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
03573 
03574             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
03575             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
03576 
03577             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
03578                                             vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
03579         }
03580         #endif
03581 
03582         for(; x < size.width; x++ )
03583             dst[x] = saturate_cast<short>(src[x]*scale + shift);
03584     }
03585 }
03586 
03587 template<> void
03588 cvtScale_<short, int, float>( const short* src, size_t sstep,
03589            int* dst, size_t dstep, Size size,
03590            float scale, float shift )
03591 {
03592     sstep /= sizeof(src[0]);
03593     dstep /= sizeof(dst[0]);
03594 
03595     for( ; size.height--; src += sstep, dst += dstep )
03596     {
03597         int x = 0;
03598 
03599         #if CV_AVX2
03600         if (USE_AVX2)
03601         {
03602             __m256 scale256 = _mm256_set1_ps(scale);
03603             __m256 shift256 = _mm256_set1_ps(shift);
03604             const int shuffle = 0xD8;
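            // 0xD8 reorders the four 64-bit quarters as (0, 2, 1, 3); because
            // _mm256_unpacklo/hi_epi16 interleave within each 128-bit lane, this
            // pre-permute keeps the converted int32 results in element order.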
03605 
03606             for ( ; x <= size.width - 16; x += 16)
03607             {
03608                 __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
03609                 v_src = _mm256_permute4x64_epi64(v_src, shuffle);
03610                 __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
03611                 __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
03612                 __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
03613                 __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
03614                 _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
03615                 _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
03616             }
03617         }
03618         #endif
03619         #if CV_SSE2
03620         if (USE_SSE2) // roughly 5x faster than the scalar loop
03621         {
03622             __m128 scale128 = _mm_set1_ps (scale);
03623             __m128 shift128 = _mm_set1_ps (shift);
03624             for(; x <= size.width - 8; x += 8 )
03625             {
03626                 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x));
03627 
03628                 __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
03629                 __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16));
03630                 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
03631                 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
03632 
03633                 _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0));
03634                 _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1));
03635             }
03636         }
03637         #elif CV_NEON
03638         float32x4_t v_shift = vdupq_n_f32(shift);
03639         for(; x <= size.width - 8; x += 8 )
03640         {
03641             int16x8_t v_src = vld1q_s16(src + x);
03642             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
03643             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
03644 
03645             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
03646             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
03647 
03648             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
03649             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
03650         }
03651         #endif
03652 
03653         for(; x < size.width; x++ )
03654             dst[x] = saturate_cast<int>(src[x]*scale + shift);
03655     }
03656 }
03657 
03658 template <typename T, typename DT>
03659 struct Cvt_SIMD
03660 {
03661     int operator() (const T *, DT *, int) const
03662     {
03663         return 0;
03664     }
03665 };
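// The generic Cvt_SIMD functor returns 0, meaning "no elements
// processed": for type pairs with no specialization below, the whole
// row falls through to the scalar saturate_cast loop in cvt_.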
03666 
03667 #if CV_SSE2
03668 
03669 // from double
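// (Each __m128d holds only two doubles; the 8-lane variants therefore
// issue four loads, convert with _mm_cvtpd_ps, and glue the halves
// together with _mm_movelh_ps before packing down to the destination
// type, while the int and float variants stop at four lanes.)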
03670 
03671 template <>
03672 struct Cvt_SIMD<double, uchar>
03673 {
03674     int operator() (const double * src, uchar * dst, int width) const
03675     {
03676         int x = 0;
03677 
03678         if (!USE_SSE2)
03679             return x;
03680 
03681         for ( ; x <= width - 8; x += 8)
03682         {
03683             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03684             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03685             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03686             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03687 
03688             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03689             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03690 
03691             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
03692                                             _mm_cvtps_epi32(v_src1));
03693             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst));
03694         }
03695 
03696         return x;
03697     }
03698 };
03699 
03700 template <>
03701 struct Cvt_SIMD<double, schar>
03702 {
03703     int operator() (const double * src, schar * dst, int width) const
03704     {
03705         int x = 0;
03706 
03707         if (!USE_SSE2)
03708             return x;
03709 
03710         for ( ; x <= width - 8; x += 8)
03711         {
03712             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03713             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03714             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03715             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03716 
03717             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03718             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03719 
03720             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
03721                                             _mm_cvtps_epi32(v_src1));
03722             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst));
03723         }
03724 
03725         return x;
03726     }
03727 };
03728 
03729 #if CV_SSE4_1
03730 
03731 template <>
03732 struct Cvt_SIMD<double, ushort>
03733 {
03734     bool haveSIMD;
03735     Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
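    // _mm_packus_epi32 (pack with *unsigned* saturation to 16 bits) is an
    // SSE4.1 instruction, hence the runtime checkHardwareSupport() guard
    // instead of the plain USE_SSE2 test used by the other variants.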
03736 
03737     int operator() (const double * src, ushort * dst, int width) const
03738     {
03739         int x = 0;
03740 
03741         if (!haveSIMD)
03742             return x;
03743 
03744         for ( ; x <= width - 8; x += 8)
03745         {
03746             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03747             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03748             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03749             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03750 
03751             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03752             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03753 
03754             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
03755                                              _mm_cvtps_epi32(v_src1));
03756             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
03757         }
03758 
03759         return x;
03760     }
03761 };
03762 
03763 #endif // CV_SSE4_1
03764 
03765 template <>
03766 struct Cvt_SIMD<double, short>
03767 {
03768     int operator() (const double * src, short * dst, int width) const
03769     {
03770         int x = 0;
03771 
03772         if (!USE_SSE2)
03773             return x;
03774 
03775         for ( ; x <= width - 8; x += 8)
03776         {
03777             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03778             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03779             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03780             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03781 
03782             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03783             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03784 
03785             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
03786                                             _mm_cvtps_epi32(v_src1));
03787             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
03788         }
03789 
03790         return x;
03791     }
03792 };
03793 
03794 template <>
03795 struct Cvt_SIMD<double, int>
03796 {
03797     int operator() (const double * src, int * dst, int width) const
03798     {
03799         int x = 0;
03800 
03801         if (!USE_SSE2)
03802             return x;
03803 
03804         for ( ; x <= width - 4; x += 4)
03805         {
03806             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03807             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03808             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03809 
03810             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0));
03811         }
03812 
03813         return x;
03814     }
03815 };
03816 
03817 template <>
03818 struct Cvt_SIMD<double, float>
03819 {
03820     int operator() (const double * src, float * dst, int width) const
03821     {
03822         int x = 0;
03823 
03824         if (!USE_SSE2)
03825             return x;
03826 
03827         for ( ; x <= width - 4; x += 4)
03828         {
03829             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03830             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03831 
03832             _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1));
03833         }
03834 
03835         return x;
03836     }
03837 };
03838 
03839 
03840 #elif CV_NEON
03841 
03842 // from uchar
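// (Plain conversions carry no scale/shift, so these NEON variants
// reduce to widening/narrowing moves plus, where the destination is
// float, a single vcvtq conversion; the integer-to-integer cases need
// no rounding step at all.)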
03843 
03844 template <>
03845 struct Cvt_SIMD<uchar, schar>
03846 {
03847     int operator() (const uchar * src, schar * dst, int width) const
03848     {
03849         int x = 0;
03850 
03851         for ( ; x <= width - 8; x += 8)
03852             vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))));
03853 
03854         return x;
03855     }
03856 };
03857 
03858 
03859 template <>
03860 struct Cvt_SIMD<uchar, ushort>
03861 {
03862     int operator() (const uchar * src, ushort * dst, int width) const
03863     {
03864         int x = 0;
03865 
03866         for ( ; x <= width - 8; x += 8)
03867             vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x)));
03868 
03869         return x;
03870     }
03871 };
03872 
03873 template <>
03874 struct Cvt_SIMD<uchar, short>
03875 {
03876     int operator() (const uchar * src, short * dst, int width) const
03877     {
03878         int x = 0;
03879 
03880         for ( ; x <= width - 8; x += 8)
03881             vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))));
03882 
03883         return x;
03884     }
03885 };
03886 
03887 template <>
03888 struct Cvt_SIMD<uchar, int>
03889 {
03890     int operator() (const uchar * src, int * dst, int width) const
03891     {
03892         int x = 0;
03893 
03894         for ( ; x <= width - 8; x += 8)
03895         {
03896             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
03897             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
03898             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
03899         }
03900 
03901         return x;
03902     }
03903 };
03904 
03905 template <>
03906 struct Cvt_SIMD<uchar, float>
03907 {
03908     int operator() (const uchar * src, float * dst, int width) const
03909     {
03910         int x = 0;
03911 
03912         for ( ; x <= width - 8; x += 8)
03913         {
03914             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
03915             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
03916             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
03917         }
03918 
03919         return x;
03920     }
03921 };
03922 
03923 // from schar
03924 
03925 template <>
03926 struct Cvt_SIMD<schar, uchar>
03927 {
03928     int operator() (const schar * src, uchar * dst, int width) const
03929     {
03930         int x = 0;
03931 
03932         for ( ; x <= width - 8; x += 8)
03933             vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x))));
03934 
03935         return x;
03936     }
03937 };
03938 
03939 template <>
03940 struct Cvt_SIMD<schar, short>
03941 {
03942     int operator() (const schar * src, short * dst, int width) const
03943     {
03944         int x = 0;
03945 
03946         for ( ; x <= width - 8; x += 8)
03947             vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x)));
03948 
03949         return x;
03950     }
03951 };
03952 
03953 template <>
03954 struct Cvt_SIMD<schar, ushort>
03955 {
03956     int operator() (const schar * src, ushort * dst, int width) const
03957     {
03958         int x = 0;
03959 
03960         for ( ; x <= width - 8; x += 8)
03961         {
03962             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03963             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
03964                                             vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
03965         }
03966 
03967         return x;
03968     }
03969 };
03970 
03971 
03972 template <>
03973 struct Cvt_SIMD<schar, int>
03974 {
03975     int operator() (const schar * src, int * dst, int width) const
03976     {
03977         int x = 0;
03978 
03979         for ( ; x <= width - 8; x += 8)
03980         {
03981             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03982             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
03983             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
03984         }
03985 
03986         return x;
03987     }
03988 };
03989 
03990 template <>
03991 struct Cvt_SIMD<schar, float>
03992 {
03993     int operator() (const schar * src, float * dst, int width) const
03994     {
03995         int x = 0;
03996 
03997         for ( ; x <= width - 8; x += 8)
03998         {
03999             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
04000             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
04001             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
04002         }
04003 
04004         return x;
04005     }
04006 };
04007 
04008 // from ushort
04009 
04010 template <>
04011 struct Cvt_SIMD<ushort, uchar>
04012 {
04013     int operator() (const ushort * src, uchar * dst, int width) const
04014     {
04015         int x = 0;
04016 
04017         for ( ; x <= width - 16; x += 16)
04018         {
04019             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
04020             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2)));
04021         }
04022 
04023         return x;
04024     }
04025 };
04026 
04027 template <>
04028 struct Cvt_SIMD<ushort, schar>
04029 {
04030     int operator() (const ushort * src, schar * dst, int width) const
04031     {
04032         int x = 0;
04033 
04034         for ( ; x <= width - 16; x += 16)
04035         {
04036             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
04037             int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
04038             int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
04039             int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
04040             int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
04041 
04042             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
04043                                           vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
04044         }
04045 
04046         return x;
04047     }
04048 };
04049 
04050 template <>
04051 struct Cvt_SIMD<ushort, short>
04052 {
04053     int operator() (const ushort * src, short * dst, int width) const
04054     {
04055         int x = 0;
04056 
04057         for ( ; x <= width - 8; x += 8)
04058         {
04059             uint16x8_t v_src = vld1q_u16(src + x);
04060             int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
04061             int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
04062 
04063             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
04064         }
04065 
04066         return x;
04067     }
04068 };
04069 
04070 template <>
04071 struct Cvt_SIMD<ushort, int>
04072 {
04073     int operator() (const ushort * src, int * dst, int width) const
04074     {
04075         int x = 0;
04076 
04077         for ( ; x <= width - 8; x += 8)
04078         {
04079             uint16x8_t v_src = vld1q_u16(src + x);
04080             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
04081             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
04082         }
04083 
04084         return x;
04085     }
04086 };
04087 
04088 template <>
04089 struct Cvt_SIMD<ushort, float>
04090 {
04091     int operator() (const ushort * src, float * dst, int width) const
04092     {
04093         int x = 0;
04094 
04095         for ( ; x <= width - 8; x += 8)
04096         {
04097             uint16x8_t v_src = vld1q_u16(src + x);
04098             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
04099             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
04100         }
04101 
04102         return x;
04103     }
04104 };
04105 
04106 // from short
04107 
04108 template <>
04109 struct Cvt_SIMD<short, uchar>
04110 {
04111     int operator() (const short * src, uchar * dst, int width) const
04112     {
04113         int x = 0;
04114 
04115         for ( ; x <= width - 16; x += 16)
04116         {
04117             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
04118             vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2)));
04119         }
04120 
04121         return x;
04122     }
04123 };
04124 
04125 template <>
04126 struct Cvt_SIMD<short, schar>
04127 {
04128     int operator() (const short * src, schar * dst, int width) const
04129     {
04130         int x = 0;
04131 
04132         for ( ; x <= width - 16; x += 16)
04133         {
04134             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
04135             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2)));
04136         }
04137 
04138         return x;
04139     }
04140 };
04141 
04142 template <>
04143 struct Cvt_SIMD<short, ushort>
04144 {
04145     int operator() (const short * src, ushort * dst, int width) const
04146     {
04147         int x = 0;
04148 
04149         for ( ; x <= width - 8; x += 8)
04150         {
04151             int16x8_t v_src = vld1q_s16(src + x);
04152             uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src)));
04153             uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src)));
04154             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
04155         }
04156 
04157         return x;
04158     }
04159 };
04160 
04161 template <>
04162 struct Cvt_SIMD<short, int>
04163 {
04164     int operator() (const short * src, int * dst, int width) const
04165     {
04166         int x = 0;
04167 
04168         for ( ; x <= width - 8; x += 8)
04169         {
04170             int16x8_t v_src = vld1q_s16(src + x);
04171             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
04172             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
04173         }
04174 
04175         return x;
04176     }
04177 };
04178 
04179 template <>
04180 struct Cvt_SIMD<short, float>
04181 {
04182     int operator() (const short * src, float * dst, int width) const
04183     {
04184         int x = 0;
04185 
04186         for ( ; x <= width - 8; x += 8)
04187         {
04188             int16x8_t v_src = vld1q_s16(src + x);
04189             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
04190             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
04191         }
04192 
04193         return x;
04194     }
04195 };
04196 
04197 // from int
04198 
04199 template <>
04200 struct Cvt_SIMD<int, uchar>
04201 {
04202     int operator() (const int * src, uchar * dst, int width) const
04203     {
04204         int x = 0;
04205 
04206         for ( ; x <= width - 16; x += 16)
04207         {
04208             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04209             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
04210             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
04211             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4)));
04212             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
04213         }
04214 
04215         return x;
04216     }
04217 };
04218 
04219 template <>
04220 struct Cvt_SIMD<int, schar>
04221 {
04222     int operator() (const int * src, schar * dst, int width) const
04223     {
04224         int x = 0;
04225 
04226         for ( ; x <= width - 16; x += 16)
04227         {
04228             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04229             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
04230             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
04231             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
04232             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
04233         }
04234 
04235         return x;
04236     }
04237 };
04238 
04239 
04240 template <>
04241 struct Cvt_SIMD<int, ushort>
04242 {
04243     int operator() (const int * src, ushort * dst, int width) const
04244     {
04245         int x = 0;
04246 
04247         for ( ; x <= width - 8; x += 8)
04248         {
04249             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04250             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
04251         }
04252 
04253         return x;
04254     }
04255 };
04256 
04257 template <>
04258 struct Cvt_SIMD<int, short>
04259 {
04260     int operator() (const int * src, short * dst, int width) const
04261     {
04262         int x = 0;
04263 
04264         for ( ; x <= width - 8; x += 8)
04265         {
04266             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04267             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
04268         }
04269 
04270         return x;
04271     }
04272 };
04273 
04274 template <>
04275 struct Cvt_SIMD<int, float>
04276 {
04277     int operator() (const int * src, float * dst, int width) const
04278     {
04279         int x = 0;
04280 
04281         for ( ; x <= width - 4; x += 4)
04282             vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x)));
04283 
04284         return x;
04285     }
04286 };
04287 
04288 // from float
04289 
04290 template <>
04291 struct Cvt_SIMD<float, uchar>
04292 {
04293     int operator() (const float * src, uchar * dst, int width) const
04294     {
04295         int x = 0;
04296 
04297         for ( ; x <= width - 16; x += 16)
04298         {
04299             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
04300             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
04301             uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8));
04302             uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12));
04303             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
04304             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4)));
04305             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
04306         }
04307 
04308         return x;
04309     }
04310 };
04311 
04312 template <>
04313 struct Cvt_SIMD<float, schar>
04314 {
04315     int operator() (const float * src, schar * dst, int width) const
04316     {
04317         int x = 0;
04318 
04319         for ( ; x <= width - 16; x += 16)
04320         {
04321             int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x));
04322             int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4));
04323             int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8));
04324             int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12));
04325             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
04326             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
04327             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
04328         }
04329 
04330         return x;
04331     }
04332 };
04333 
04334 
04335 template <>
04336 struct Cvt_SIMD<float, ushort>
04337 {
04338     int operator() (const float * src, ushort * dst, int width) const
04339     {
04340         int x = 0;
04341 
04342         for ( ; x <= width - 8; x += 8)
04343         {
04344             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
04345             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
04346             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
04347         }
04348 
04349         return x;
04350     }
04351 };
04352 
04353 template <>
04354 struct Cvt_SIMD<float, int>
04355 {
04356     int operator() (const float * src, int * dst, int width) const
04357     {
04358         int x = 0;
04359 
04360         for ( ; x <= width - 4; x += 4)
04361             vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x)));
04362 
04363         return x;
04364     }
04365 };
04366 
04367 #endif
04368 
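// cvt_ mirrors cvtScale_: the Cvt_SIMD functor converts the vectorized
// head of each row and returns how far it got, and a scalar
// saturate_cast loop converts the remainder.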
04369 template<typename T, typename DT> static void
04370 cvt_( const T* src, size_t sstep,
04371       DT* dst, size_t dstep, Size size )
04372 {
04373     sstep /= sizeof(src[0]);
04374     dstep /= sizeof(dst[0]);
04375     Cvt_SIMD<T, DT> vop;
04376 
04377     for( ; size.height--; src += sstep, dst += dstep )
04378     {
04379         int x = vop(src, dst, size.width);
04380         #if CV_ENABLE_UNROLLED
04381         for( ; x <= size.width - 4; x += 4 )
04382         {
04383             DT t0, t1;
04384             t0 = saturate_cast<DT>(src[x]);
04385             t1 = saturate_cast<DT>(src[x+1]);
04386             dst[x] = t0; dst[x+1] = t1;
04387             t0 = saturate_cast<DT>(src[x+2]);
04388             t1 = saturate_cast<DT>(src[x+3]);
04389             dst[x+2] = t0; dst[x+3] = t1;
04390         }
04391         #endif
04392         for( ; x < size.width; x++ )
04393             dst[x] = saturate_cast<DT>(src[x]);
04394     }
04395 }
04396 
04397 // vz: optimized template specialization of cvt_ for float -> short; exercised by the Core_ConvertScale/ElemWiseTest test
04398 template<>  void
04399 cvt_<float, short>( const float* src, size_t sstep,
04400      short* dst, size_t dstep, Size size )
04401 {
04402     sstep /= sizeof(src[0]);
04403     dstep /= sizeof(dst[0]);
04404 
04405     for( ; size.height--; src += sstep, dst += dstep )
04406     {
04407         int x = 0;
04408         #if   CV_SSE2
04409         if(USE_SSE2)
04410         {
04411             for( ; x <= size.width - 8; x += 8 )
04412             {
04413                 __m128 src128 = _mm_loadu_ps (src + x);
04414                 __m128i src_int128 = _mm_cvtps_epi32 (src128);
04415 
04416                 src128 = _mm_loadu_ps (src + x + 4);
04417                 __m128i src1_int128 = _mm_cvtps_epi32 (src128);
04418 
04419                 src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
04420                 _mm_storeu_si128((__m128i*)(dst + x), src1_int128);
04421             }
04422         }
04423         #elif CV_NEON
04424         for( ; x <= size.width - 8; x += 8 )
04425         {
04426             float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
04427             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
04428                                            vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
04429             vst1q_s16(dst + x, v_dst);
04430         }
04431         #endif
04432         for( ; x < size.width; x++ )
04433             dst[x] = saturate_cast<short>(src[x]);
04434     }
04435 
04436 }
04437 
04438 
04439 template<typename T> static void
04440 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
04441 {
04442     sstep /= sizeof(src[0]);
04443     dstep /= sizeof(dst[0]);
04444 
04445     for( ; size.height--; src += sstep, dst += dstep )
04446         memcpy(dst, src, size.width*sizeof(src[0]));
04447 }
04448 
04449 #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \
04450 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04451                          dtype* dst, size_t dstep, Size size, double* scale) \
04452 { \
04453     tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
04454 }
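// For reference, DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
// expands (roughly) to:
//
//     static void cvtScaleAbs8u( const uchar* src, size_t sstep,
//                                const uchar*, size_t,
//                                uchar* dst, size_t dstep, Size size,
//                                double* scale )
//     {
//         cvtScaleAbs_(src, sstep, dst, dstep, size,
//                      (float)scale[0], (float)scale[1]);
//     }
//
// The double* parameter carries the (alpha, beta) pair; the unused
// uchar*/size_t arguments keep the signature uniform across the
// function tables built from these macros.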
04455 
04456 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
04457 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04458 dtype* dst, size_t dstep, Size size, double* scale) \
04459 { \
04460     cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
04461 }
04462 
04463 #if defined(HAVE_IPP)
04464 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
04465 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04466                          dtype* dst, size_t dstep, Size size, double*) \
04467 { \
04468     CV_IPP_RUN(src && dst, ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0)\
04469     cvt_(src, sstep, dst, dstep, size); \
04470 }
04471 
04472 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
04473 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04474                          dtype* dst, size_t dstep, Size size, double*) \
04475 { \
04476     CV_IPP_RUN(src && dst, ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0)\
04477     cvt_(src, sstep, dst, dstep, size); \
04478 }
04479 #else
04480 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
04481 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04482                          dtype* dst, size_t dstep, Size size, double*) \
04483 { \
04484     cvt_(src, sstep, dst, dstep, size); \
04485 }
04486 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
04487 #endif
04488 
04489 #define DEF_CVT_FUNC(suffix, stype, dtype) \
04490 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04491                          dtype* dst, size_t dstep, Size size, double*) \
04492 { \
04493     cvt_(src, sstep, dst, dstep, size); \
04494 }
04495 
04496 #define DEF_CPY_FUNC(suffix, stype) \
04497 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04498                          stype* dst, size_t dstep, Size size, double*) \
04499 { \
04500     cpy_(src, sstep, dst, dstep, size); \
04501 }
04502 
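/* For reference, DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) expands to
   roughly:

       static void cvtScale8u( const uchar* src, size_t sstep,
                               const uchar*, size_t,
                               uchar* dst, size_t dstep,
                               Size size, double* scale )
       {
           cvtScale_(src, sstep, dst, dstep, size,
                     (float)scale[0], (float)scale[1]);
       }

   The unused (const uchar*, size_t) pair is only there so every generated
   function matches the common BinaryFunc signature, which carries a slot
   for a second source operand that these conversions do not need. */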
04503 
04504 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
04505 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
04506 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
04507 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
04508 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
04509 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
04510 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
04511 
04512 DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
04513 DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
04514 DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
04515 DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float)
04516 DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float)
04517 DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float)
04518 DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float)
04519 
04520 DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float)
04521 DEF_CVT_SCALE_FUNC(8s,     schar, schar, float)
04522 DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float)
04523 DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float)
04524 DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float)
04525 DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float)
04526 DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float)
04527 
04528 DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float)
04529 DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float)
04530 DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float)
04531 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float)
04532 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float)
04533 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float)
04534 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float)
04535 
04536 DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float)
04537 DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float)
04538 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float)
04539 DEF_CVT_SCALE_FUNC(16s,    short, short, float)
04540 DEF_CVT_SCALE_FUNC(32s16s, int, short, float)
04541 DEF_CVT_SCALE_FUNC(32f16s, float, short, float)
04542 DEF_CVT_SCALE_FUNC(64f16s, double, short, float)
04543 
04544 DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float)
04545 DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float)
04546 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float)
04547 DEF_CVT_SCALE_FUNC(16s32s, short, int, float)
04548 DEF_CVT_SCALE_FUNC(32s,    int, int, double)
04549 DEF_CVT_SCALE_FUNC(32f32s, float, int, float)
04550 DEF_CVT_SCALE_FUNC(64f32s, double, int, double)
04551 
04552 DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float)
04553 DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float)
04554 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float)
04555 DEF_CVT_SCALE_FUNC(16s32f, short, float, float)
04556 DEF_CVT_SCALE_FUNC(32s32f, int, float, double)
04557 DEF_CVT_SCALE_FUNC(32f,    float, float, float)
04558 DEF_CVT_SCALE_FUNC(64f32f, double, float, double)
04559 
04560 DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double)
04561 DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double)
04562 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double)
04563 DEF_CVT_SCALE_FUNC(16s64f, short, double, double)
04564 DEF_CVT_SCALE_FUNC(32s64f, int, double, double)
04565 DEF_CVT_SCALE_FUNC(32f64f, float, double, double)
04566 DEF_CVT_SCALE_FUNC(64f,    double, double, double)
04567 
04568 DEF_CPY_FUNC(8u,     uchar)
04569 DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
04570 DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
04571 DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
04572 DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
04573 DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
04574 DEF_CVT_FUNC(64f8u,  double, uchar)
04575 
04576 DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
04577 DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
04578 DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
04579 DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
04580 DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
04581 DEF_CVT_FUNC(64f8s,  double, schar)
04582 
04583 DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
04584 DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
04585 DEF_CPY_FUNC(16u,    ushort)
04586 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
04587 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
04588 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
04589 DEF_CVT_FUNC(64f16u, double, ushort)
04590 
04591 DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
04592 DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
04593 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
04594 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
04595 DEF_CVT_FUNC(32f16s, float, short)
04596 DEF_CVT_FUNC(64f16s, double, short)
04597 
04598 DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
04599 DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
04600 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
04601 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
04602 DEF_CPY_FUNC(32s,    int)
04603 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
04604 DEF_CVT_FUNC(64f32s, double, int)
04605 
04606 DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
04607 DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
04608 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
04609 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
04610 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
04611 DEF_CVT_FUNC(64f32f, double, float)
04612 
04613 DEF_CVT_FUNC(8u64f,  uchar, double)
04614 DEF_CVT_FUNC(8s64f,  schar, double)
04615 DEF_CVT_FUNC(16u64f, ushort, double)
04616 DEF_CVT_FUNC(16s64f, short, double)
04617 DEF_CVT_FUNC(32s64f, int, double)
04618 DEF_CVT_FUNC(32f64f, float, double)
04619 DEF_CPY_FUNC(64s,    int64)
04620 
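/* Naming convention: a two-part suffix encodes source then destination
   depth ("32f16u" = float -> ushort); a single token ("16u") means both
   depths match.  The wtype parameter selects the working type: float where
   it is accurate enough, double where int or double operands would
   otherwise lose precision (e.g. 32s, 32s32f, 64f32s). */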
04621 static BinaryFunc getCvtScaleAbsFunc(int depth)
04622 {
04623     static BinaryFunc cvtScaleAbsTab[] =
04624     {
04625         (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
04626         (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
04627         (BinaryFunc)cvtScaleAbs64f8u, 0
04628     };
04629 
04630     return cvtScaleAbsTab[depth];
04631 }
04632 
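/* The table above is indexed by source depth (CV_8U .. CV_64F); the
   trailing 0 entry makes an unsupported depth yield NULL, which the
   CV_Assert in convertScaleAbs() then catches. */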
04633 BinaryFunc getConvertFunc(int sdepth, int ddepth)
04634 {
04635     static BinaryFunc cvtTab[][8] =
04636     {
04637         {
04638             (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
04639             (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
04640             (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0
04641         },
04642         {
04643             (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
04644             (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
04645             (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0
04646         },
04647         {
04648             (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
04649             (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
04650             (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0
04651         },
04652         {
04653             (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
04654             (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
04655             (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0
04656         },
04657         {
04658             (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
04659             (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
04660             (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0
04661         },
04662         {
04663             (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
04664             (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
04665             (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0
04666         },
04667         {
04668             (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
04669             (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
04670             (BinaryFunc)(cvt64s), 0
04671         },
04672         {
04673             0, 0, 0, 0, 0, 0, 0, 0
04674         }
04675     };
04676 
04677     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
04678 }
04679 
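/* Both dispatch tables here are indexed as [destination depth][source
   depth].  In the unscaled table, diagonal entries whose source and
   destination have the same element size reuse the plain copy functions
   (cvt8u, cvt16u, cvt32s, cvt64s); that is why, for instance, the
   16s -> 16s slot holds cvt16u: both amount to a 2-byte-per-element copy. */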
04680 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
04681 {
04682     static BinaryFunc cvtScaleTab[][8] =
04683     {
04684         {
04685             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
04686             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
04687             (BinaryFunc)cvtScale64f8u, 0
04688         },
04689         {
04690             (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
04691             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
04692             (BinaryFunc)cvtScale64f8s, 0
04693         },
04694         {
04695             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
04696             (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
04697             (BinaryFunc)cvtScale64f16u, 0
04698         },
04699         {
04700             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
04701             (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
04702             (BinaryFunc)cvtScale64f16s, 0
04703         },
04704         {
04705             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
04706             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
04707             (BinaryFunc)cvtScale64f32s, 0
04708         },
04709         {
04710             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
04711             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
04712             (BinaryFunc)cvtScale64f32f, 0
04713         },
04714         {
04715             (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
04716             (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
04717             (BinaryFunc)cvtScale64f, 0
04718         },
04719         {
04720             0, 0, 0, 0, 0, 0, 0, 0
04721         }
04722     };
04723 
04724     return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
04725 }
04726 
04727 #ifdef HAVE_OPENCL
04728 
04729 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
04730 {
04731     const ocl::Device & d = ocl::Device::getDefault();
04732 
04733     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
04734     bool doubleSupport = d.doubleFPConfig() > 0;
04735     if (!doubleSupport && depth == CV_64F)
04736         return false;
04737 
04738     _dst.create(_src.size(), CV_8UC(cn));
04739     int kercn = 1;
04740     if (d.isIntel())
04741     {
04742         static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1};
04743         kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst,
04744                                               noArray(), noArray(), noArray(),
04745                                               noArray(), noArray(), noArray(),
04746                                               noArray(), ocl::OCL_VECTOR_MAX);
04747     }
04748     else
04749         kercn = ocl::predictOptimalVectorWidthMax(_src, _dst);
04750 
04751     int rowsPerWI = d.isIntel() ? 4 : 1;
04752     char cvt[2][50];
04753     int wdepth = std::max(depth, CV_32F);
04754     String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
04755                          " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
04756                          " -D workT1=%s -D rowsPerWI=%d%s",
04757                          ocl::typeToStr(CV_8UC(kercn)),
04758                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
04759                          ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
04760                          ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
04761                          ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
04762                          ocl::typeToStr(wdepth), rowsPerWI,
04763                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
04764     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt);
04765     if (k.empty())
04766         return false;
04767 
04768     UMat src = _src.getUMat();
04769     UMat dst = _dst.getUMat();
04770 
04771     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
04772             dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
04773 
04774     if (wdepth == CV_32F)
04775         k.args(srcarg, dstarg, (float)alpha, (float)beta);
04776     else if (wdepth == CV_64F)
04777         k.args(srcarg, dstarg, alpha, beta);
04778 
04779     size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
04780     return k.run(2, globalsize, NULL, false);
04781 }
04782 
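/* The OpenCL path builds the generic "KF" arithmetic kernel with
   OP_CONVERT_SCALE_ABS defined, processing kercn elements and rowsPerWI
   rows per work-item (4 rows on Intel devices).  Returning false at any
   stage makes the caller fall through to the CPU implementation. */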
04783 #endif
04784 
04785 }
04786 
04787 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
04788 {
04789 #ifdef HAVE_OPENCL
04790     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
04791                ocl_convertScaleAbs(_src, _dst, alpha, beta))
04792 #endif
04793 
04794     Mat src = _src.getMat();
04795     int cn = src.channels();
04796     double scale[] = {alpha, beta};
04797     _dst.create( src.dims, src.size, CV_8UC(cn) );
04798     Mat dst = _dst.getMat();
04799     BinaryFunc func = getCvtScaleAbsFunc(src.depth());
04800     CV_Assert( func != 0 );
04801 
04802     if( src.dims <= 2 )
04803     {
04804         Size sz = getContinuousSize(src, dst, cn);
04805         func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale );
04806     }
04807     else
04808     {
04809         const Mat* arrays[] = {&src, &dst, 0};
04810         uchar* ptrs[2];
04811         NAryMatIterator it(arrays, ptrs);
04812         Size sz((int)it.size*cn, 1);
04813 
04814         for( size_t i = 0; i < it.nplanes; i++, ++it )
04815             func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale );
04816     }
04817 }
04818 
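/* Usage sketch (illustrative): convertScaleAbs computes
   dst(i) = saturate_cast<uchar>(|alpha*src(i) + beta|) and always produces
   an 8-bit result, e.g. for visualizing a signed gradient:

       cv::Mat grad16s, grad8u;
       cv::Sobel(img, grad16s, CV_16S, 1, 0);     // img is any 8-bit input
       cv::convertScaleAbs(grad16s, grad8u);      // |gradient|, 8-bit
*/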
04819 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
04820 {
04821     bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
04822 
04823     if( _type < 0 )
04824         _type = _dst.fixedType() ? _dst.type() : type();
04825     else
04826         _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
04827 
04828     int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
04829     if( sdepth == ddepth && noScale )
04830     {
04831         copyTo(_dst);
04832         return;
04833     }
04834 
04835     Mat src = *this;
04836 
04837     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
04838     double scale[] = {alpha, beta};
04839     int cn = channels();
04840     CV_Assert( func != 0 );
04841 
04842     if( dims <= 2 )
04843     {
04844         _dst.create( size(), _type );
04845         Mat dst = _dst.getMat();
04846         Size sz = getContinuousSize(src, dst, cn);
04847         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
04848     }
04849     else
04850     {
04851         _dst.create( dims, size, _type );
04852         Mat dst = _dst.getMat();
04853         const Mat* arrays[] = {&src, &dst, 0};
04854         uchar* ptrs[2];
04855         NAryMatIterator it(arrays, ptrs);
04856         Size sz((int)(it.size*cn), 1);
04857 
04858         for( size_t i = 0; i < it.nplanes; i++, ++it )
04859             func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale);
04860     }
04861 }
04862 
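/* Usage sketch (illustrative): convertTo applies
   dst(i) = saturate_cast<ddepth>(alpha*src(i) + beta).  With alpha == 1 and
   beta == 0 the cheaper unscaled table is used, and a same-depth unscaled
   call degenerates to copyTo:

       cv::Mat u8, f32;
       u8.convertTo(f32, CV_32F, 1.0/255);   // bytes -> floats in [0, 1]
*/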
04863 /****************************************************************************************\
04864 *                                    LUT Transform                                       *
04865 \****************************************************************************************/
04866 
04867 namespace cv
04868 {
04869 
04870 template<typename T> static void
04871 LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
04872 {
04873     if( lutcn == 1 )
04874     {
04875         for( int i = 0; i < len*cn; i++ )
04876             dst[i] = lut[src[i]];
04877     }
04878     else
04879     {
04880         for( int i = 0; i < len*cn; i += cn )
04881             for( int k = 0; k < cn; k++ )
04882                 dst[i+k] = lut[src[i+k]*cn+k];
04883     }
04884 }
04885 
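/* With a single-channel table every channel of every pixel is looked up in
   the same 256-entry LUT; with a multi-channel table channel k uses its own
   interleaved sub-table, lut[value*cn + k]. */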
04886 static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
04887 {
04888     LUT8u_( src, lut, dst, len, cn, lutcn );
04889 }
04890 
04891 static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
04892 {
04893     LUT8u_( src, lut, dst, len, cn, lutcn );
04894 }
04895 
04896 static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
04897 {
04898     LUT8u_( src, lut, dst, len, cn, lutcn );
04899 }
04900 
04901 static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
04902 {
04903     LUT8u_( src, lut, dst, len, cn, lutcn );
04904 }
04905 
04906 static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
04907 {
04908     LUT8u_( src, lut, dst, len, cn, lutcn );
04909 }
04910 
04911 static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
04912 {
04913     LUT8u_( src, lut, dst, len, cn, lutcn );
04914 }
04915 
04916 static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
04917 {
04918     LUT8u_( src, lut, dst, len, cn, lutcn );
04919 }
04920 
04921 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
04922 
04923 static LUTFunc lutTab[] =
04924 {
04925     (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
04926     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
04927 };
04928 
04929 #ifdef HAVE_OPENCL
04930 
04931 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
04932 {
04933     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
04934 
04935     UMat src = _src.getUMat(), lut = _lut.getUMat();
04936     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
04937     UMat dst = _dst.getUMat();
04938     int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn;
04939 
04940     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
04941                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
04942                          ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
04943     if (k.empty())
04944         return false;
04945 
04946     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
04947         ocl::KernelArg::WriteOnly(dst, dcn, kercn));
04948 
04949     size_t globalSize[2] = { (size_t)dst.cols * dcn / kercn, ((size_t)dst.rows + 3) / 4 };
04950     return k.run(2, globalSize, NULL, false);
04951 }
04952 
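/* The OpenCL LUT kernel processes kercn pixels per work-item (capped at 4
   when the table has one channel) and four rows per work-item, which is
   what the global size computation above encodes. */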
04953 #endif
04954 
04955 #if defined(HAVE_IPP)
04956 namespace ipp {
04957 
04958 #if IPP_DISABLE_BLOCK // there are no performance benefits (PR #2653)
04959 class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
04960 {
04961 public:
04962     bool* ok;
04963     const Mat& src_;
04964     const Mat& lut_;
04965     Mat& dst_;
04966 
04967     typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
04968                           IppiSize roiSize, const void* pTable, int nBitSize);
04969     IppFn fn;
04970 
04971     int width;
04972 
04973     IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
04974         : ok(_ok), src_(src), lut_(lut), dst_(dst)
04975     {
04976         width = dst.cols * dst.channels();
04977 
04978         size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
04979 
04980         fn =
04981                 elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R :
04982                 elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R :
04983                 NULL;
04984 
04985         *ok = (fn != NULL);
04986     }
04987 
04988     void operator()( const cv::Range& range ) const
04989     {
04990         if (!*ok)
04991             return;
04992 
04993         const int row0 = range.start;
04994         const int row1 = range.end;
04995 
04996         Mat src = src_.rowRange(row0, row1);
04997         Mat dst = dst_.rowRange(row0, row1);
04998 
04999         IppiSize sz = { width, dst.rows };
05000 
05001         CV_DbgAssert(fn != NULL);
05002         if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0)
05003         {
05004             setIppErrorStatus();
05005             *ok = false;
05006         }
05007         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05008     }
05009 private:
05010     IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
05011     IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
05012 };
05013 #endif
05014 
05015 class IppLUTParallelBody_LUTCN : public ParallelLoopBody
05016 {
05017 public:
05018     bool *ok;
05019     const Mat& src_;
05020     const Mat& lut_;
05021     Mat& dst_;
05022 
05023     int lutcn;
05024 
05025     uchar* lutBuffer;
05026     uchar* lutTable[4];
05027 
05028     IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
05029         : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
05030     {
05031         lutcn = lut.channels();
05032         IppiSize sz256 = {256, 1};
05033 
05034         size_t elemSize1 = dst.elemSize1();
05035         CV_DbgAssert(elemSize1 == 1);
05036         lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
05037         lutTable[0] = lutBuffer + 0;
05038         lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
05039         lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
05040         lutTable[3] = lutBuffer + 3 * 256 * elemSize1;
05041 
05042         CV_DbgAssert(lutcn == 3 || lutcn == 4);
05043         if (lutcn == 3)
05044         {
05045             IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
05046             if (status < 0)
05047             {
05048                 setIppErrorStatus();
05049                 return;
05050             }
05051             CV_IMPL_ADD(CV_IMPL_IPP);
05052         }
05053         else if (lutcn == 4)
05054         {
05055             IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
05056             if (status < 0)
05057             {
05058                 setIppErrorStatus();
05059                 return;
05060             }
05061             CV_IMPL_ADD(CV_IMPL_IPP);
05062         }
05063 
05064         *ok = true;
05065     }
05066 
05067     ~IppLUTParallelBody_LUTCN()
05068     {
05069         if (lutBuffer != NULL)
05070             ippFree(lutBuffer);
05071         lutBuffer = NULL;
05072         lutTable[0] = NULL;
05073     }
05074 
05075     void operator()( const cv::Range& range ) const
05076     {
05077         if (!*ok)
05078             return;
05079 
05080         const int row0 = range.start;
05081         const int row1 = range.end;
05082 
05083         Mat src = src_.rowRange(row0, row1);
05084         Mat dst = dst_.rowRange(row0, row1);
05085 
05086         if (lutcn == 3)
05087         {
05088             if (ippiLUTPalette_8u_C3R(
05089                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
05090                     ippiSize(dst.size()), lutTable, 8) >= 0)
05091             {
05092                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05093                 return;
05094             }
05095         }
05096         else if (lutcn == 4)
05097         {
05098             if (ippiLUTPalette_8u_C4R(
05099                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
05100                     ippiSize(dst.size()), lutTable, 8) >= 0)
05101             {
05102                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05103                 return;
05104             }
05105         }
05106         setIppErrorStatus();
05107         *ok = false;
05108     }
05109 private:
05110     IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
05111     IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
05112 };
05113 } // namespace ipp
05114 
05115 static bool ipp_lut(Mat &src, Mat &lut, Mat &dst)
05116 {
05117     int lutcn = lut.channels();
05118 
05119     if(src.dims > 2)
05120         return false;
05121 
05122     bool ok = false;
05123     Ptr<ParallelLoopBody> body;
05124 
05125     size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
05126 #if IPP_DISABLE_BLOCK // there are no performance benefits (PR #2653)
05127     if (lutcn == 1)
05128     {
05129         ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
05130         body.reset(p);
05131     }
05132     else
05133 #endif
05134     if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
05135     {
05136         ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
05137         body.reset(p);
05138     }
05139 
05140     if (body != NULL && ok)
05141     {
05142         Range all(0, dst.rows);
05143         if (dst.total()>>18)
05144             parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
05145         else
05146             (*body)(all);
05147         if (ok)
05148             return true;
05149     }
05150 
05151     return false;
05152 }
05153 #endif // IPP
05154 
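/* Both ipp_lut above and the cv::LUT fallback below go parallel only for
   images with more than 2^18 elements, passing a stripe-count hint of
   total/2^16 so each stripe covers roughly 64K elements. */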
05155 class LUTParallelBody : public ParallelLoopBody
05156 {
05157 public:
05158     bool* ok;
05159     const Mat& src_;
05160     const Mat& lut_;
05161     Mat& dst_;
05162 
05163     LUTFunc func;
05164 
05165     LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
05166         : ok(_ok), src_(src), lut_(lut), dst_(dst)
05167     {
05168         func = lutTab[lut.depth()];
05169         *ok = (func != NULL);
05170     }
05171 
05172     void operator()( const cv::Range& range ) const
05173     {
05174         CV_DbgAssert(*ok);
05175 
05176         const int row0 = range.start;
05177         const int row1 = range.end;
05178 
05179         Mat src = src_.rowRange(row0, row1);
05180         Mat dst = dst_.rowRange(row0, row1);
05181 
05182         int cn = src.channels();
05183         int lutcn = lut_.channels();
05184 
05185         const Mat* arrays[] = {&src, &dst, 0};
05186         uchar* ptrs[2];
05187         NAryMatIterator it(arrays, ptrs);
05188         int len = (int)it.size;
05189 
05190         for( size_t i = 0; i < it.nplanes; i++, ++it )
05191             func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
05192     }
05193 private:
05194     LUTParallelBody(const LUTParallelBody&);
05195     LUTParallelBody& operator=(const LUTParallelBody&);
05196 };
05197 
05198 }
05199 
05200 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
05201 {
05202     int cn = _src.channels(), depth = _src.depth();
05203     int lutcn = _lut.channels();
05204 
05205     CV_Assert( (lutcn == cn || lutcn == 1) &&
05206         _lut.total() == 256 && _lut.isContinuous() &&
05207         (depth == CV_8U || depth == CV_8S) );
05208 
05209 #ifdef HAVE_OPENCL
05210     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
05211                ocl_LUT(_src, _lut, _dst))
05212 #endif
05213 
05214     Mat src = _src.getMat(), lut = _lut.getMat();
05215     _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
05216     Mat dst = _dst.getMat();
05217 
05218     CV_IPP_RUN(_src.dims() <= 2, ipp_lut(src, lut, dst));
05219 
05220     if (_src.dims() <= 2)
05221     {
05222         bool ok = false;
05223         Ptr<ParallelLoopBody> body;
05224 
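        // Note: this guard is trivially true here (body is empty, ok is
        // false); it appears to be a leftover from an earlier revision that
        // tried an accelerated body before the generic one.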
05225         if (body == NULL || ok == false)
05226         {
05227             ok = false;
05228             ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
05229             body.reset(p);
05230         }
05231         if (body != NULL && ok)
05232         {
05233             Range all(0, dst.rows);
05234             if (dst.total()>>18)
05235                 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
05236             else
05237                 (*body)(all);
05238             if (ok)
05239                 return;
05240         }
05241     }
05242 
05243     LUTFunc func = lutTab[lut.depth()];
05244     CV_Assert( func != 0 );
05245 
05246     const Mat* arrays[] = {&src, &dst, 0};
05247     uchar* ptrs[2];
05248     NAryMatIterator it(arrays, ptrs);
05249     int len = (int)it.size;
05250 
05251     for( size_t i = 0; i < it.nplanes; i++, ++it )
05252         func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn);
05253 }
05254 
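/* Usage sketch (illustrative): a 256-entry table remaps every 8-bit value
   in a single pass, e.g. gamma correction:

       cv::Mat table(1, 256, CV_8U);
       for( int i = 0; i < 256; i++ )
           table.at<uchar>(i) = cv::saturate_cast<uchar>(
               std::pow(i/255.0, 1/2.2)*255.0);
       cv::LUT(src8u, table, dst);   // dst takes the depth of the table
*/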
05255 namespace cv {
05256 
05257 #ifdef HAVE_OPENCL
05258 
05259 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
05260                            double scale, double delta )
05261 {
05262     UMat src = _src.getUMat();
05263 
05264     if( _mask.empty() )
05265         src.convertTo( _dst, dtype, scale, delta );
05266     else if (src.channels() <= 4)
05267     {
05268         const ocl::Device & dev = ocl::Device::getDefault();
05269 
05270         int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
05271                 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
05272                 rowsPerWI = dev.isIntel() ? 4 : 1;
05273 
05274         float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
05275         bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
05276                 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
05277                 haveDelta = std::fabs(delta) > DBL_EPSILON,
05278                 doubleSupport = dev.doubleFPConfig() > 0;
05279 
05280         if (!haveScale && !haveDelta && stype == dtype)
05281         {
05282             _src.copyTo(_dst, _mask);
05283             return true;
05284         }
05285         if (haveZeroScale)
05286         {
05287             _dst.setTo(Scalar(delta), _mask);
05288             return true;
05289         }
05290 
05291         if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
05292             return false;
05293 
05294         char cvt[2][40];
05295         String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
05296                              " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
05297                              ocl::typeToStr(stype), ocl::typeToStr(dtype),
05298                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
05299                              rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
05300                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
05301                              doubleSupport ? " -D DOUBLE_SUPPORT" : "",
05302                              haveScale ? " -D HAVE_SCALE" : "",
05303                              haveDelta ? " -D HAVE_DELTA" : "",
05304                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
05305 
05306         ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
05307         if (k.empty())
05308             return false;
05309 
05310         UMat mask = _mask.getUMat(), dst = _dst.getUMat();
05311 
05312         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
05313                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
05314                 dstarg = ocl::KernelArg::ReadWrite(dst);
05315 
05316         if (haveScale)
05317         {
05318             if (haveDelta)
05319                 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
05320             else
05321                 k.args(srcarg, maskarg, dstarg, fscale);
05322         }
05323         else
05324         {
05325             if (haveDelta)
05326                 k.args(srcarg, maskarg, dstarg, fdelta);
05327             else
05328                 k.args(srcarg, maskarg, dstarg);
05329         }
05330 
05331         size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
05332         return k.run(2, globalsize, NULL, false);
05333     }
05334     else
05335     {
05336         UMat temp;
05337         src.convertTo( temp, dtype, scale, delta );
05338         temp.copyTo( _dst, _mask );
05339     }
05340 
05341     return true;
05342 }
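/* The masked OpenCL path handles at most four channels directly; wider
   types fall back to convert-then-masked-copy through a temporary UMat.  A
   zero scale short-circuits to _dst.setTo(delta, mask). */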
05343 
05344 #endif
05345 
05346 }
05347 
05348 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
05349                     int norm_type, int rtype, InputArray _mask )
05350 {
05351     double scale = 1, shift = 0;
05352     if( norm_type == CV_MINMAX )
05353     {
05354         double smin = 0, smax = 0;
05355         double dmin = MIN( a, b ), dmax = MAX( a, b );
05356         minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
05357         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
05358         shift = dmin - smin*scale;
05359     }
05360     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
05361     {
05362         scale = norm( _src, norm_type, _mask );
05363         scale = scale > DBL_EPSILON ? a/scale : 0.;
05364         shift = 0;
05365     }
05366     else
05367         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
05368 
05369     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
05370     if( rtype < 0 )
05371         rtype = _dst.fixedType() ? _dst.depth() : depth;
05372     _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));
05373 
05374 #ifdef HAVE_OPENCL
05375     CV_OCL_RUN(_dst.isUMat(),
05376                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
05377 #endif
05378 
05379     Mat src = _src.getMat(), dst = _dst.getMat();
05380     if( _mask.empty() )
05381         src.convertTo( dst, rtype, scale, shift );
05382     else
05383     {
05384         Mat temp;
05385         src.convertTo( temp, rtype, scale, shift );
05386         temp.copyTo( dst, _mask );
05387     }
05388 }
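/* Usage sketch (illustrative): with CV_MINMAX (the legacy spelling of
   NORM_MINMAX) the values are affinely stretched so that they span [a, b];
   with CV_L1/CV_L2/CV_C the array is scaled so the chosen norm equals a:

       cv::Mat vis;
       cv::normalize(depthMap, vis, 0, 255, cv::NORM_MINMAX, CV_8U);
*/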
05389 
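/* What follows are the legacy C-API wrappers (CV_IMPL): each converts its
   CvArr arguments with cvarrToMat and forwards to the corresponding cv::
   function; cvSplit and cvMerge fall back to mixChannels when only a
   subset of the planes is supplied. */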
05390 CV_IMPL void
05391 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
05392 {
05393     void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 };
05394     cv::Mat src = cv::cvarrToMat(srcarr);
05395     int i, j, nz = 0;
05396     for( i = 0; i < 4; i++ )
05397         nz += dptrs[i] != 0;
05398     CV_Assert( nz > 0 );
05399     std::vector<cv::Mat> dvec(nz);
05400     std::vector<int> pairs(nz*2);
05401 
05402     for( i = j = 0; i < 4; i++ )
05403     {
05404         if( dptrs[i] != 0 )
05405         {
05406             dvec[j] = cv::cvarrToMat(dptrs[i]);
05407             CV_Assert( dvec[j].size() == src.size() );
05408             CV_Assert( dvec[j].depth() == src.depth() );
05409             CV_Assert( dvec[j].channels() == 1 );
05410             CV_Assert( i < src.channels() );
05411             pairs[j*2] = i;
05412             pairs[j*2+1] = j;
05413             j++;
05414         }
05415     }
05416     if( nz == src.channels() )
05417         cv::split( src, dvec );
05418     else
05419     {
05420         cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz );
05421     }
05422 }
05423 
05424 
05425 CV_IMPL void
05426 cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2,
05427          const void* srcarr3, void* dstarr )
05428 {
05429     const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 };
05430     cv::Mat dst = cv::cvarrToMat(dstarr);
05431     int i, j, nz = 0;
05432     for( i = 0; i < 4; i++ )
05433         nz += sptrs[i] != 0;
05434     CV_Assert( nz > 0 );
05435     std::vector<cv::Mat> svec(nz);
05436     std::vector<int> pairs(nz*2);
05437 
05438     for( i = j = 0; i < 4; i++ )
05439     {
05440         if( sptrs[i] != 0 )
05441         {
05442             svec[j] = cv::cvarrToMat(sptrs[i]);
05443             CV_Assert( svec[j].size == dst.size &&
05444                 svec[j].depth() == dst.depth() &&
05445                 svec[j].channels() == 1 && i < dst.channels() );
05446             pairs[j*2] = j;
05447             pairs[j*2+1] = i;
05448             j++;
05449         }
05450     }
05451 
05452     if( nz == dst.channels() )
05453         cv::merge( svec, dst );
05454     else
05455     {
05456         cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz );
05457     }
05458 }
05459 
05460 
05461 CV_IMPL void
05462 cvMixChannels( const CvArr** src, int src_count,
05463                CvArr** dst, int dst_count,
05464                const int* from_to, int pair_count )
05465 {
05466     cv::AutoBuffer<cv::Mat> buf(src_count + dst_count);
05467 
05468     int i;
05469     for( i = 0; i < src_count; i++ )
05470         buf[i] = cv::cvarrToMat(src[i]);
05471     for( i = 0; i < dst_count; i++ )
05472         buf[i+src_count] = cv::cvarrToMat(dst[i]);
05473     cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count);
05474 }
05475 
05476 CV_IMPL void
05477 cvConvertScaleAbs( const void* srcarr, void* dstarr,
05478                    double scale, double shift )
05479 {
05480     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
05481     CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels()));
05482     cv::convertScaleAbs( src, dst, scale, shift );
05483 }
05484 
05485 CV_IMPL void
05486 cvConvertScale( const void* srcarr, void* dstarr,
05487                 double scale, double shift )
05488 {
05489     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
05490 
05491     CV_Assert( src.size == dst.size && src.channels() == dst.channels() );
05492     src.convertTo(dst, dst.type(), scale, shift);
05493 }
05494 
05495 CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr )
05496 {
05497     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr);
05498 
05499     CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) );
05500     cv::LUT( src, lut, dst );
05501 }
05502 
05503 CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
05504                           double a, double b, int norm_type, const CvArr* maskarr )
05505 {
05506     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
05507     if( maskarr )
05508         mask = cv::cvarrToMat(maskarr);
05509     CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
05510     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
05511 }
05512 
05513 /* End of file. */
05514