Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do



convert.cpp Source File

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 
00046 #include "opencl_kernels_core.hpp"
00047 
00048 #ifdef __APPLE__
00049 #undef CV_NEON
00050 #define CV_NEON 0
00051 #endif
00052 
00053 
00054 /****************************************************************************************\
00055 *                                       split & merge                                    *
00056 \****************************************************************************************/
00057 
00058 typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
00059 
00060 static SplitFunc getSplitFunc(int depth)
00061 {
00062     static SplitFunc splitTab[] =
00063     {
00064         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
00065         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
00066     };
00067 
00068     return splitTab[depth];
00069 }
00070 
00071 typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
00072 
00073 static MergeFunc getMergeFunc(int depth)
00074 {
00075     static MergeFunc mergeTab[] =
00076     {
00077         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
00078         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
00079     };
00080 
00081     return mergeTab[depth];
00082 }
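
// A short sketch of the dispatch convention shared by both tables above,
// assuming the standard depth codes: the index is CV_MAT_DEPTH (CV_8U=0
// .. CV_64F=6), same-sized signed/unsigned depths share one routine since
// split/merge only move bytes, and the trailing 0 entry (depth 7) is
// caught by the callers' CV_Assert(func != 0). depthDispatchSketch is a
// hypothetical helper:

static void depthDispatchSketch()
{
    CV_Assert( getSplitFunc(CV_8S)  == getSplitFunc(CV_8U)  );  // 1-byte elements
    CV_Assert( getSplitFunc(CV_32F) == getSplitFunc(CV_32S) );  // 4-byte elements
}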
00083 
00084 void cv::split(const Mat& src, Mat* mv)
00085 {
00086     int k, depth = src.depth(), cn = src.channels();
00087     if( cn == 1 )
00088     {
00089         src.copyTo(mv[0]);
00090         return;
00091     }
00092 
00093     SplitFunc func = getSplitFunc(depth);
00094     CV_Assert( func != 0 );
00095 
00096     int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
00097     int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
00098     AutoBuffer<uchar>  _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
00099     const Mat** arrays = (const Mat**)(uchar*)_buf;
00100     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
00101 
00102     arrays[0] = &src;
00103     for( k = 0; k < cn; k++ )
00104     {
00105         mv[k].create(src.dims, src.size, depth);
00106         arrays[k+1] = &mv[k];
00107     }
00108 
00109     NAryMatIterator it(arrays, ptrs, cn+1);
00110     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
00111 
00112     for( size_t i = 0; i < it.nplanes; i++, ++it )
00113     {
00114         for( int j = 0; j < total; j += blocksize )
00115         {
00116             int bsz = std::min(total - j, blocksize);
00117             func( ptrs[0], &ptrs[1], bsz, cn );
00118 
00119             if( j + blocksize < total )
00120             {
00121                 ptrs[0] += bsz*esz;
00122                 for( k = 0; k < cn; k++ )
00123                     ptrs[k+1] += bsz*esz1;
00124             }
00125         }
00126     }
00127 }
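
// A minimal usage sketch for this overload, assuming a CV_8UC3 input: the
// caller supplies an array of cn Mat headers and split() allocates each
// single-channel plane (splitPlanesSketch is a hypothetical name):

static void splitPlanesSketch(const cv::Mat& bgr)
{
    cv::Mat planes[3];
    cv::split(bgr, planes);        // each planes[k] becomes CV_8UC1
    // planes[0]=B, planes[1]=G, planes[2]=R in OpenCV's channel order
}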
00128 
00129 #ifdef HAVE_OPENCL
00130 
00131 namespace cv {
00132 
00133 static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
00134 {
00135     int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
00136             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
00137 
00138     String dstargs, processelem, indexdecl;
00139     for (int i = 0; i < cn; ++i)
00140     {
00141         dstargs += format("DECLARE_DST_PARAM(%d)", i);
00142         indexdecl += format("DECLARE_INDEX(%d)", i);
00143         processelem += format("PROCESS_ELEM(%d)", i);
00144     }
00145 
00146     ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
00147                   format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
00148                          " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
00149                          ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
00150                          processelem.c_str(), indexdecl.c_str()));
00151     if (k.empty())
00152         return false;
00153 
00154     Size size = _m.size();
00155     _mv.create(cn, 1, depth);
00156     for (int i = 0; i < cn; ++i)
00157         _mv.create(size, depth, i);
00158 
00159     std::vector<UMat> dst;
00160     _mv.getUMatVector(dst);
00161 
00162     int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
00163     for (int i = 0; i < cn; ++i)
00164         argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
00165     k.set(argidx, rowsPerWI);
00166 
00167     size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
00168     return k.run(2, globalsize, NULL, false);
00169 }
00170 
00171 }
00172 
00173 #endif
00174 
00175 void cv::split(InputArray _m, OutputArrayOfArrays _mv)
00176 {
00177 #ifdef HAVE_OPENCL
00178     CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
00179                ocl_split(_m, _mv))
00180 #endif
00181 
00182     Mat m = _m.getMat();
00183     if( m.empty() )
00184     {
00185         _mv.release();
00186         return;
00187     }
00188 
00189     CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );
00190 
00191     int depth = m.depth(), cn = m.channels();
00192     _mv.create(cn, 1, depth);
00193     for (int i = 0; i < cn; ++i)
00194         _mv.create(m.dims, m.size.p, depth, i);
00195 
00196     std::vector<Mat> dst;
00197     _mv.getMatVector(dst);
00198 
00199     split(m, &dst[0]);
00200 }
00201 
00202 void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
00203 {
00204     CV_Assert( mv && n > 0 );
00205 
00206     int depth = mv[0].depth();
00207     bool allch1 = true;
00208     int k, cn = 0;
00209     size_t i;
00210 
00211     for( i = 0; i < n; i++ )
00212     {
00213         CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
00214         allch1 = allch1 && mv[i].channels() == 1;
00215         cn += mv[i].channels();
00216     }
00217 
00218     CV_Assert( 0 < cn && cn <= CV_CN_MAX );
00219     _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
00220     Mat dst = _dst.getMat();
00221 
00222     if( n == 1 )
00223     {
00224         mv[0].copyTo(dst);
00225         return;
00226     }
00227 
00228     if( !allch1 )
00229     {
00230         AutoBuffer<int>  pairs(cn*2);
00231         int j, ni=0;
00232 
00233         for( i = 0, j = 0; i < n; i++, j += ni )
00234         {
00235             ni = mv[i].channels();
00236             for( k = 0; k < ni; k++ )
00237             {
00238                 pairs[(j+k)*2] = j + k;
00239                 pairs[(j+k)*2+1] = j + k;
00240             }
00241         }
00242         mixChannels( mv, n, &dst, 1, &pairs[0], cn );
00243         return;
00244     }
00245 
00246     size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
00247     int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
00248     AutoBuffer<uchar>  _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
00249     const Mat** arrays = (const Mat**)(uchar*)_buf;
00250     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
00251 
00252     arrays[0] = &dst;
00253     for( k = 0; k < cn; k++ )
00254         arrays[k+1] = &mv[k];
00255 
00256     NAryMatIterator it(arrays, ptrs, cn+1);
00257     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
00258     MergeFunc func = getMergeFunc(depth);
00259 
00260     for( i = 0; i < it.nplanes; i++, ++it )
00261     {
00262         for( int j = 0; j < total; j += blocksize )
00263         {
00264             int bsz = std::min(total - j, blocksize);
00265             func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );
00266 
00267             if( j + blocksize < total )
00268             {
00269                 ptrs[0] += bsz*esz;
00270                 for( int t = 0; t < cn; t++ )
00271                     ptrs[t+1] += bsz*esz1;
00272             }
00273         }
00274     }
00275 }
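
// A minimal sketch of this plain-array overload, assuming three CV_8UC1
// planes of identical size; inputs may also be multi-channel, in which
// case their channel counts are summed into the output type
// (mergePlanesSketch is a hypothetical name):

static void mergePlanesSketch(const cv::Mat planes[3], cv::Mat& bgr)
{
    cv::merge(planes, 3, bgr);     // bgr becomes CV_8UC3
}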
00276 
00277 #ifdef HAVE_OPENCL
00278 
00279 namespace cv {
00280 
00281 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
00282 {
00283     std::vector<UMat> src, ksrc;
00284     _mv.getUMatVector(src);
00285     CV_Assert(!src.empty());
00286 
00287     int type = src[0].type(), depth = CV_MAT_DEPTH(type),
00288             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
00289     Size size = src[0].size();
00290 
00291     for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
00292     {
00293         int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
00294                 esz1 = CV_ELEM_SIZE1(idepth);
00295         if (src[i].dims > 2)
00296             return false;
00297 
00298         CV_Assert(size == src[i].size() && depth == idepth);
00299 
00300         for (int cn = 0; cn < icn; ++cn)
00301         {
00302             UMat tsrc = src[i];
00303             tsrc.offset += cn * esz1;
00304             ksrc.push_back(tsrc);
00305         }
00306     }
00307     int dcn = (int)ksrc.size();
00308 
00309     String srcargs, processelem, cndecl, indexdecl;
00310     for (int i = 0; i < dcn; ++i)
00311     {
00312         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
00313         processelem += format("PROCESS_ELEM(%d)", i);
00314         indexdecl += format("DECLARE_INDEX(%d)", i);
00315         cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
00316     }
00317 
00318     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
00319                   format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
00320                          " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
00321                          dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
00322                          indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
00323     if (k.empty())
00324         return false;
00325 
00326     _dst.create(size, CV_MAKE_TYPE(depth, dcn));
00327     UMat dst = _dst.getUMat();
00328 
00329     int argidx = 0;
00330     for (int i = 0; i < dcn; ++i)
00331         argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
00332     argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
00333     k.set(argidx, rowsPerWI);
00334 
00335     size_t globalsize[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
00336     return k.run(2, globalsize, NULL, false);
00337 }
00338 
00339 }
00340 
00341 #endif
00342 
00343 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
00344 {
00345 #ifdef HAVE_OPENCL
00346     CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
00347                ocl_merge(_mv, _dst))
00348 #endif
00349 
00350     std::vector<Mat> mv;
00351     _mv.getMatVector(mv);
00352     merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
00353 }
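
// A round-trip sketch using the InputArrayOfArrays overloads, assuming an
// arbitrary multi-channel source (roundTripSketch is a hypothetical name):

static void roundTripSketch(const cv::Mat& src, cv::Mat& dst)
{
    std::vector<cv::Mat> mv;
    cv::split(src, mv);            // mv.size() == src.channels()
    cv::merge(mv, dst);            // dst gets the same type and data as src
}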
00354 
00355 /****************************************************************************************\
00356 *                       Generalized split/merge: mixing channels                         *
00357 \****************************************************************************************/
00358 
00359 namespace cv
00360 {
00361 
00362 template<typename T> static void
00363 mixChannels_( const T** src, const int* sdelta,
00364               T** dst, const int* ddelta,
00365               int len, int npairs )
00366 {
00367     int i, k;
00368     for( k = 0; k < npairs; k++ )
00369     {
00370         const T* s = src[k];
00371         T* d = dst[k];
00372         int ds = sdelta[k], dd = ddelta[k];
00373         if( s )
00374         {
00375             for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
00376             {
00377                 T t0 = s[0], t1 = s[ds];
00378                 d[0] = t0; d[dd] = t1;
00379             }
00380             if( i < len )
00381                 d[0] = s[0];
00382         }
00383         else
00384         {
00385             for( i = 0; i <= len - 2; i += 2, d += dd*2 )
00386                 d[0] = d[dd] = 0;
00387             if( i < len )
00388                 d[0] = 0;
00389         }
00390     }
00391 }
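
// Here sdelta[k] and ddelta[k] are per-pair strides in elements: a pair
// reading channel sc of an interleaved scn-channel row advances by scn per
// pixel, and likewise for the destination. A scalar sketch of one pair
// under those assumptions (copyOneChannelSketch is a hypothetical name):

static void copyOneChannelSketch(const uchar* srcRow, int scn, int sc,
                                 uchar* dstRow, int dcn, int dc, int len)
{
    const uchar* s = srcRow + sc;  // like src[k]: row base + channel offset
    uchar* d = dstRow + dc;
    for( int i = 0; i < len; i++, s += scn, d += dcn )  // sdelta=scn, ddelta=dcn
        *d = *s;
}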
00392 
00393 
00394 static void mixChannels8u( const uchar** src, const int* sdelta,
00395                            uchar** dst, const int* ddelta,
00396                            int len, int npairs )
00397 {
00398     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00399 }
00400 
00401 static void mixChannels16u( const ushort** src, const int* sdelta,
00402                             ushort** dst, const int* ddelta,
00403                             int len, int npairs )
00404 {
00405     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00406 }
00407 
00408 static void mixChannels32s( const int** src, const int* sdelta,
00409                             int** dst, const int* ddelta,
00410                             int len, int npairs )
00411 {
00412     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00413 }
00414 
00415 static void mixChannels64s( const int64** src, const int* sdelta,
00416                             int64** dst, const int* ddelta,
00417                             int len, int npairs )
00418 {
00419     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
00420 }
00421 
00422 typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
00423         uchar** dst, const int* ddelta, int len, int npairs );
00424 
00425 static MixChannelsFunc getMixchFunc(int depth)
00426 {
00427     static MixChannelsFunc mixchTab[] =
00428     {
00429         (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
00430         (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
00431         (MixChannelsFunc)mixChannels64s, 0
00432     };
00433 
00434     return mixchTab[depth];
00435 }
00436 
00437 }
00438 
00439 void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
00440 {
00441     if( npairs == 0 )
00442         return;
00443     CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );
00444 
00445     size_t i, j, k, esz1 = dst[0].elemSize1();
00446     int depth = dst[0].depth();
00447 
00448     AutoBuffer<uchar>  buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
00449     const Mat** arrays = (const Mat**)(uchar*)buf;
00450     uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
00451     const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
00452     uchar** dsts = (uchar**)(srcs + npairs);
00453     int* tab = (int*)(dsts + npairs);
00454     int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;
00455 
00456     for( i = 0; i < nsrcs; i++ )
00457         arrays[i] = &src[i];
00458     for( i = 0; i < ndsts; i++ )
00459         arrays[i + nsrcs] = &dst[i];
00460     ptrs[nsrcs + ndsts] = 0;
00461 
00462     for( i = 0; i < npairs; i++ )
00463     {
00464         int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
00465         if( i0 >= 0 )
00466         {
00467             for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
00468                 if( i0 < src[j].channels() )
00469                     break;
00470             CV_Assert(j < nsrcs && src[j].depth() == depth);
00471             tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
00472             sdelta[i] = src[j].channels();
00473         }
00474         else
00475         {
00476             tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
00477             sdelta[i] = 0;
00478         }
00479 
00480         for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
00481             if( i1 < dst[j].channels() )
00482                 break;
00483         CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
00484         tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
00485         ddelta[i] = dst[j].channels();
00486     }
00487 
00488     NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
00489     int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
00490     MixChannelsFunc func = getMixchFunc(depth);
00491 
00492     for( i = 0; i < it.nplanes; i++, ++it )
00493     {
00494         for( k = 0; k < npairs; k++ )
00495         {
00496             srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
00497             dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
00498         }
00499 
00500         for( int t = 0; t < total; t += blocksize )
00501         {
00502             int bsz = std::min(total - t, blocksize);
00503             func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );
00504 
00505             if( t + blocksize < total )
00506                 for( k = 0; k < npairs; k++ )
00507                 {
00508                     srcs[k] += blocksize*sdelta[k]*esz1;
00509                     dsts[k] += blocksize*ddelta[k]*esz1;
00510                 }
00511         }
00512     }
00513 }
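
// A usage sketch adapted from the OpenCV reference example for this
// function: fromTo holds (input, output) channel index pairs, with input
// channels numbered consecutively across all source arrays and output
// channels across all destination arrays. Note that the destinations must
// be preallocated for this overload (bgraToPlanesSketch is hypothetical):

static void bgraToPlanesSketch(const cv::Mat& bgra)   // bgra assumed CV_8UC4
{
    cv::Mat bgr( bgra.rows, bgra.cols, CV_8UC3 );
    cv::Mat alpha( bgra.rows, bgra.cols, CV_8UC1 );
    cv::Mat out[] = { bgr, alpha };                   // headers only, no copy
    int fromTo[] = { 0,0, 1,1, 2,2, 3,3 };            // B,G,R -> bgr; A -> alpha
    cv::mixChannels( &bgra, 1, out, 2, fromTo, 4 );
}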
00514 
00515 #ifdef HAVE_OPENCL
00516 
00517 namespace cv {
00518 
00519 static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
00520 {
00521     int totalChannels = 0;
00522     for (size_t i = 0, size = um.size(); i < size; ++i)
00523     {
00524         int ccn = um[i].channels();
00525         totalChannels += ccn;
00526 
00527         if (totalChannels == cn)
00528         {
00529             idx = (int)(i + 1);
00530             cnidx = 0;
00531             return;
00532         }
00533         else if (totalChannels > cn)
00534         {
00535             idx = (int)i;
00536             cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
00537             return;
00538         }
00539     }
00540 
00541     idx = cnidx = -1;
00542 }
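
// A worked example of this index helper, assuming two source images with
// channel counts {3, 2}, i.e. 5 global channels in total:
//   cn=2  ->  idx=0, cnidx=2   (third channel of um[0])
//   cn=3  ->  idx=1, cnidx=0   (um[0] exhausted exactly; start of um[1])
//   cn=4  ->  idx=1, cnidx=1   (cn - totalChannels + ccn = 4 - 5 + 2)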
00543 
00544 static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
00545                             const int* fromTo, size_t npairs)
00546 {
00547     std::vector<UMat> src, dst;
00548     _src.getUMatVector(src);
00549     _dst.getUMatVector(dst);
00550 
00551     size_t nsrc = src.size(), ndst = dst.size();
00552     CV_Assert(nsrc > 0 && ndst > 0);
00553 
00554     Size size = src[0].size();
00555     int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
00556             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
00557 
00558     for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
00559         CV_Assert(src[i].size() == size && src[i].depth() == depth);
00560     for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
00561         CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
00562 
00563     String declsrc, decldst, declproc, declcn, indexdecl;
00564     std::vector<UMat> srcargs(npairs), dstargs(npairs);
00565 
00566     for (size_t i = 0; i < npairs; ++i)
00567     {
00568         int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
00569         int src_idx, src_cnidx, dst_idx, dst_cnidx;
00570 
00571         getUMatIndex(src, scn, src_idx, src_cnidx);
00572         getUMatIndex(dst, dcn, dst_idx, dst_cnidx);
00573 
00574         CV_Assert(dst_idx >= 0 && src_idx >= 0);
00575 
00576         srcargs[i] = src[src_idx];
00577         srcargs[i].offset += src_cnidx * esz;
00578 
00579         dstargs[i] = dst[dst_idx];
00580         dstargs[i].offset += dst_cnidx * esz;
00581 
00582         declsrc += format("DECLARE_INPUT_MAT(%d)", i);
00583         decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
00584         indexdecl += format("DECLARE_INDEX(%d)", i);
00585         declproc += format("PROCESS_ELEM(%d)", i);
00586         declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
00587     }
00588 
00589     ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
00590                   format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
00591                          " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
00592                          ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
00593                          declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
00594     if (k.empty())
00595         return false;
00596 
00597     int argindex = 0;
00598     for (size_t i = 0; i < npairs; ++i)
00599         argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
00600     for (size_t i = 0; i < npairs; ++i)
00601         argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
00602     argindex = k.set(argindex, size.height);
00603     argindex = k.set(argindex, size.width);
00604     k.set(argindex, rowsPerWI);
00605 
00606     size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
00607     return k.run(2, globalsize, NULL, false);
00608 }
00609 
00610 }
00611 
00612 #endif
00613 
00614 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
00615                  const int* fromTo, size_t npairs)
00616 {
00617     if (npairs == 0 || fromTo == NULL)
00618         return;
00619 
00620 #ifdef HAVE_OPENCL
00621     CV_OCL_RUN(dst.isUMatVector(),
00622                ocl_mixChannels(src, dst, fromTo, npairs))
00623 #endif
00624 
00625     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
00626             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
00627             src.kind() != _InputArray::STD_VECTOR_UMAT;
00628     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
00629             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
00630             dst.kind() != _InputArray::STD_VECTOR_UMAT;
00631     int i;
00632     int nsrc = src_is_mat ? 1 : (int)src.total();
00633     int ndst = dst_is_mat ? 1 : (int)dst.total();
00634 
00635     CV_Assert(nsrc > 0 && ndst > 0);
00636     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
00637     Mat* buf = _buf;
00638     for( i = 0; i < nsrc; i++ )
00639         buf[i] = src.getMat(src_is_mat ? -1 : i);
00640     for( i = 0; i < ndst; i++ )
00641         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
00642     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
00643 }
00644 
00645 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
00646                      const std::vector<int>& fromTo)
00647 {
00648     if (fromTo.empty())
00649         return;
00650 
00651 #ifdef HAVE_OPENCL
00652     CV_OCL_RUN(dst.isUMatVector(),
00653                ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
00654 #endif
00655 
00656     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
00657             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
00658             src.kind() != _InputArray::STD_VECTOR_UMAT;
00659     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
00660             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
00661             dst.kind() != _InputArray::STD_VECTOR_UMAT;
00662     int i;
00663     int nsrc = src_is_mat ? 1 : (int)src.total();
00664     int ndst = dst_is_mat ? 1 : (int)dst.total();
00665 
00666     CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
00667     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
00668     Mat* buf = _buf;
00669     for( i = 0; i < nsrc; i++ )
00670         buf[i] = src.getMat(src_is_mat ? -1 : i);
00671     for( i = 0; i < ndst; i++ )
00672         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
00673     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
00674 }
00675 
00676 void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
00677 {
00678     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
00679     CV_Assert( 0 <= coi && coi < cn );
00680     int ch[] = { coi, 0 };
00681 
00682 #ifdef HAVE_OPENCL
00683     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
00684     {
00685         UMat  src = _src.getUMat();
00686         _dst.create(src.dims, &src.size[0], depth);
00687         UMat  dst = _dst.getUMat();
00688         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
00689         return;
00690     }
00691 #endif
00692 
00693     Mat src = _src.getMat();
00694     _dst.create(src.dims, &src.size[0], depth);
00695     Mat dst = _dst.getMat();
00696     mixChannels(&src, 1, &dst, 1, ch, 1);
00697 }
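
// A minimal sketch for pulling out one plane, assuming a CV_8UC3 input;
// as the code above shows, this is mixChannels() with the single pair
// {coi, 0} (getGreenSketch is a hypothetical name):

static void getGreenSketch(const cv::Mat& bgr, cv::Mat& green)
{
    cv::extractChannel(bgr, green, 1);   // coi=1 selects the G plane
}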
00698 
00699 void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
00700 {
00701     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
00702     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
00703     CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
00704     CV_Assert( 0 <= coi && coi < dcn && scn == 1 );
00705 
00706     int ch[] = { 0, coi };
00707     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
00708     {
00709         UMat  src = _src.getUMat(), dst = _dst.getUMat();
00710         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
00711         return;
00712     }
00713 
00714     Mat src = _src.getMat(), dst = _dst.getMat();
00715     mixChannels(&src, 1, &dst, 1, ch, 1);
00716 }
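
// The inverse sketch, assuming green is CV_8UC1 and bgr an already
// allocated CV_8UC3 of the same size; here the single pair is {0, coi}
// (putGreenSketch is a hypothetical name):

static void putGreenSketch(const cv::Mat& green, cv::Mat& bgr)
{
    cv::insertChannel(green, bgr, 1);    // overwrites the G plane in place
}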
00717 
00718 /****************************************************************************************\
00719 *                                convertScale[Abs]                                       *
00720 \****************************************************************************************/
00721 
00722 namespace cv
00723 {
00724 
00725 template<typename T, typename DT, typename WT>
00726 struct cvtScaleAbs_SIMD
00727 {
00728     int operator () (const T *, DT *, int, WT, WT) const
00729     {
00730         return 0;
00731     }
00732 };
00733 
00734 #if CV_SSE2
00735 
00736 template <>
00737 struct cvtScaleAbs_SIMD<uchar, uchar, float>
00738 {
00739     int operator () (const uchar * src, uchar * dst, int width,
00740                      float scale, float shift) const
00741     {
00742         int x = 0;
00743 
00744         if (USE_SSE2)
00745         {
00746             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00747                 v_zero_f = _mm_setzero_ps();
00748             __m128i v_zero_i = _mm_setzero_si128();
00749 
00750             for ( ; x <= width - 16; x += 16)
00751             {
00752                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00753                 __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
00754                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
00755                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00756                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
00757                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00758                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
00759                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
00760                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
00761                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
00762 
00763                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
00764                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
00765                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
00766             }
00767         }
00768 
00769         return x;
00770     }
00771 };
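
// One detail worth noting in the kernel above: SSE2 has no packed float
// absolute value, so |v| is computed lane-wise as max(0 - v, v) via the
// _mm_sub_ps/_mm_max_ps pair. A scalar sketch of the same identity
// (absViaMaxSketch is a hypothetical name):

static inline float absViaMaxSketch(float v)
{
    return std::max(0.0f - v, v);        // equals |v| for finite v
}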
00772 
00773 template <>
00774 struct cvtScaleAbs_SIMD<schar, uchar, float>
00775 {
00776     int operator () (const schar * src, uchar * dst, int width,
00777                      float scale, float shift) const
00778     {
00779         int x = 0;
00780 
00781         if (USE_SSE2)
00782         {
00783             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00784                 v_zero_f = _mm_setzero_ps();
00785             __m128i v_zero_i = _mm_setzero_si128();
00786 
00787             for ( ; x <= width - 16; x += 16)
00788             {
00789                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00790                 __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
00791                         v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
00792                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00793                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
00794                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00795                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00796                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
00797                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00798                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00799                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
00800                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
00801                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
00802                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
00803                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
00804 
00805                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
00806                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
00807                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
00808             }
00809         }
00810 
00811         return x;
00812     }
00813 };
00814 
00815 template <>
00816 struct cvtScaleAbs_SIMD<ushort, uchar, float>
00817 {
00818     int operator () (const ushort * src, uchar * dst, int width,
00819                      float scale, float shift) const
00820     {
00821         int x = 0;
00822 
00823         if (USE_SSE2)
00824         {
00825             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00826                 v_zero_f = _mm_setzero_ps();
00827             __m128i v_zero_i = _mm_setzero_si128();
00828 
00829             for ( ; x <= width - 8; x += 8)
00830             {
00831                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00832                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
00833                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00834                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
00835                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00836 
00837                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
00838                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
00839             }
00840         }
00841 
00842         return x;
00843     }
00844 };
00845 
00846 template <>
00847 struct cvtScaleAbs_SIMD<short, uchar, float>
00848 {
00849     int operator () (const short * src, uchar * dst, int width,
00850                      float scale, float shift) const
00851     {
00852         int x = 0;
00853 
00854         if (USE_SSE2)
00855         {
00856             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00857                 v_zero_f = _mm_setzero_ps();
00858             __m128i v_zero_i = _mm_setzero_si128();
00859 
00860             for ( ; x <= width - 8; x += 8)
00861             {
00862                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00863                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
00864                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00865                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
00866                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00867 
00868                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
00869                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
00870             }
00871         }
00872 
00873         return x;
00874     }
00875 };
00876 
00877 template <>
00878 struct cvtScaleAbs_SIMD<int, uchar, float>
00879 {
00880     int operator () (const int * src, uchar * dst, int width,
00881                      float scale, float shift) const
00882     {
00883         int x = 0;
00884 
00885         if (USE_SSE2)
00886         {
00887             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00888                 v_zero_f = _mm_setzero_ps();
00889             __m128i v_zero_i = _mm_setzero_si128();
00890 
00891             for ( ; x <= width - 8; x += 4)
00892             {
00893                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
00894                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
00895                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00896 
00897                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
00898                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
00899             }
00900         }
00901 
00902         return x;
00903     }
00904 };
00905 
00906 template <>
00907 struct cvtScaleAbs_SIMD<float, uchar, float>
00908 {
00909     int operator () (const float * src, uchar * dst, int width,
00910                      float scale, float shift) const
00911     {
00912         int x = 0;
00913 
00914         if (USE_SSE2)
00915         {
00916             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00917                 v_zero_f = _mm_setzero_ps();
00918             __m128i v_zero_i = _mm_setzero_si128();
00919 
00920             for ( ; x <= width - 8; x += 4)
00921             {
00922                 __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
00923                 v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
00924 
00925                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
00926                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
00927             }
00928         }
00929 
00930         return x;
00931     }
00932 };
00933 
00934 template <>
00935 struct cvtScaleAbs_SIMD<double, uchar, float>
00936 {
00937     int operator () (const double * src, uchar * dst, int width,
00938                      float scale, float shift) const
00939     {
00940         int x = 0;
00941 
00942         if (USE_SSE2)
00943         {
00944             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
00945                 v_zero_f = _mm_setzero_ps();
00946             __m128i v_zero_i = _mm_setzero_si128();
00947 
00948             for ( ; x <= width - 8; x += 8)
00949             {
00950                 __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
00951                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
00952                 __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
00953                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
00954 
00955                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
00956                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
00957 
00958                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
00959                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
00960 
00961                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
00962                                                   _mm_cvtps_epi32(v_dst2));
00963 
00964                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
00965             }
00966         }
00967 
00968         return x;
00969     }
00970 };
00971 
00972 #elif CV_NEON
00973 
00974 template <>
00975 struct cvtScaleAbs_SIMD<uchar, uchar, float>
00976 {
00977     int operator () (const uchar * src, uchar * dst, int width,
00978                      float scale, float shift) const
00979     {
00980         int x = 0;
00981         float32x4_t v_shift = vdupq_n_f32(shift);
00982 
00983         for ( ; x <= width - 16; x += 16)
00984         {
00985             uint8x16_t v_src = vld1q_u8(src + x);
00986             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
00987 
00988             uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half));
00989             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
00990             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
00991 
00992             v_quat = vmovl_u16(vget_high_u16(v_half));
00993             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
00994             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
00995 
00996             v_half = vmovl_u8(vget_high_u8(v_src));
00997 
00998             v_quat = vmovl_u16(vget_low_u16(v_half));
00999             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
01000             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
01001 
01002             v_quat = vmovl_u16(vget_high_u16(v_half));
01003             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
01004             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
01005 
01006             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01007                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01008             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
01009                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
01010 
01011             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
01012         }
01013 
01014         return x;
01015     }
01016 };
01017 
01018 template <>
01019 struct cvtScaleAbs_SIMD<schar, uchar, float>
01020 {
01021     int operator () (const schar * src, uchar * dst, int width,
01022                      float scale, float shift) const
01023     {
01024         int x = 0;
01025         float32x4_t v_shift = vdupq_n_f32(shift);
01026 
01027         for ( ; x <= width - 16; x += 16)
01028         {
01029             int8x16_t v_src = vld1q_s8(src + x);
01030             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
01031 
01032             int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half));
01033             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01034             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01035 
01036             v_quat = vmovl_s16(vget_high_s16(v_half));
01037             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01038             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01039 
01040             v_half = vmovl_s8(vget_high_s8(v_src));
01041 
01042             v_quat = vmovl_s16(vget_low_s16(v_half));
01043             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01044             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
01045 
01046             v_quat = vmovl_s16(vget_high_s16(v_half));
01047             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
01048             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
01049 
01050             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01051                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01052             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
01053                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
01054 
01055             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
01056         }
01057 
01058         return x;
01059     }
01060 };
01061 
01062 template <>
01063 struct cvtScaleAbs_SIMD<ushort, uchar, float>
01064 {
01065     int operator () (const ushort * src, uchar * dst, int width,
01066                      float scale, float shift) const
01067     {
01068         int x = 0;
01069         float32x4_t v_shift = vdupq_n_f32(shift);
01070 
01071         for ( ; x <= width - 8; x += 8)
01072         {
01073             uint16x8_t v_src = vld1q_u16(src + x);
01074 
01075             uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src));
01076             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
01077             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01078 
01079             v_half = vmovl_u16(vget_high_u16(v_src));
01080             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
01081             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01082 
01083             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01084                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01085 
01086             vst1_u8(dst + x, vqmovn_u16(v_dst));
01087         }
01088 
01089         return x;
01090     }
01091 };
01092 
01093 template <>
01094 struct cvtScaleAbs_SIMD<short, uchar, float>
01095 {
01096     int operator () (const short * src, uchar * dst, int width,
01097                      float scale, float shift) const
01098     {
01099         int x = 0;
01100         float32x4_t v_shift = vdupq_n_f32(shift);
01101 
01102         for ( ; x <= width - 8; x += 8)
01103         {
01104             int16x8_t v_src = vld1q_s16(src + x);
01105 
01106             int32x4_t v_half = vmovl_s16(vget_low_s16(v_src));
01107             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
01108             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01109 
01110             v_half = vmovl_s16(vget_high_s16(v_src));
01111             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
01112             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01113 
01114             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
01115                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
01116 
01117             vst1_u8(dst + x, vqmovn_u16(v_dst));
01118         }
01119 
01120         return x;
01121     }
01122 };
01123 
01124 template <>
01125 struct cvtScaleAbs_SIMD<int, uchar, float>
01126 {
01127     int operator () (const int * src, uchar * dst, int width,
01128                      float scale, float shift) const
01129     {
01130         int x = 0;
01131         float32x4_t v_shift = vdupq_n_f32(shift);
01132 
01133         for ( ; x <= width - 8; x += 8)
01134         {
01135             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale);
01136             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01137             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
01138 
01139             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale);
01140             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01141             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
01142 
01143             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
01144             vst1_u8(dst + x, vqmovn_u16(v_dst));
01145         }
01146 
01147         return x;
01148     }
01149 };
01150 
01151 template <>
01152 struct cvtScaleAbs_SIMD<float, uchar, float>
01153 {
01154     int operator () (const float * src, uchar * dst, int width,
01155                      float scale, float shift) const
01156     {
01157         int x = 0;
01158         float32x4_t v_shift = vdupq_n_f32(shift);
01159 
01160         for ( ; x <= width - 8; x += 8)
01161         {
01162             float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
01163             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
01164             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
01165 
01166             float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
01167             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
01168             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
01169 
01170             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
01171             vst1_u8(dst + x, vqmovn_u16(v_dst));
01172         }
01173 
01174         return x;
01175     }
01176 };
01177 
01178 #endif
01179 
01180 template<typename T, typename DT, typename WT> static void
01181 cvtScaleAbs_( const T* src, size_t sstep,
01182               DT* dst, size_t dstep, Size size,
01183               WT scale, WT shift )
01184 {
01185     sstep /= sizeof(src[0]);
01186     dstep /= sizeof(dst[0]);
01187     cvtScaleAbs_SIMD<T, DT, WT> vop;
01188 
01189     for( ; size.height--; src += sstep, dst += dstep )
01190     {
01191         int x = vop(src, dst, size.width, scale, shift);
01192 
01193         #if CV_ENABLE_UNROLLED
01194         for( ; x <= size.width - 4; x += 4 )
01195         {
01196             DT t0, t1;
01197             t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift));
01198             t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
01199             dst[x] = t0; dst[x+1] = t1;
01200             t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
01201             t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
01202             dst[x+2] = t0; dst[x+3] = t1;
01203         }
01204         #endif
01205         for( ; x < size.width; x++ )
01206             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
01207     }
01208 }
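
// cvtScaleAbs_ is the per-type worker behind the public convertScaleAbs():
// each element becomes saturate_cast<DT>(|src*scale + shift|), with the
// SIMD functor covering the bulk of each row and the scalar loops the
// tail. A usage sketch, assuming a CV_16S gradient such as a Sobel
// response (toDisplayableSketch is a hypothetical name):

static void toDisplayableSketch(const cv::Mat& grad16s, cv::Mat& grad8u)
{
    cv::convertScaleAbs(grad16s, grad8u, 0.5, 0.0);   // |0.5*src|, saturated to CV_8U
}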
01209 
01210 template <typename T, typename DT, typename WT>
01211 struct cvtScale_SIMD
01212 {
01213     int operator () (const T *, DT *, int, WT, WT) const
01214     {
01215         return 0;
01216     }
01217 };
01218 
01219 #if CV_SSE2
01220 
01221 // from uchar
01222 
01223 template <>
01224 struct cvtScale_SIMD<uchar, uchar, float>
01225 {
01226     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
01227     {
01228         int x = 0;
01229 
01230         if (!USE_SSE2)
01231             return x;
01232 
01233         __m128i v_zero = _mm_setzero_si128();
01234         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01235 
01236         for ( ; x <= width - 8; x += 8)
01237         {
01238             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01239             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01240             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01241 
01242             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01243             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01244 
01245             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01246                                             _mm_cvtps_epi32(v_dst_1));
01247             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01248         }
01249 
01250         return x;
01251     }
01252 };
01253 
01254 template <>
01255 struct cvtScale_SIMD<uchar, schar, float>
01256 {
01257     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
01258     {
01259         int x = 0;
01260 
01261         if (!USE_SSE2)
01262             return x;
01263 
01264         __m128i v_zero = _mm_setzero_si128();
01265         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01266 
01267         for ( ; x <= width - 8; x += 8)
01268         {
01269             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01270             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01271             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01272 
01273             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01274             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01275 
01276             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01277                                             _mm_cvtps_epi32(v_dst_1));
01278             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01279         }
01280 
01281         return x;
01282     }
01283 };
01284 
01285 #if CV_SSE4_1
01286 
01287 template <>
01288 struct cvtScale_SIMD<uchar, ushort, float>
01289 {
01290     cvtScale_SIMD()
01291     {
01292         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01293     }
01294 
01295     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
01296     {
01297         int x = 0;
01298 
01299         if (!haveSSE)
01300             return x;
01301 
01302         __m128i v_zero = _mm_setzero_si128();
01303         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01304 
01305         for ( ; x <= width - 8; x += 8)
01306         {
01307             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01308             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01309             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01310 
01311             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01312             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01313 
01314             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
01315                                              _mm_cvtps_epi32(v_dst_1));
01316             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01317         }
01318 
01319         return x;
01320     }
01321 
01322     bool haveSSE;
01323 };
01324 
01325 #endif
01326 
01327 template <>
01328 struct cvtScale_SIMD<uchar, short, float>
01329 {
01330     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
01331     {
01332         int x = 0;
01333 
01334         if (!USE_SSE2)
01335             return x;
01336 
01337         __m128i v_zero = _mm_setzero_si128();
01338         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01339 
01340         for ( ; x <= width - 8; x += 8)
01341         {
01342             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01343             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01344             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01345 
01346             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01347             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01348 
01349             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01350                                             _mm_cvtps_epi32(v_dst_1));
01351             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01352         }
01353 
01354         return x;
01355     }
01356 };
01357 
01358 template <>
01359 struct cvtScale_SIMD<uchar, int, float>
01360 {
01361     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
01362     {
01363         int x = 0;
01364 
01365         if (!USE_SSE2)
01366             return x;
01367 
01368         __m128i v_zero = _mm_setzero_si128();
01369         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01370 
01371         for ( ; x <= width - 8; x += 8)
01372         {
01373             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01374             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01375             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01376 
01377             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01378             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01379 
01380             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
01381             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
01382         }
01383 
01384         return x;
01385     }
01386 };
01387 
01388 template <>
01389 struct cvtScale_SIMD<uchar, float, float>
01390 {
01391     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
01392     {
01393         int x = 0;
01394 
01395         if (!USE_SSE2)
01396             return x;
01397 
01398         __m128i v_zero = _mm_setzero_si128();
01399         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01400 
01401         for ( ; x <= width - 8; x += 8)
01402         {
01403             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01404             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01405             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01406 
01407             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01408             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01409 
01410             _mm_storeu_ps(dst + x, v_dst_0);
01411             _mm_storeu_ps(dst + x + 4, v_dst_1);
01412         }
01413 
01414         return x;
01415     }
01416 };
01417 
01418 template <>
01419 struct cvtScale_SIMD<uchar, double, double>
01420 {
01421     int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
01422     {
01423         int x = 0;
01424 
01425         if (!USE_SSE2)
01426             return x;
01427 
01428         __m128i v_zero = _mm_setzero_si128();
01429         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
01430 
01431         for ( ; x <= width - 8; x += 8)
01432         {
01433             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
01434 
01435             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
01436             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01437             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01438             _mm_storeu_pd(dst + x, v_dst_0);
01439             _mm_storeu_pd(dst + x + 2, v_dst_1);
01440 
01441             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
01442             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01443             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01444             _mm_storeu_pd(dst + x + 4, v_dst_0);
01445             _mm_storeu_pd(dst + x + 6, v_dst_1);
01446         }
01447 
01448         return x;
01449     }
01450 };
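// With a double working type an SSE2 register holds only two lanes, so each
// 8-pixel iteration above performs four 2-lane multiply-adds: _mm_cvtepi32_pd
// converts the low two 32-bit ints of a register, and _mm_srli_si128(v, 8)
// exposes the high pair for the second conversion.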
01451 
01452 // from schar
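// SSE2 lacks a sign-extending byte conversion, so these variants synthesize
// one: _mm_unpacklo_epi8(v_zero, v_src) puts each byte into the high half of a
// 16-bit lane and _mm_srai_epi16(..., 8) shifts it back arithmetically,
// propagating the sign bit. The same trick widens 16 to 32 bits:
// _mm_unpacklo_epi16(v_zero, v_src) followed by _mm_srai_epi32(..., 16).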
01453 
01454 template <>
01455 struct cvtScale_SIMD<schar, uchar, float>
01456 {
01457     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
01458     {
01459         int x = 0;
01460 
01461         if (!USE_SSE2)
01462             return x;
01463 
01464         __m128i v_zero = _mm_setzero_si128();
01465         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01466 
01467         for ( ; x <= width - 8; x += 8)
01468         {
01469             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01470             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01471             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01472 
01473             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01474             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01475 
01476             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01477                                             _mm_cvtps_epi32(v_dst_1));
01478             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01479         }
01480 
01481         return x;
01482     }
01483 };
01484 
01485 template <>
01486 struct cvtScale_SIMD<schar, schar, float>
01487 {
01488     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
01489     {
01490         int x = 0;
01491 
01492         if (!USE_SSE2)
01493             return x;
01494 
01495         __m128i v_zero = _mm_setzero_si128();
01496         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01497 
01498         for ( ; x <= width - 8; x += 8)
01499         {
01500             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01501             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01502             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01503 
01504             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01505             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01506 
01507             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01508                                             _mm_cvtps_epi32(v_dst_1));
01509             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01510         }
01511 
01512         return x;
01513     }
01514 };
01515 
01516 #if CV_SSE4_1
01517 
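// _mm_packus_epi32 (saturating signed 32-bit to unsigned 16-bit pack) only
// exists from SSE4.1 on, so every specialization with a ushort destination is
// guarded: the constructor checks checkHardwareSupport(CV_CPU_SSE4_1) once and
// operator() falls back to the scalar path when the instruction is missing.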
01518 template <>
01519 struct cvtScale_SIMD<schar, ushort, float>
01520 {
01521     cvtScale_SIMD()
01522     {
01523         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01524     }
01525 
01526     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
01527     {
01528         int x = 0;
01529 
01530         if (!haveSSE)
01531             return x;
01532 
01533         __m128i v_zero = _mm_setzero_si128();
01534         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01535 
01536         for ( ; x <= width - 8; x += 8)
01537         {
01538             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01539             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01540             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01541 
01542             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01543             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01544 
01545             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
01546                                              _mm_cvtps_epi32(v_dst_1));
01547             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01548         }
01549 
01550         return x;
01551     }
01552 
01553     bool haveSSE;
01554 };
01555 
01556 #endif
01557 
01558 template <>
01559 struct cvtScale_SIMD<schar, short, float>
01560 {
01561     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
01562     {
01563         int x = 0;
01564 
01565         if (!USE_SSE2)
01566             return x;
01567 
01568         __m128i v_zero = _mm_setzero_si128();
01569         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01570 
01571         for ( ; x <= width - 8; x += 8)
01572         {
01573             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01574             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01575             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01576 
01577             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01578             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01579 
01580             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01581                                             _mm_cvtps_epi32(v_dst_1));
01582             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01583         }
01584 
01585         return x;
01586     }
01587 };
01588 
01589 template <>
01590 struct cvtScale_SIMD<schar, int, float>
01591 {
01592     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
01593     {
01594         int x = 0;
01595 
01596         if (!USE_SSE2)
01597             return x;
01598 
01599         __m128i v_zero = _mm_setzero_si128();
01600         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01601 
01602         for ( ; x <= width - 8; x += 8)
01603         {
01604             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01605             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01606             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01607 
01608             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01609             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01610 
01611             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
01612             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
01613         }
01614 
01615         return x;
01616     }
01617 };
01618 
01619 template <>
01620 struct cvtScale_SIMD<schar, float, float>
01621 {
01622     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
01623     {
01624         int x = 0;
01625 
01626         if (!USE_SSE2)
01627             return x;
01628 
01629         __m128i v_zero = _mm_setzero_si128();
01630         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01631 
01632         for ( ; x <= width - 8; x += 8)
01633         {
01634             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
01635             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01636             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01637 
01638             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01639             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01640 
01641             _mm_storeu_ps(dst + x, v_dst_0);
01642             _mm_storeu_ps(dst + x + 4, v_dst_1);
01643         }
01644 
01645         return x;
01646     }
01647 };
01648 
01649 template <>
01650 struct cvtScale_SIMD<schar, double, double>
01651 {
01652     int operator () (const schar * src, double * dst, int width, double scale, double shift) const
01653     {
01654         int x = 0;
01655 
01656         if (!USE_SSE2)
01657             return x;
01658 
01659         __m128i v_zero = _mm_setzero_si128();
01660         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
01661 
01662         for ( ; x <= width - 8; x += 8)
01663         {
01664             __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x)));
01665             v_src = _mm_srai_epi16(v_src, 8);
01666 
01667             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
01668             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01669             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01670             _mm_storeu_pd(dst + x, v_dst_0);
01671             _mm_storeu_pd(dst + x + 2, v_dst_1);
01672 
01673             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
01674             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01675             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01676             _mm_storeu_pd(dst + x + 4, v_dst_0);
01677             _mm_storeu_pd(dst + x + 6, v_dst_1);
01678         }
01679 
01680         return x;
01681     }
01682 };
01683 
01684 // from ushort
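// ushort lanes are zero-extended: interleaving with v_zero through
// _mm_unpacklo_epi16/_mm_unpackhi_epi16 yields non-negative 32-bit ints that
// feed straight into _mm_cvtepi32_ps.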
01685 
01686 template <>
01687 struct cvtScale_SIMD<ushort, uchar, float>
01688 {
01689     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
01690     {
01691         int x = 0;
01692 
01693         if (!USE_SSE2)
01694             return x;
01695 
01696         __m128i v_zero = _mm_setzero_si128();
01697         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01698 
01699         for ( ; x <= width - 8; x += 8)
01700         {
01701             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01702             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01703             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01704 
01705             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01706             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01707 
01708             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01709                                             _mm_cvtps_epi32(v_dst_1));
01710             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01711         }
01712 
01713         return x;
01714     }
01715 };
01716 
01717 template <>
01718 struct cvtScale_SIMD<ushort, schar, float>
01719 {
01720     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
01721     {
01722         int x = 0;
01723 
01724         if (!USE_SSE2)
01725             return x;
01726 
01727         __m128i v_zero = _mm_setzero_si128();
01728         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01729 
01730         for ( ; x <= width - 8; x += 8)
01731         {
01732             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01733             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01734             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01735 
01736             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01737             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01738 
01739             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01740                                             _mm_cvtps_epi32(v_dst_1));
01741             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01742         }
01743 
01744         return x;
01745     }
01746 };
01747 
01748 #if CV_SSE4_1
01749 
01750 template <>
01751 struct cvtScale_SIMD<ushort, ushort, float>
01752 {
01753     cvtScale_SIMD()
01754     {
01755         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01756     }
01757 
01758     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
01759     {
01760         int x = 0;
01761 
01762         if (!haveSSE)
01763             return x;
01764 
01765         __m128i v_zero = _mm_setzero_si128();
01766         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01767 
01768         for ( ; x <= width - 8; x += 8)
01769         {
01770             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01771             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01772             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01773 
01774             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01775             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01776 
01777             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
01778                                              _mm_cvtps_epi32(v_dst_1));
01779             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01780         }
01781 
01782         return x;
01783     }
01784 
01785     bool haveSSE;
01786 };
01787 
01788 #endif
01789 
01790 template <>
01791 struct cvtScale_SIMD<ushort, short, float>
01792 {
01793     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
01794     {
01795         int x = 0;
01796 
01797         if (!USE_SSE2)
01798             return x;
01799 
01800         __m128i v_zero = _mm_setzero_si128();
01801         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01802 
01803         for ( ; x <= width - 8; x += 8)
01804         {
01805             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01806             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01807             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01808 
01809             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01810             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01811 
01812             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01813                                             _mm_cvtps_epi32(v_dst_1));
01814             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
01815         }
01816 
01817         return x;
01818     }
01819 };
01820 
01821 template <>
01822 struct cvtScale_SIMD<ushort, int, float>
01823 {
01824     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
01825     {
01826         int x = 0;
01827 
01828         if (!USE_SSE2)
01829             return x;
01830 
01831         __m128i v_zero = _mm_setzero_si128();
01832         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01833 
01834         for ( ; x <= width - 8; x += 8)
01835         {
01836             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01837             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01838             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01839 
01840             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01841             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01842 
01843             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
01844             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
01845         }
01846 
01847         return x;
01848     }
01849 };
01850 
01851 template <>
01852 struct cvtScale_SIMD<ushort, float, float>
01853 {
01854     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
01855     {
01856         int x = 0;
01857 
01858         if (!USE_SSE2)
01859             return x;
01860 
01861         __m128i v_zero = _mm_setzero_si128();
01862         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01863 
01864         for ( ; x <= width - 8; x += 8)
01865         {
01866             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01867             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
01868             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01869 
01870             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
01871             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01872 
01873             _mm_storeu_ps(dst + x, v_dst_0);
01874             _mm_storeu_ps(dst + x + 4, v_dst_1);
01875         }
01876 
01877         return x;
01878     }
01879 };
01880 
01881 template <>
01882 struct cvtScale_SIMD<ushort, double, double>
01883 {
01884     int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
01885     {
01886         int x = 0;
01887 
01888         if (!USE_SSE2)
01889             return x;
01890 
01891         __m128i v_zero = _mm_setzero_si128();
01892         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
01893 
01894         for ( ; x <= width - 8; x += 8)
01895         {
01896             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01897 
01898             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
01899             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01900             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01901             _mm_storeu_pd(dst + x, v_dst_0);
01902             _mm_storeu_pd(dst + x + 2, v_dst_1);
01903 
01904             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
01905             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
01906             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
01907             _mm_storeu_pd(dst + x + 4, v_dst_0);
01908             _mm_storeu_pd(dst + x + 6, v_dst_1);
01909         }
01910 
01911         return x;
01912     }
01913 };
01914 
01915 // from short
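// short reuses the arithmetic-shift widening shown for schar above: interleave
// into the high half with _mm_unpacklo_epi16/_mm_unpackhi_epi16(v_zero, v_src),
// then _mm_srai_epi32(..., 16) restores the sign-extended value.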
01916 
01917 template <>
01918 struct cvtScale_SIMD<short, uchar, float>
01919 {
01920     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
01921     {
01922         int x = 0;
01923 
01924         if (!USE_SSE2)
01925             return x;
01926 
01927         __m128i v_zero = _mm_setzero_si128();
01928         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01929 
01930         for ( ; x <= width - 8; x += 8)
01931         {
01932             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01933             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01934             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01935 
01936             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01937             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01938 
01939             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01940                                             _mm_cvtps_epi32(v_dst_1));
01941             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
01942         }
01943 
01944         return x;
01945     }
01946 };
01947 
01948 template <>
01949 struct cvtScale_SIMD<short, schar, float>
01950 {
01951     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
01952     {
01953         int x = 0;
01954 
01955         if (!USE_SSE2)
01956             return x;
01957 
01958         __m128i v_zero = _mm_setzero_si128();
01959         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01960 
01961         for ( ; x <= width - 8; x += 8)
01962         {
01963             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
01964             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
01965             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01966 
01967             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
01968             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
01969 
01970             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
01971                                             _mm_cvtps_epi32(v_dst_1));
01972             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
01973         }
01974 
01975         return x;
01976     }
01977 };
01978 
01979 #if CV_SSE4_1
01980 
01981 template <>
01982 struct cvtScale_SIMD<short, ushort, float>
01983 {
01984     cvtScale_SIMD()
01985     {
01986         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01987     }
01988 
01989     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
01990     {
01991         int x = 0;
01992 
01993         if (!haveSSE)
01994             return x;
01995 
01996         __m128i v_zero = _mm_setzero_si128();
01997         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
01998 
01999         for ( ; x <= width - 8; x += 8)
02000         {
02001             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02002             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02003             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02004 
02005             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02006             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02007 
02008             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02009                                              _mm_cvtps_epi32(v_dst_1));
02010             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02011         }
02012 
02013         return x;
02014     }
02015 
02016     bool haveSSE;
02017 };
02018 
02019 #endif
02020 
02021 template <>
02022 struct cvtScale_SIMD<short, short, float>
02023 {
02024     int operator () (const short * src, short * dst, int width, float scale, float shift) const
02025     {
02026         int x = 0;
02027 
02028         if (!USE_SSE2)
02029             return x;
02030 
02031         __m128i v_zero = _mm_setzero_si128();
02032         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02033 
02034         for ( ; x <= width - 8; x += 8)
02035         {
02036             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02037             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02038             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02039 
02040             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02041             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02042 
02043             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02044                                             _mm_cvtps_epi32(v_dst_1));
02045             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02046         }
02047 
02048         return x;
02049     }
02050 };
02051 
02052 template <>
02053 struct cvtScale_SIMD<short, int, float>
02054 {
02055     int operator () (const short * src, int * dst, int width, float scale, float shift) const
02056     {
02057         int x = 0;
02058 
02059         if (!USE_SSE2)
02060             return x;
02061 
02062         __m128i v_zero = _mm_setzero_si128();
02063         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02064 
02065         for ( ; x <= width - 8; x += 8)
02066         {
02067             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02068             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02069             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02070 
02071             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02072             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02073 
02074             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
02075             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
02076         }
02077 
02078         return x;
02079     }
02080 };
02081 
02082 template <>
02083 struct cvtScale_SIMD<short, float, float>
02084 {
02085     int operator () (const short * src, float * dst, int width, float scale, float shift) const
02086     {
02087         int x = 0;
02088 
02089         if (!USE_SSE2)
02090             return x;
02091 
02092         __m128i v_zero = _mm_setzero_si128();
02093         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02094 
02095         for ( ; x <= width - 8; x += 8)
02096         {
02097             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02098             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
02099             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02100 
02101             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
02102             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
02103 
02104             _mm_storeu_ps(dst + x, v_dst_0);
02105             _mm_storeu_ps(dst + x + 4, v_dst_1);
02106         }
02107 
02108         return x;
02109     }
02110 };
02111 
02112 template <>
02113 struct cvtScale_SIMD<short, double, double>
02114 {
02115     int operator () (const short * src, double * dst, int width, double scale, double shift) const
02116     {
02117         int x = 0;
02118 
02119         if (!USE_SSE2)
02120             return x;
02121 
02122         __m128i v_zero = _mm_setzero_si128();
02123         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02124 
02125         for ( ; x <= width - 8; x += 8)
02126         {
02127             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02128 
02129             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
02130             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
02131             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
02132             _mm_storeu_pd(dst + x, v_dst_0);
02133             _mm_storeu_pd(dst + x + 2, v_dst_1);
02134 
02135             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
02136             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
02137             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
02138             _mm_storeu_pd(dst + x + 4, v_dst_0);
02139             _mm_storeu_pd(dst + x + 6, v_dst_1);
02140         }
02141 
02142         return x;
02143     }
02144 };
02145 
02146 // from int
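// int sources need no widening; _mm_cvtepi32_ps converts a full register
// directly. The int-to-int, int-to-float and int-to-double variants take
// double scale and shift instead of float, evidently so the working type can
// represent every 32-bit input exactly, at the cost of two lanes per register.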
02147 
02148 template <>
02149 struct cvtScale_SIMD<int, uchar, float>
02150 {
02151     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
02152     {
02153         int x = 0;
02154 
02155         if (!USE_SSE2)
02156             return x;
02157 
02158         __m128i v_zero = _mm_setzero_si128();
02159         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02160 
02161         for ( ; x <= width - 8; x += 8)
02162         {
02163             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02164             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02165 
02166             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02167             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02168 
02169             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02170                                             _mm_cvtps_epi32(v_dst_1));
02171             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
02172         }
02173 
02174         return x;
02175     }
02176 };
02177 
02178 template <>
02179 struct cvtScale_SIMD<int, schar, float>
02180 {
02181     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
02182     {
02183         int x = 0;
02184 
02185         if (!USE_SSE2)
02186             return x;
02187 
02188         __m128i v_zero = _mm_setzero_si128();
02189         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02190 
02191         for ( ; x <= width - 8; x += 8)
02192         {
02193             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02194             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02195 
02196             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02197             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02198 
02199             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02200                                             _mm_cvtps_epi32(v_dst_1));
02201             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
02202         }
02203 
02204         return x;
02205     }
02206 };
02207 
02208 #if CV_SSE4_1
02209 
02210 template <>
02211 struct cvtScale_SIMD<int, ushort, float>
02212 {
02213     cvtScale_SIMD()
02214     {
02215         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
02216     }
02217 
02218     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
02219     {
02220         int x = 0;
02221 
02222         if (!haveSSE)
02223             return x;
02224 
02225         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02226 
02227         for ( ; x <= width - 8; x += 8)
02228         {
02229             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02230             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02231 
02232             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02233             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02234 
02235             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02236                                              _mm_cvtps_epi32(v_dst_1));
02237             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02238         }
02239 
02240         return x;
02241     }
02242 
02243     bool haveSSE;
02244 };
02245 
02246 #endif
02247 
02248 template <>
02249 struct cvtScale_SIMD<int, short, float>
02250 {
02251     int operator () (const int * src, short * dst, int width, float scale, float shift) const
02252     {
02253         int x = 0;
02254 
02255         if (!USE_SSE2)
02256             return x;
02257 
02258         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02259 
02260         for ( ; x <= width - 8; x += 8)
02261         {
02262             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02263             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02264 
02265             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
02266             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
02267 
02268             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02269                                             _mm_cvtps_epi32(v_dst_1));
02270             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02271         }
02272 
02273         return x;
02274     }
02275 };
02276 
02277 template <>
02278 struct cvtScale_SIMD<int, int, double>
02279 {
02280     int operator () (const int * src, int * dst, int width, double scale, double shift) const
02281     {
02282         int x = 0;
02283 
02284         if (!USE_SSE2)
02285             return x;
02286 
02287         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02288 
02289         for ( ; x <= width - 4; x += 4)
02290         {
02291             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02292             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02293 
02294             v_src = _mm_srli_si128(v_src, 8);
02295             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02296 
02297             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)),
02298                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1)));
02299 
02300             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
02301         }
02302 
02303         return x;
02304     }
02305 };
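// _mm_cvtpd_epi32 leaves its two results in the low 64 bits of the register,
// so the two halves are glued into one full vector with _mm_movelh_ps plus
// bit-preserving casts before the store.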
02306 
02307 template <>
02308 struct cvtScale_SIMD<int, float, double>
02309 {
02310     int operator () (const int * src, float * dst, int width, double scale, double shift) const
02311     {
02312         int x = 0;
02313 
02314         if (!USE_SSE2)
02315             return x;
02316 
02317         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02318 
02319         for ( ; x <= width - 4; x += 4)
02320         {
02321             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02322             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02323 
02324             v_src = _mm_srli_si128(v_src, 8);
02325             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02326 
02327             _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0),
02328                                                  _mm_cvtpd_ps(v_dst_1)));
02329         }
02330 
02331         return x;
02332     }
02333 };
02334 
02335 template <>
02336 struct cvtScale_SIMD<int, double, double>
02337 {
02338     int operator () (const int * src, double * dst, int width, double scale, double shift) const
02339     {
02340         int x = 0;
02341 
02342         if (!USE_SSE2)
02343             return x;
02344 
02345         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02346 
02347         for ( ; x <= width - 4; x += 4)
02348         {
02349             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
02350             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02351 
02352             v_src = _mm_srli_si128(v_src, 8);
02353             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
02354 
02355             _mm_storeu_pd(dst + x, v_dst_0);
02356             _mm_storeu_pd(dst + x + 2, v_dst_1);
02357         }
02358 
02359         return x;
02360     }
02361 };
02362 
02363 // from float
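// float sources are already in the working type: one unaligned load per four
// elements, a multiply-add, and (where the destination is narrower) a rounding
// convert plus saturating pack.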
02364 
02365 template <>
02366 struct cvtScale_SIMD<float, uchar, float>
02367 {
02368     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
02369     {
02370         int x = 0;
02371 
02372         if (!USE_SSE2)
02373             return x;
02374 
02375         __m128i v_zero = _mm_setzero_si128();
02376         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02377 
02378         for ( ; x <= width - 8; x += 8)
02379         {
02380             __m128 v_src = _mm_loadu_ps(src + x);
02381             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02382 
02383             v_src = _mm_loadu_ps(src + x + 4);
02384             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02385 
02386             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02387                                             _mm_cvtps_epi32(v_dst_1));
02388             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
02389         }
02390 
02391         return x;
02392     }
02393 };
02394 
02395 template <>
02396 struct cvtScale_SIMD<float, schar, float>
02397 {
02398     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
02399     {
02400         int x = 0;
02401 
02402         if (!USE_SSE2)
02403             return x;
02404 
02405         __m128i v_zero = _mm_setzero_si128();
02406         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02407 
02408         for ( ; x <= width - 8; x += 8)
02409         {
02410             __m128 v_src = _mm_loadu_ps(src + x);
02411             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02412 
02413             v_src = _mm_loadu_ps(src + x + 4);
02414             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02415 
02416             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02417                                             _mm_cvtps_epi32(v_dst_1));
02418             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
02419         }
02420 
02421         return x;
02422     }
02423 };
02424 
02425 #if CV_SSE4_1
02426 
02427 template <>
02428 struct cvtScale_SIMD<float, ushort, float>
02429 {
02430     cvtScale_SIMD()
02431     {
02432         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
02433     }
02434 
02435     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
02436     {
02437         int x = 0;
02438 
02439         if (!haveSSE)
02440             return x;
02441 
02442         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02443 
02444         for ( ; x <= width - 8; x += 8)
02445         {
02446             __m128 v_src = _mm_loadu_ps(src + x);
02447             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02448 
02449             v_src = _mm_loadu_ps(src + x + 4);
02450             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02451 
02452             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02453                                              _mm_cvtps_epi32(v_dst_1));
02454             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02455         }
02456 
02457         return x;
02458     }
02459 
02460     bool haveSSE;
02461 };
02462 
02463 #endif
02464 
02465 template <>
02466 struct cvtScale_SIMD<float, short, float>
02467 {
02468     int operator () (const float * src, short * dst, int width, float scale, float shift) const
02469     {
02470         int x = 0;
02471 
02472         if (!USE_SSE2)
02473             return x;
02474 
02475         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02476 
02477         for ( ; x <= width - 8; x += 8)
02478         {
02479             __m128 v_src = _mm_loadu_ps(src + x);
02480             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02481 
02482             v_src = _mm_loadu_ps(src + x + 4);
02483             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02484 
02485             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02486                                             _mm_cvtps_epi32(v_dst_1));
02487             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02488         }
02489 
02490         return x;
02491     }
02492 };
02493 
02494 template <>
02495 struct cvtScale_SIMD<float, int, float>
02496 {
02497     int operator () (const float * src, int * dst, int width, float scale, float shift) const
02498     {
02499         int x = 0;
02500 
02501         if (!USE_SSE2)
02502             return x;
02503 
02504         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02505 
02506         for ( ; x <= width - 8; x += 8)
02507         {
02508             __m128 v_src = _mm_loadu_ps(src + x);
02509             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02510 
02511             v_src = _mm_loadu_ps(src + x + 4);
02512             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02513 
02514             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
02515             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
02516         }
02517 
02518         return x;
02519     }
02520 };
02521 
02522 template <>
02523 struct cvtScale_SIMD<float, float, float>
02524 {
02525     int operator () (const float * src, float * dst, int width, float scale, float shift) const
02526     {
02527         int x = 0;
02528 
02529         if (!USE_SSE2)
02530             return x;
02531 
02532         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02533 
02534         for ( ; x <= width - 4; x += 4)
02535         {
02536             __m128 v_src = _mm_loadu_ps(src + x);
02537             __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02538             _mm_storeu_ps(dst + x, v_dst);
02539         }
02540 
02541         return x;
02542     }
02543 };
02544 
02545 template <>
02546 struct cvtScale_SIMD<float, double, double>
02547 {
02548     int operator () (const float * src, double * dst, int width, double scale, double shift) const
02549     {
02550         int x = 0;
02551 
02552         if (!USE_SSE2)
02553             return x;
02554 
02555         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02556 
02557         for ( ; x <= width - 4; x += 4)
02558         {
02559             __m128 v_src = _mm_loadu_ps(src + x);
02560             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
02561             v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
02562             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
02563 
02564             _mm_storeu_pd(dst + x, v_dst_0);
02565             _mm_storeu_pd(dst + x + 2, v_dst_1);
02566         }
02567 
02568         return x;
02569     }
02570 };
02571 
02572 // from double
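// With a float working type, double inputs are narrowed first: two
// _mm_cvtpd_ps results (two floats each, left in the low halves) are combined
// by _mm_movelh_ps into one four-lane vector before scaling.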
02573 
02574 template <>
02575 struct cvtScale_SIMD<double, uchar, float>
02576 {
02577     int operator () (const double * src, uchar * dst, int width, float scale, float shift) const
02578     {
02579         int x = 0;
02580 
02581         if (!USE_SSE2)
02582             return x;
02583 
02584         __m128i v_zero = _mm_setzero_si128();
02585         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02586 
02587         for ( ; x <= width - 8; x += 8)
02588         {
02589             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02590                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02591             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02592 
02593             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02594                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02595             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02596 
02597             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02598                                             _mm_cvtps_epi32(v_dst_1));
02599             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
02600         }
02601 
02602         return x;
02603     }
02604 };
02605 
02606 template <>
02607 struct cvtScale_SIMD<double, schar, float>
02608 {
02609     int operator () (const double * src, schar * dst, int width, float scale, float shift) const
02610     {
02611         int x = 0;
02612 
02613         if (!USE_SSE2)
02614             return x;
02615 
02616         __m128i v_zero = _mm_setzero_si128();
02617         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02618 
02619         for ( ; x <= width - 8; x += 8)
02620         {
02621             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02622                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02623             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02624 
02625             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02626                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02627             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02628 
02629             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02630                                             _mm_cvtps_epi32(v_dst_1));
02631             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
02632         }
02633 
02634         return x;
02635     }
02636 };
02637 
02638 #if CV_SSE4_1
02639 
02640 template <>
02641 struct cvtScale_SIMD<double, ushort, float>
02642 {
02643     cvtScale_SIMD()
02644     {
02645         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
02646     }
02647 
02648     int operator () (const double * src, ushort * dst, int width, float scale, float shift) const
02649     {
02650         int x = 0;
02651 
02652         if (!haveSSE)
02653             return x;
02654 
02655         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02656 
02657         for ( ; x <= width - 8; x += 8)
02658         {
02659             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02660                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02661             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02662 
02663             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02664                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02665             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02666 
02667             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
02668                                              _mm_cvtps_epi32(v_dst_1));
02669             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02670         }
02671 
02672         return x;
02673     }
02674 
02675     bool haveSSE;
02676 };
02677 
02678 #endif
02679 
02680 template <>
02681 struct cvtScale_SIMD<double, short, float>
02682 {
02683     int operator () (const double * src, short * dst, int width, float scale, float shift) const
02684     {
02685         int x = 0;
02686 
02687         if (!USE_SSE2)
02688             return x;
02689 
02690         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
02691 
02692         for ( ; x <= width - 8; x += 8)
02693         {
02694             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
02695                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
02696             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02697 
02698             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
02699                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
02700             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
02701 
02702             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
02703                                             _mm_cvtps_epi32(v_dst_1));
02704             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
02705         }
02706 
02707         return x;
02708     }
02709 };
02710 
02711 template <>
02712 struct cvtScale_SIMD<double, int, double>
02713 {
02714     int operator () (const double * src, int * dst, int width, double scale, double shift) const
02715     {
02716         int x = 0;
02717 
02718         if (!USE_SSE2)
02719             return x;
02720 
02721         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02722 
02723         for ( ; x <= width - 4; x += 4)
02724         {
02725             __m128d v_src = _mm_loadu_pd(src + x);
02726             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02727 
02728             v_src = _mm_loadu_pd(src + x + 2);
02729             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02730 
02731             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)),
02732                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1)));
02733 
02734             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
02735         }
02736 
02737         return x;
02738     }
02739 };
02740 
02741 template <>
02742 struct cvtScale_SIMD<double, float, double>
02743 {
02744     int operator () (const double * src, float * dst, int width, double scale, double shift) const
02745     {
02746         int x = 0;
02747 
02748         if (!USE_SSE2)
02749             return x;
02750 
02751         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02752 
02753         for ( ; x <= width - 4; x += 4)
02754         {
02755             __m128d v_src = _mm_loadu_pd(src + x);
02756             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02757 
02758             v_src = _mm_loadu_pd(src + x + 2);
02759             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02760 
02761             __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0),
02762                                          _mm_cvtpd_ps(v_dst1));
02763 
02764             _mm_storeu_ps(dst + x, v_dst);
02765         }
02766 
02767         return x;
02768     }
02769 };
02770 
02771 template <>
02772 struct cvtScale_SIMD<double, double, double>
02773 {
02774     int operator () (const double * src, double * dst, int width, double scale, double shift) const
02775     {
02776         int x = 0;
02777 
02778         if (!USE_SSE2)
02779             return x;
02780 
02781         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
02782 
02783         for ( ; x <= width - 2; x += 2)
02784         {
02785             __m128d v_src = _mm_loadu_pd(src + x);
02786             __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
02787             _mm_storeu_pd(dst + x, v_dst);
02788         }
02789 
02790         return x;
02791     }
02792 };
02793 
02794 #elif CV_NEON
02795 
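// The NEON branch is chosen at compile time, so there is no runtime guard
// comparable to USE_SSE2 above. vmovl_u8/vmovl_u16 and their signed
// counterparts widen the lanes, cv_vrndq_u32_f32/cv_vrndq_s32_f32 are the
// rounding float-to-int helpers used throughout this file (plain vcvtq would
// truncate toward zero), and the vqmovn family narrows with saturation.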
02796 // from uchar
02797 
02798 template <>
02799 struct cvtScale_SIMD<uchar, uchar, float>
02800 {
02801     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
02802     {
02803         int x = 0;
02804         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02805 
02806         for ( ; x <= width - 8; x += 8)
02807         {
02808             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02809             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02810             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02811 
02812             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02813                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02814             vst1_u8(dst + x, vqmovn_u16(v_dst));
02815         }
02816 
02817         return x;
02818     }
02819 };
02820 
02821 template <>
02822 struct cvtScale_SIMD<uchar, schar, float>
02823 {
02824     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
02825     {
02826         int x = 0;
02827         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02828 
02829         for ( ; x <= width - 8; x += 8)
02830         {
02831             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02832             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02833             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02834 
02835             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
02836                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
02837             vst1_s8(dst + x, vqmovn_s16(v_dst));
02838         }
02839 
02840         return x;
02841     }
02842 };
02843 
02844 template <>
02845 struct cvtScale_SIMD<uchar, ushort, float>
02846 {
02847     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
02848     {
02849         int x = 0;
02850         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02851 
02852         for ( ; x <= width - 8; x += 8)
02853         {
02854             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02855             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02856             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02857 
02858             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02859                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02860             vst1q_u16(dst + x, v_dst);
02861         }
02862 
02863         return x;
02864     }
02865 };
02866 
02867 template <>
02868 struct cvtScale_SIMD<uchar, short, float>
02869 {
02870     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
02871     {
02872         int x = 0;
02873         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02874 
02875         for ( ; x <= width - 8; x += 8)
02876         {
02877             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02878             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02879             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02880 
02881             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
02882                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
02883             vst1q_s16(dst + x, v_dst);
02884         }
02885 
02886         return x;
02887     }
02888 };
02889 
02890 template <>
02891 struct cvtScale_SIMD<uchar, int, float>
02892 {
02893     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
02894     {
02895         int x = 0;
02896         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02897 
02898         for ( ; x <= width - 8; x += 8)
02899         {
02900             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02901             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
02902             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
02903 
02904             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
02905             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
02906         }
02907 
02908         return x;
02909     }
02910 };
02911 
02912 template <>
02913 struct cvtScale_SIMD<uchar, float, float>
02914 {
02915     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
02916     {
02917         int x = 0;
02918         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02919 
02920         for ( ; x <= width - 8; x += 8)
02921         {
02922             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
02923             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
02924             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
02925         }
02926 
02927         return x;
02928     }
02929 };
02930 
02931 // from schar
02932 
02933 template <>
02934 struct cvtScale_SIMD<schar, uchar, float>
02935 {
02936     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
02937     {
02938         int x = 0;
02939         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02940 
02941         for ( ; x <= width - 8; x += 8)
02942         {
02943             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
02944             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
02945             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
02946 
02947             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02948                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02949             vst1_u8(dst + x, vqmovn_u16(v_dst));
02950         }
02951 
02952         return x;
02953     }
02954 };
02955 
02956 template <>
02957 struct cvtScale_SIMD<schar, schar, float>
02958 {
02959     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
02960     {
02961         int x = 0;
02962         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02963 
02964         for ( ; x <= width - 8; x += 8)
02965         {
02966             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
02967             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
02968             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
02969 
02970             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
02971                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
02972             vst1_s8(dst + x, vqmovn_s16(v_dst));
02973         }
02974 
02975         return x;
02976     }
02977 };
02978 
02979 template <>
02980 struct cvtScale_SIMD<schar, ushort, float>
02981 {
02982     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
02983     {
02984         int x = 0;
02985         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
02986 
02987         for ( ; x <= width - 8; x += 8)
02988         {
02989             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
02990             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
02991             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
02992 
02993             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
02994                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
02995             vst1q_u16(dst + x, v_dst);
02996         }
02997 
02998         return x;
02999     }
03000 };
03001 
03002 template <>
03003 struct cvtScale_SIMD<schar, short, float>
03004 {
03005     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
03006     {
03007         int x = 0;
03008         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03009 
03010         for ( ; x <= width - 8; x += 8)
03011         {
03012             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03013             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03014             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03015 
03016             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03017                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03018             vst1q_s16(dst + x, v_dst);
03019         }
03020 
03021         return x;
03022     }
03023 };
03024 
03025 template <>
03026 struct cvtScale_SIMD<schar, int, float>
03027 {
03028     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
03029     {
03030         int x = 0;
03031         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03032 
03033         for ( ; x <= width - 8; x += 8)
03034         {
03035             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03036             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03037             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03038 
03039             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
03040             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
03041         }
03042 
03043         return x;
03044     }
03045 };
03046 
03047 template <>
03048 struct cvtScale_SIMD<schar, float, float>
03049 {
03050     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
03051     {
03052         int x = 0;
03053         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03054 
03055         for ( ; x <= width - 8; x += 8)
03056         {
03057             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03058             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
03059             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
03060         }
03061 
03062         return x;
03063     }
03064 };
03065 
03066 // from ushort
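// (Unlike the 8-bit cases above, ushort sources are loaded eight
// lanes at a time with vld1q_u16, so only one widening step,
// u16 -> u32, is needed before the float conversion.)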
03067 
03068 template <>
03069 struct cvtScale_SIMD<ushort, uchar, float>
03070 {
03071     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
03072     {
03073         int x = 0;
03074         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03075 
03076         for ( ; x <= width - 8; x += 8)
03077         {
03078             uint16x8_t v_src = vld1q_u16(src + x);
03079             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03080             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03081 
03082             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03083                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03084             vst1_u8(dst + x, vqmovn_u16(v_dst));
03085         }
03086 
03087         return x;
03088     }
03089 };
03090 
03091 template <>
03092 struct cvtScale_SIMD<ushort, schar, float>
03093 {
03094     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
03095     {
03096         int x = 0;
03097         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03098 
03099         for ( ; x <= width - 8; x += 8)
03100         {
03101             uint16x8_t v_src = vld1q_u16(src + x);
03102             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03103             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03104 
03105             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03106                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03107             vst1_s8(dst + x, vqmovn_s16(v_dst));
03108         }
03109 
03110         return x;
03111     }
03112 };
03113 
03114 template <>
03115 struct cvtScale_SIMD<ushort, ushort, float>
03116 {
03117     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
03118     {
03119         int x = 0;
03120         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03121 
03122         for ( ; x <= width - 8; x += 8)
03123         {
03124             uint16x8_t v_src = vld1q_u16(src + x);
03125             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03126             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03127 
03128             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03129                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03130             vst1q_u16(dst + x, v_dst);
03131         }
03132 
03133         return x;
03134     }
03135 };
03136 
03137 template <>
03138 struct cvtScale_SIMD<ushort, short, float>
03139 {
03140     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
03141     {
03142         int x = 0;
03143         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03144 
03145         for ( ; x <= width - 8; x += 8)
03146         {
03147             uint16x8_t v_src = vld1q_u16(src + x);
03148             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03149             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03150 
03151             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03152                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03153             vst1q_s16(dst + x, v_dst);
03154         }
03155 
03156         return x;
03157     }
03158 };
03159 
03160 template <>
03161 struct cvtScale_SIMD<ushort, int, float>
03162 {
03163     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
03164     {
03165         int x = 0;
03166         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03167 
03168         for ( ; x <= width - 8; x += 8)
03169         {
03170             uint16x8_t v_src = vld1q_u16(src + x);
03171             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
03172             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
03173 
03174             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
03175             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
03176         }
03177 
03178         return x;
03179     }
03180 };
03181 
03182 template <>
03183 struct cvtScale_SIMD<ushort, float, float>
03184 {
03185     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
03186     {
03187         int x = 0;
03188         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03189 
03190         for ( ; x <= width - 8; x += 8)
03191         {
03192             uint16x8_t v_src = vld1q_u16(src + x);
03193             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
03194             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
03195         }
03196 
03197         return x;
03198     }
03199 };
03200 
03201 // from short
03202 
03203 template <>
03204 struct cvtScale_SIMD<short, uchar, float>
03205 {
03206     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
03207     {
03208         int x = 0;
03209         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03210 
03211         for ( ; x <= width - 8; x += 8)
03212         {
03213             int16x8_t v_src = vld1q_s16(src + x);
03214             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03215             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03216 
03217             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03218                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03219             vst1_u8(dst + x, vqmovn_u16(v_dst));
03220         }
03221 
03222         return x;
03223     }
03224 };
03225 
03226 template <>
03227 struct cvtScale_SIMD<short, schar, float>
03228 {
03229     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
03230     {
03231         int x = 0;
03232         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03233 
03234         for ( ; x <= width - 8; x += 8)
03235         {
03236             int16x8_t v_src = vld1q_s16(src + x);
03237             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03238             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03239 
03240             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03241                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03242             vst1_s8(dst + x, vqmovn_s16(v_dst));
03243         }
03244 
03245         return x;
03246     }
03247 };
03248 
03249 template <>
03250 struct cvtScale_SIMD<short, ushort, float>
03251 {
03252     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
03253     {
03254         int x = 0;
03255         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03256 
03257         for ( ; x <= width - 8; x += 8)
03258         {
03259             int16x8_t v_src = vld1q_s16(src + x);
03260             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
03261             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
03262 
03263             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03264                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03265             vst1q_u16(dst + x, v_dst);
03266         }
03267 
03268         return x;
03269     }
03270 };
03271 
03272 template <>
03273 struct cvtScale_SIMD<short, float, float>
03274 {
03275     int operator () (const short * src, float * dst, int width, float scale, float shift) const
03276     {
03277         int x = 0;
03278         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03279 
03280         for ( ; x <= width - 8; x += 8)
03281         {
03282             int16x8_t v_src = vld1q_s16(src + x);
03283             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
03284             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
03285         }
03286 
03287         return x;
03288     }
03289 };
03290 
03291 // from int
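// (int lanes already match the 32-bit float width, so each iteration
// simply loads two int32x4 vectors with vld1q_s32 and converts them
// directly via vcvtq_f32_s32; no widening step is required.)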
03292 
03293 template <>
03294 struct cvtScale_SIMD<int, uchar, float>
03295 {
03296     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
03297     {
03298         int x = 0;
03299         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03300 
03301         for ( ; x <= width - 8; x += 8)
03302         {
03303             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03304             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03305 
03306             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03307                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03308             vst1_u8(dst + x, vqmovn_u16(v_dst));
03309         }
03310 
03311         return x;
03312     }
03313 };
03314 
03315 template <>
03316 struct cvtScale_SIMD<int, schar, float>
03317 {
03318     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
03319     {
03320         int x = 0;
03321         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03322 
03323         for ( ; x <= width - 8; x += 8)
03324         {
03325             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03326             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03327 
03328             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03329                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03330             vst1_s8(dst + x, vqmovn_s16(v_dst));
03331         }
03332 
03333         return x;
03334     }
03335 };
03336 
03337 template <>
03338 struct cvtScale_SIMD<int, ushort, float>
03339 {
03340     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
03341     {
03342         int x = 0;
03343         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03344 
03345         for ( ; x <= width - 8; x += 8)
03346         {
03347             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03348             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03349 
03350             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03351                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03352             vst1q_u16(dst + x, v_dst);
03353         }
03354 
03355         return x;
03356     }
03357 };
03358 
03359 template <>
03360 struct cvtScale_SIMD<int, short, float>
03361 {
03362     int operator () (const int * src, short * dst, int width, float scale, float shift) const
03363     {
03364         int x = 0;
03365         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03366 
03367         for ( ; x <= width - 8; x += 8)
03368         {
03369             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
03370             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
03371 
03372             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03373                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03374             vst1q_s16(dst + x, v_dst);
03375         }
03376 
03377         return x;
03378     }
03379 };
03380 
03381 // from float
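// (float sources need neither widening nor integer conversion; note
// that the float -> int and float -> float variants below step four
// lanes per iteration instead of eight, since no narrowing pass is
// involved.)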
03382 
03383 template <>
03384 struct cvtScale_SIMD<float, uchar, float>
03385 {
03386     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
03387     {
03388         int x = 0;
03389         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03390 
03391         for ( ; x <= width - 8; x += 8)
03392         {
03393             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03394             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03395 
03396             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03397                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03398             vst1_u8(dst + x, vqmovn_u16(v_dst));
03399         }
03400 
03401         return x;
03402     }
03403 };
03404 
03405 template <>
03406 struct cvtScale_SIMD<float, schar, float>
03407 {
03408     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
03409     {
03410         int x = 0;
03411         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03412 
03413         for ( ; x <= width - 8; x += 8)
03414         {
03415             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03416             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03417 
03418             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03419                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03420             vst1_s8(dst + x, vqmovn_s16(v_dst));
03421         }
03422 
03423         return x;
03424     }
03425 };
03426 
03427 template <>
03428 struct cvtScale_SIMD<float, ushort, float>
03429 {
03430     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
03431     {
03432         int x = 0;
03433         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03434 
03435         for ( ; x <= width - 8; x += 8)
03436         {
03437             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03438             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03439 
03440             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
03441                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
03442             vst1q_u16(dst + x, v_dst);
03443         }
03444 
03445         return x;
03446     }
03447 };
03448 
03449 template <>
03450 struct cvtScale_SIMD<float, short, float>
03451 {
03452     int operator () (const float * src, short * dst, int width, float scale, float shift) const
03453     {
03454         int x = 0;
03455         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03456 
03457         for ( ; x <= width - 8; x += 8)
03458         {
03459             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
03460             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
03461 
03462             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
03463                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
03464             vst1q_s16(dst + x, v_dst);
03465         }
03466 
03467         return x;
03468     }
03469 };
03470 
03471 template <>
03472 struct cvtScale_SIMD<float, int, float>
03473 {
03474     int operator () (const float * src, int * dst, int width, float scale, float shift) const
03475     {
03476         int x = 0;
03477         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03478 
03479         for ( ; x <= width - 4; x += 4)
03480             vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)));
03481 
03482         return x;
03483     }
03484 };
03485 
03486 template <>
03487 struct cvtScale_SIMD<float, float, float>
03488 {
03489     int operator () (const float * src, float * dst, int width, float scale, float shift) const
03490     {
03491         int x = 0;
03492         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
03493 
03494         for ( ; x <= width - 4; x += 4)
03495             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift));
03496 
03497         return x;
03498     }
03499 };
03500 
03501 #endif
03502 
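// Each cvtScale_SIMD functor processes as many leading elements of a
// row as its vector width allows and returns the index of the first
// element it did NOT handle; cvtScale_ below finishes the row with an
// (optionally unrolled) scalar saturate_cast loop. A minimal usage
// sketch for a single hypothetical 640-pixel row (buffer names and
// sizes are illustrative only):
//
//     uchar src[640];
//     short dst[640];
//     cvtScale_<uchar, short, float>(src, 640 * sizeof(uchar),
//                                    dst, 640 * sizeof(short),
//                                    Size(640, 1), 2.0f, -128.0f);
//     // dst[i] == saturate_cast<short>(src[i] * 2.0f - 128.0f)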
03503 template<typename T, typename DT, typename WT> static void
03504 cvtScale_( const T* src, size_t sstep,
03505            DT* dst, size_t dstep, Size size,
03506            WT scale, WT shift )
03507 {
03508     sstep /= sizeof(src[0]);
03509     dstep /= sizeof(dst[0]);
03510 
03511     cvtScale_SIMD<T, DT, WT> vop;
03512 
03513     for( ; size.height--; src += sstep, dst += dstep )
03514     {
03515         int x = vop(src, dst, size.width, scale, shift);
03516 
03517         #if CV_ENABLE_UNROLLED
03518         for( ; x <= size.width - 4; x += 4 )
03519         {
03520             DT t0, t1;
03521             t0 = saturate_cast<DT>(src[x]*scale + shift);
03522             t1 = saturate_cast<DT>(src[x+1]*scale + shift);
03523             dst[x] = t0; dst[x+1] = t1;
03524             t0 = saturate_cast<DT>(src[x+2]*scale + shift);
03525             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
03526             dst[x+2] = t0; dst[x+3] = t1;
03527         }
03528         #endif
03529 
03530         for( ; x < size.width; x++ )
03531             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
03532     }
03533 }
03534 
03535 // vz: optimized template specialization of cvtScale_ for short -> short
03536 template<> void
03537 cvtScale_<short, short, float>( const short* src, size_t sstep,
03538            short* dst, size_t dstep, Size size,
03539            float scale, float shift )
03540 {
03541     sstep /= sizeof(src[0]);
03542     dstep /= sizeof(dst[0]);
03543 
03544     for( ; size.height--; src += sstep, dst += dstep )
03545     {
03546         int x = 0;
03547         #if CV_SSE2
03548             if(USE_SSE2)
03549             {
03550                 __m128 scale128 = _mm_set1_ps (scale);
03551                 __m128 shift128 = _mm_set1_ps (shift);
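                // Sign-extension trick: _mm_unpacklo_epi16(r, r) duplicates each
                // 16-bit lane into a 32-bit slot, and the arithmetic shift right
                // by 16 then yields the sign-extended int32 value of each lane.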
03552                 for(; x <= size.width - 8; x += 8 )
03553                 {
03554                     __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
03555                     __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
03556                     __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
03557                     __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
03558                     rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
03559                     rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
03560                     r0 = _mm_cvtps_epi32(rf0);
03561                     r1 = _mm_cvtps_epi32(rf1);
03562                     r0 = _mm_packs_epi32(r0, r1);
03563                     _mm_storeu_si128((__m128i*)(dst + x), r0);
03564                 }
03565             }
03566         #elif CV_NEON
03567         float32x4_t v_shift = vdupq_n_f32(shift);
03568         for(; x <= size.width - 8; x += 8 )
03569         {
03570             int16x8_t v_src = vld1q_s16(src + x);
03571             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
03572             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
03573 
03574             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
03575             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
03576 
03577             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
03578                                             vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
03579         }
03580         #endif
03581 
03582         for(; x < size.width; x++ )
03583             dst[x] = saturate_cast<short>(src[x]*scale + shift);
03584     }
03585 }
03586 
03587 template<> void
03588 cvtScale_<short, int, float>( const short* src, size_t sstep,
03589            int* dst, size_t dstep, Size size,
03590            float scale, float shift )
03591 {
03592     sstep /= sizeof(src[0]);
03593     dstep /= sizeof(dst[0]);
03594 
03595     for( ; size.height--; src += sstep, dst += dstep )
03596     {
03597         int x = 0;
03598 
03599         #if CV_AVX2
03600         if (USE_AVX2)
03601         {
03602             __m256 scale256 = _mm256_set1_ps(scale);
03603             __m256 shift256 = _mm256_set1_ps(shift);
03604             const int shuffle = 0xD8;
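            // 0xD8 reorders the four 64-bit quarters as (0, 2, 1, 3); because
            // _mm256_unpacklo/hi_epi16 interleave within each 128-bit lane, this
            // pre-permute keeps the converted int32 results in element order.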
03605 
03606             for ( ; x <= size.width - 16; x += 16)
03607             {
03608                 __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
03609                 v_src = _mm256_permute4x64_epi64(v_src, shuffle);
03610                 __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
03611                 __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
03612                 __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
03613                 __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
03614                 _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
03615                 _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
03616             }
03617         }
03618         #endif
03619         #if CV_SSE2
03620         if (USE_SSE2) // roughly 5x faster than the scalar loop
03621         {
03622             __m128 scale128 = _mm_set1_ps (scale);
03623             __m128 shift128 = _mm_set1_ps (shift);
03624             for(; x <= size.width - 8; x += 8 )
03625             {
03626                 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x));
03627 
03628                 __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
03629                 __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16));
03630                 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
03631                 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
03632 
03633                 _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0));
03634                 _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1));
03635             }
03636         }
03637         #elif CV_NEON
03638         float32x4_t v_shift = vdupq_n_f32(shift);
03639         for(; x <= size.width - 8; x += 8 )
03640         {
03641             int16x8_t v_src = vld1q_s16(src + x);
03642             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
03643             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
03644 
03645             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
03646             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
03647 
03648             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
03649             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
03650         }
03651         #endif
03652 
03653         for(; x < size.width; x++ )
03654             dst[x] = saturate_cast<int>(src[x]*scale + shift);
03655     }
03656 }
03657 
03658 template <typename T, typename DT>
03659 struct Cvt_SIMD
03660 {
03661     int operator() (const T *, DT *, int) const
03662     {
03663         return 0;
03664     }
03665 };
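// The generic Cvt_SIMD functor returns 0, meaning "no elements
// processed": for type pairs with no specialization below, the whole
// row falls through to the scalar saturate_cast loop in cvt_.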
03666 
03667 #if CV_SSE2
03668 
03669 // from double
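// (Each __m128d holds only two doubles; the 8-lane variants therefore
// issue four loads, convert with _mm_cvtpd_ps, and glue the halves
// together with _mm_movelh_ps before packing down to the destination
// type, while the int and float variants stop at four lanes.)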
03670 
03671 template <>
03672 struct Cvt_SIMD<double, uchar>
03673 {
03674     int operator() (const double * src, uchar * dst, int width) const
03675     {
03676         int x = 0;
03677 
03678         if (!USE_SSE2)
03679             return x;
03680 
03681         for ( ; x <= width - 8; x += 8)
03682         {
03683             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03684             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03685             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03686             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03687 
03688             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03689             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03690 
03691             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
03692                                             _mm_cvtps_epi32(v_src1));
03693             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst));
03694         }
03695 
03696         return x;
03697     }
03698 };
03699 
03700 template <>
03701 struct Cvt_SIMD<double, schar>
03702 {
03703     int operator() (const double * src, schar * dst, int width) const
03704     {
03705         int x = 0;
03706 
03707         if (!USE_SSE2)
03708             return x;
03709 
03710         for ( ; x <= width - 8; x += 8)
03711         {
03712             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03713             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03714             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03715             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03716 
03717             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03718             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03719 
03720             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
03721                                             _mm_cvtps_epi32(v_src1));
03722             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst));
03723         }
03724 
03725         return x;
03726     }
03727 };
03728 
03729 #if CV_SSE4_1
03730 
03731 template <>
03732 struct Cvt_SIMD<double, ushort>
03733 {
03734     bool haveSIMD;
03735     Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
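    // _mm_packus_epi32 (pack with *unsigned* saturation to 16 bits) is an
    // SSE4.1 instruction, hence the runtime checkHardwareSupport() guard
    // instead of the plain USE_SSE2 test used by the other variants.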
03736 
03737     int operator() (const double * src, ushort * dst, int width) const
03738     {
03739         int x = 0;
03740 
03741         if (!haveSIMD)
03742             return x;
03743 
03744         for ( ; x <= width - 8; x += 8)
03745         {
03746             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03747             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03748             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03749             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03750 
03751             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03752             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03753 
03754             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
03755                                              _mm_cvtps_epi32(v_src1));
03756             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
03757         }
03758 
03759         return x;
03760     }
03761 };
03762 
03763 #endif // CV_SSE4_1
03764 
03765 template <>
03766 struct Cvt_SIMD<double, short>
03767 {
03768     int operator() (const double * src, short * dst, int width) const
03769     {
03770         int x = 0;
03771 
03772         if (!USE_SSE2)
03773             return x;
03774 
03775         for ( ; x <= width - 8; x += 8)
03776         {
03777             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03778             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03779             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
03780             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
03781 
03782             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03783             v_src1 = _mm_movelh_ps(v_src2, v_src3);
03784 
03785             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
03786                                             _mm_cvtps_epi32(v_src1));
03787             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
03788         }
03789 
03790         return x;
03791     }
03792 };
03793 
03794 template <>
03795 struct Cvt_SIMD<double, int>
03796 {
03797     int operator() (const double * src, int * dst, int width) const
03798     {
03799         int x = 0;
03800 
03801         if (!USE_SSE2)
03802             return x;
03803 
03804         for ( ; x <= width - 4; x += 4)
03805         {
03806             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03807             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03808             v_src0 = _mm_movelh_ps(v_src0, v_src1);
03809 
03810             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0));
03811         }
03812 
03813         return x;
03814     }
03815 };
03816 
03817 template <>
03818 struct Cvt_SIMD<double, float>
03819 {
03820     int operator() (const double * src, float * dst, int width) const
03821     {
03822         int x = 0;
03823 
03824         if (!USE_SSE2)
03825             return x;
03826 
03827         for ( ; x <= width - 4; x += 4)
03828         {
03829             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
03830             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
03831 
03832             _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1));
03833         }
03834 
03835         return x;
03836     }
03837 };
03838 
03839 
03840 #elif CV_NEON
03841 
03842 // from uchar
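// (Plain conversions carry no scale/shift, so these NEON variants
// reduce to widening/narrowing moves plus, where the destination is
// float, a single vcvtq conversion; the integer-to-integer cases need
// no rounding step at all.)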
03843 
03844 template <>
03845 struct Cvt_SIMD<uchar, schar>
03846 {
03847     int operator() (const uchar * src, schar * dst, int width) const
03848     {
03849         int x = 0;
03850 
03851         for ( ; x <= width - 8; x += 8)
03852             vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))));
03853 
03854         return x;
03855     }
03856 };
03857 
03858 
03859 template <>
03860 struct Cvt_SIMD<uchar, ushort>
03861 {
03862     int operator() (const uchar * src, ushort * dst, int width) const
03863     {
03864         int x = 0;
03865 
03866         for ( ; x <= width - 8; x += 8)
03867             vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x)));
03868 
03869         return x;
03870     }
03871 };
03872 
03873 template <>
03874 struct Cvt_SIMD<uchar, short>
03875 {
03876     int operator() (const uchar * src, short * dst, int width) const
03877     {
03878         int x = 0;
03879 
03880         for ( ; x <= width - 8; x += 8)
03881             vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))));
03882 
03883         return x;
03884     }
03885 };
03886 
03887 template <>
03888 struct Cvt_SIMD<uchar, int>
03889 {
03890     int operator() (const uchar * src, int * dst, int width) const
03891     {
03892         int x = 0;
03893 
03894         for ( ; x <= width - 8; x += 8)
03895         {
03896             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
03897             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
03898             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
03899         }
03900 
03901         return x;
03902     }
03903 };
03904 
03905 template <>
03906 struct Cvt_SIMD<uchar, float>
03907 {
03908     int operator() (const uchar * src, float * dst, int width) const
03909     {
03910         int x = 0;
03911 
03912         for ( ; x <= width - 8; x += 8)
03913         {
03914             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
03915             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
03916             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
03917         }
03918 
03919         return x;
03920     }
03921 };
03922 
03923 // from schar
03924 
03925 template <>
03926 struct Cvt_SIMD<schar, uchar>
03927 {
03928     int operator() (const schar * src, uchar * dst, int width) const
03929     {
03930         int x = 0;
03931 
03932         for ( ; x <= width - 8; x += 8)
03933             vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x))));
03934 
03935         return x;
03936     }
03937 };
03938 
03939 template <>
03940 struct Cvt_SIMD<schar, short>
03941 {
03942     int operator() (const schar * src, short * dst, int width) const
03943     {
03944         int x = 0;
03945 
03946         for ( ; x <= width - 8; x += 8)
03947             vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x)));
03948 
03949         return x;
03950     }
03951 };
03952 
03953 template <>
03954 struct Cvt_SIMD<schar, ushort>
03955 {
03956     int operator() (const schar * src, ushort * dst, int width) const
03957     {
03958         int x = 0;
03959 
03960         for ( ; x <= width - 8; x += 8)
03961         {
03962             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03963             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
03964                                             vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
03965         }
03966 
03967         return x;
03968     }
03969 };
03970 
03971 
03972 template <>
03973 struct Cvt_SIMD<schar, int>
03974 {
03975     int operator() (const schar * src, int * dst, int width) const
03976     {
03977         int x = 0;
03978 
03979         for ( ; x <= width - 8; x += 8)
03980         {
03981             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
03982             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
03983             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
03984         }
03985 
03986         return x;
03987     }
03988 };
03989 
03990 template <>
03991 struct Cvt_SIMD<schar, float>
03992 {
03993     int operator() (const schar * src, float * dst, int width) const
03994     {
03995         int x = 0;
03996 
03997         for ( ; x <= width - 8; x += 8)
03998         {
03999             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
04000             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
04001             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
04002         }
04003 
04004         return x;
04005     }
04006 };
04007 
04008 // from ushort
04009 
04010 template <>
04011 struct Cvt_SIMD<ushort, uchar>
04012 {
04013     int operator() (const ushort * src, uchar * dst, int width) const
04014     {
04015         int x = 0;
04016 
04017         for ( ; x <= width - 16; x += 16)
04018         {
04019             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
04020             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2)));
04021         }
04022 
04023         return x;
04024     }
04025 };
04026 
04027 template <>
04028 struct Cvt_SIMD<ushort, schar>
04029 {
04030     int operator() (const ushort * src, schar * dst, int width) const
04031     {
04032         int x = 0;
04033 
04034         for ( ; x <= width - 16; x += 16)
04035         {
04036             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
04037             int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
04038             int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
04039             int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
04040             int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
04041 
04042             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
04043                                           vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
04044         }
04045 
04046         return x;
04047     }
04048 };
04049 
04050 template <>
04051 struct Cvt_SIMD<ushort, short>
04052 {
04053     int operator() (const ushort * src, short * dst, int width) const
04054     {
04055         int x = 0;
04056 
04057         for ( ; x <= width - 8; x += 8)
04058         {
04059             uint16x8_t v_src = vld1q_u16(src + x);
04060             int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
04061             int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
04062 
04063             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
04064         }
04065 
04066         return x;
04067     }
04068 };
04069 
04070 template <>
04071 struct Cvt_SIMD<ushort, int>
04072 {
04073     int operator() (const ushort * src, int * dst, int width) const
04074     {
04075         int x = 0;
04076 
04077         for ( ; x <= width - 8; x += 8)
04078         {
04079             uint16x8_t v_src = vld1q_u16(src + x);
04080             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
04081             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
04082         }
04083 
04084         return x;
04085     }
04086 };
04087 
04088 template <>
04089 struct Cvt_SIMD<ushort, float>
04090 {
04091     int operator() (const ushort * src, float * dst, int width) const
04092     {
04093         int x = 0;
04094 
04095         for ( ; x <= width - 8; x += 8)
04096         {
04097             uint16x8_t v_src = vld1q_u16(src + x);
04098             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
04099             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
04100         }
04101 
04102         return x;
04103     }
04104 };
04105 
04106 // from short
04107 
04108 template <>
04109 struct Cvt_SIMD<short, uchar>
04110 {
04111     int operator() (const short * src, uchar * dst, int width) const
04112     {
04113         int x = 0;
04114 
04115         for ( ; x <= width - 16; x += 16)
04116         {
04117             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
04118             vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2)));
04119         }
04120 
04121         return x;
04122     }
04123 };
04124 
04125 template <>
04126 struct Cvt_SIMD<short, schar>
04127 {
04128     int operator() (const short * src, schar * dst, int width) const
04129     {
04130         int x = 0;
04131 
04132         for ( ; x <= width - 16; x += 16)
04133         {
04134             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
04135             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2)));
04136         }
04137 
04138         return x;
04139     }
04140 };
04141 
04142 template <>
04143 struct Cvt_SIMD<short, ushort>
04144 {
04145     int operator() (const short * src, ushort * dst, int width) const
04146     {
04147         int x = 0;
04148 
04149         for ( ; x <= width - 8; x += 8)
04150         {
04151             int16x8_t v_src = vld1q_s16(src + x);
04152             uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src)));
04153             uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src)));
04154             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
04155         }
04156 
04157         return x;
04158     }
04159 };
04160 
04161 template <>
04162 struct Cvt_SIMD<short, int>
04163 {
04164     int operator() (const short * src, int * dst, int width) const
04165     {
04166         int x = 0;
04167 
04168         for ( ; x <= width - 8; x += 8)
04169         {
04170             int16x8_t v_src = vld1q_s16(src + x);
04171             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
04172             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
04173         }
04174 
04175         return x;
04176     }
04177 };
04178 
04179 template <>
04180 struct Cvt_SIMD<short, float>
04181 {
04182     int operator() (const short * src, float * dst, int width) const
04183     {
04184         int x = 0;
04185 
04186         for ( ; x <= width - 8; x += 8)
04187         {
04188             int16x8_t v_src = vld1q_s16(src + x);
04189             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
04190             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
04191         }
04192 
04193         return x;
04194     }
04195 };
04196 
04197 // from int
04198 
04199 template <>
04200 struct Cvt_SIMD<int, uchar>
04201 {
04202     int operator() (const int * src, uchar * dst, int width) const
04203     {
04204         int x = 0;
04205 
04206         for ( ; x <= width - 16; x += 16)
04207         {
04208             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04209             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
04210             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
04211             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4)));
04212             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
04213         }
04214 
04215         return x;
04216     }
04217 };
04218 
04219 template <>
04220 struct Cvt_SIMD<int, schar>
04221 {
04222     int operator() (const int * src, schar * dst, int width) const
04223     {
04224         int x = 0;
04225 
04226         for ( ; x <= width - 16; x += 16)
04227         {
04228             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04229             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
04230             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
04231             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
04232             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
04233         }
04234 
04235         return x;
04236     }
04237 };
04238 
04239 
04240 template <>
04241 struct Cvt_SIMD<int, ushort>
04242 {
04243     int operator() (const int * src, ushort * dst, int width) const
04244     {
04245         int x = 0;
04246 
04247         for ( ; x <= width - 8; x += 8)
04248         {
04249             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04250             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
04251         }
04252 
04253         return x;
04254     }
04255 };
04256 
04257 template <>
04258 struct Cvt_SIMD<int, short>
04259 {
04260     int operator() (const int * src, short * dst, int width) const
04261     {
04262         int x = 0;
04263 
04264         for ( ; x <= width - 8; x += 8)
04265         {
04266             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
04267             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
04268         }
04269 
04270         return x;
04271     }
04272 };
04273 
04274 template <>
04275 struct Cvt_SIMD<int, float>
04276 {
04277     int operator() (const int * src, float * dst, int width) const
04278     {
04279         int x = 0;
04280 
04281         for ( ; x <= width - 4; x += 4)
04282             vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x)));
04283 
04284         return x;
04285     }
04286 };
04287 
04288 // from float
04289 
04290 template <>
04291 struct Cvt_SIMD<float, uchar>
04292 {
04293     int operator() (const float * src, uchar * dst, int width) const
04294     {
04295         int x = 0;
04296 
04297         for ( ; x <= width - 16; x += 16)
04298         {
04299             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
04300             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
04301             uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8));
04302             uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12));
04303             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
04304             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4)));
04305             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
04306         }
04307 
04308         return x;
04309     }
04310 };
04311 
04312 template <>
04313 struct Cvt_SIMD<float, schar>
04314 {
04315     int operator() (const float * src, schar * dst, int width) const
04316     {
04317         int x = 0;
04318 
04319         for ( ; x <= width - 16; x += 16)
04320         {
04321             int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x));
04322             int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4));
04323             int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8));
04324             int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12));
04325             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
04326             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
04327             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
04328         }
04329 
04330         return x;
04331     }
04332 };
04333 
04334 
04335 template <>
04336 struct Cvt_SIMD<float, ushort>
04337 {
04338     int operator() (const float * src, ushort * dst, int width) const
04339     {
04340         int x = 0;
04341 
04342         for ( ; x <= width - 8; x += 8)
04343         {
04344             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
04345             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
04346             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
04347         }
04348 
04349         return x;
04350     }
04351 };
04352 
04353 template <>
04354 struct Cvt_SIMD<float, int>
04355 {
04356     int operator() (const float * src, int * dst, int width) const
04357     {
04358         int x = 0;
04359 
04360         for ( ; x <= width - 4; x += 4)
04361             vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x)));
04362 
04363         return x;
04364     }
04365 };
04366 
04367 #endif
04368 
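// cvt_ mirrors cvtScale_: the Cvt_SIMD functor converts the vectorized
// head of each row and returns how far it got, and a scalar
// saturate_cast loop converts the remainder.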
04369 template<typename T, typename DT> static void
04370 cvt_( const T* src, size_t sstep,
04371       DT* dst, size_t dstep, Size size )
04372 {
04373     sstep /= sizeof(src[0]);
04374     dstep /= sizeof(dst[0]);
04375     Cvt_SIMD<T, DT> vop;
04376 
04377     for( ; size.height--; src += sstep, dst += dstep )
04378     {
04379         int x = vop(src, dst, size.width);
04380         #if CV_ENABLE_UNROLLED
04381         for( ; x <= size.width - 4; x += 4 )
04382         {
04383             DT t0, t1;
04384             t0 = saturate_cast<DT>(src[x]);
04385             t1 = saturate_cast<DT>(src[x+1]);
04386             dst[x] = t0; dst[x+1] = t1;
04387             t0 = saturate_cast<DT>(src[x+2]);
04388             t1 = saturate_cast<DT>(src[x+3]);
04389             dst[x+2] = t0; dst[x+3] = t1;
04390         }
04391         #endif
04392         for( ; x < size.width; x++ )
04393             dst[x] = saturate_cast<DT>(src[x]);
04394     }
04395 }
04396 
04397 // vz: optimized template specialization of cvt_ for float -> short; exercised by the Core_ConvertScale/ElemWiseTest test
04398 template<>  void
04399 cvt_<float, short>( const float* src, size_t sstep,
04400      short* dst, size_t dstep, Size size )
04401 {
04402     sstep /= sizeof(src[0]);
04403     dstep /= sizeof(dst[0]);
04404 
04405     for( ; size.height--; src += sstep, dst += dstep )
04406     {
04407         int x = 0;
04408         #if   CV_SSE2
04409         if(USE_SSE2)
04410         {
04411             for( ; x <= size.width - 8; x += 8 )
04412             {
04413                 __m128 src128 = _mm_loadu_ps (src + x);
04414                 __m128i src_int128 = _mm_cvtps_epi32 (src128);
04415 
04416                 src128 = _mm_loadu_ps (src + x + 4);
04417                 __m128i src1_int128 = _mm_cvtps_epi32 (src128);
04418 
04419                 src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
04420                 _mm_storeu_si128((__m128i*)(dst + x), src1_int128);
04421             }
04422         }
04423         #elif CV_NEON
04424         for( ; x <= size.width - 8; x += 8 )
04425         {
04426             float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
04427             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
04428                                            vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
04429             vst1q_s16(dst + x, v_dst);
04430         }
04431         #endif
04432         for( ; x < size.width; x++ )
04433             dst[x] = saturate_cast<short>(src[x]);
04434     }
04435 
04436 }
04437 
04438 
04439 template<typename T> static void
04440 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
04441 {
04442     sstep /= sizeof(src[0]);
04443     dstep /= sizeof(dst[0]);
04444 
04445     for( ; size.height--; src += sstep, dst += dstep )
04446         memcpy(dst, src, size.width*sizeof(src[0]));
04447 }
04448 
04449 #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \
04450 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04451                          dtype* dst, size_t dstep, Size size, double* scale) \
04452 { \
04453     tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
04454 }
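// For reference, DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
// expands (roughly) to:
//
//     static void cvtScaleAbs8u( const uchar* src, size_t sstep,
//                                const uchar*, size_t,
//                                uchar* dst, size_t dstep, Size size,
//                                double* scale )
//     {
//         cvtScaleAbs_(src, sstep, dst, dstep, size,
//                      (float)scale[0], (float)scale[1]);
//     }
//
// The double* parameter carries the (alpha, beta) pair; the unused
// uchar*/size_t arguments keep the signature uniform across the
// function tables built from these macros.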
04455 
04456 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
04457 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04458 dtype* dst, size_t dstep, Size size, double* scale) \
04459 { \
04460     cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
04461 }
04462 
04463 #if defined(HAVE_IPP)
04464 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
04465 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04466                          dtype* dst, size_t dstep, Size size, double*) \
04467 { \
04468     CV_IPP_RUN(src && dst, ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0)\
04469     cvt_(src, sstep, dst, dstep, size); \
04470 }
04471 
04472 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
04473 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04474                          dtype* dst, size_t dstep, Size size, double*) \
04475 { \
04476     CV_IPP_RUN(src && dst, ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0)\
04477     cvt_(src, sstep, dst, dstep, size); \
04478 }
04479 #else
04480 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
04481 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04482                          dtype* dst, size_t dstep, Size size, double*) \
04483 { \
04484     cvt_(src, sstep, dst, dstep, size); \
04485 }
04486 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
04487 #endif
04488 
04489 #define DEF_CVT_FUNC(suffix, stype, dtype) \
04490 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04491                          dtype* dst, size_t dstep, Size size, double*) \
04492 { \
04493     cvt_(src, sstep, dst, dstep, size); \
04494 }
04495 
04496 #define DEF_CPY_FUNC(suffix, stype) \
04497 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
04498                          stype* dst, size_t dstep, Size size, double*) \
04499 { \
04500     cpy_(src, sstep, dst, dstep, size); \
04501 }
04502 
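/* For reference, DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) expands to
   roughly:

       static void cvtScale8u( const uchar* src, size_t sstep,
                               const uchar*, size_t,
                               uchar* dst, size_t dstep,
                               Size size, double* scale )
       {
           cvtScale_(src, sstep, dst, dstep, size,
                     (float)scale[0], (float)scale[1]);
       }

   The unused (const uchar*, size_t) pair is only there so every generated
   function matches the common BinaryFunc signature, which carries a slot
   for a second source operand that these conversions do not need. */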
04503 
04504 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
04505 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
04506 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
04507 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
04508 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
04509 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
04510 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
04511 
04512 DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
04513 DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
04514 DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
04515 DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float)
04516 DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float)
04517 DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float)
04518 DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float)
04519 
04520 DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float)
04521 DEF_CVT_SCALE_FUNC(8s,     schar, schar, float)
04522 DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float)
04523 DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float)
04524 DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float)
04525 DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float)
04526 DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float)
04527 
04528 DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float)
04529 DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float)
04530 DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float)
04531 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float)
04532 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float)
04533 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float)
04534 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float)
04535 
04536 DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float)
04537 DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float)
04538 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float)
04539 DEF_CVT_SCALE_FUNC(16s,    short, short, float)
04540 DEF_CVT_SCALE_FUNC(32s16s, int, short, float)
04541 DEF_CVT_SCALE_FUNC(32f16s, float, short, float)
04542 DEF_CVT_SCALE_FUNC(64f16s, double, short, float)
04543 
04544 DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float)
04545 DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float)
04546 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float)
04547 DEF_CVT_SCALE_FUNC(16s32s, short, int, float)
04548 DEF_CVT_SCALE_FUNC(32s,    int, int, double)
04549 DEF_CVT_SCALE_FUNC(32f32s, float, int, float)
04550 DEF_CVT_SCALE_FUNC(64f32s, double, int, double)
04551 
04552 DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float)
04553 DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float)
04554 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float)
04555 DEF_CVT_SCALE_FUNC(16s32f, short, float, float)
04556 DEF_CVT_SCALE_FUNC(32s32f, int, float, double)
04557 DEF_CVT_SCALE_FUNC(32f,    float, float, float)
04558 DEF_CVT_SCALE_FUNC(64f32f, double, float, double)
04559 
04560 DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double)
04561 DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double)
04562 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double)
04563 DEF_CVT_SCALE_FUNC(16s64f, short, double, double)
04564 DEF_CVT_SCALE_FUNC(32s64f, int, double, double)
04565 DEF_CVT_SCALE_FUNC(32f64f, float, double, double)
04566 DEF_CVT_SCALE_FUNC(64f,    double, double, double)
04567 
04568 DEF_CPY_FUNC(8u,     uchar)
04569 DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
04570 DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
04571 DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
04572 DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
04573 DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
04574 DEF_CVT_FUNC(64f8u,  double, uchar)
04575 
04576 DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
04577 DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
04578 DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
04579 DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
04580 DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
04581 DEF_CVT_FUNC(64f8s,  double, schar)
04582 
04583 DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
04584 DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
04585 DEF_CPY_FUNC(16u,    ushort)
04586 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
04587 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
04588 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
04589 DEF_CVT_FUNC(64f16u, double, ushort)
04590 
04591 DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
04592 DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
04593 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
04594 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
04595 DEF_CVT_FUNC(32f16s, float, short)
04596 DEF_CVT_FUNC(64f16s, double, short)
04597 
04598 DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
04599 DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
04600 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
04601 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
04602 DEF_CPY_FUNC(32s,    int)
04603 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
04604 DEF_CVT_FUNC(64f32s, double, int)
04605 
04606 DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
04607 DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
04608 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
04609 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
04610 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
04611 DEF_CVT_FUNC(64f32f, double, float)
04612 
04613 DEF_CVT_FUNC(8u64f,  uchar, double)
04614 DEF_CVT_FUNC(8s64f,  schar, double)
04615 DEF_CVT_FUNC(16u64f, ushort, double)
04616 DEF_CVT_FUNC(16s64f, short, double)
04617 DEF_CVT_FUNC(32s64f, int, double)
04618 DEF_CVT_FUNC(32f64f, float, double)
04619 DEF_CPY_FUNC(64s,    int64)
04620 
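/* Naming convention: a two-part suffix encodes source then destination
   depth ("32f16u" = float -> ushort); a single token ("16u") means both
   depths match.  The wtype parameter selects the working type: float where
   it is accurate enough, double where int or double operands would
   otherwise lose precision (e.g. 32s, 32s32f, 64f32s). */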
04621 static BinaryFunc getCvtScaleAbsFunc(int depth)
04622 {
04623     static BinaryFunc cvtScaleAbsTab[] =
04624     {
04625         (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
04626         (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
04627         (BinaryFunc)cvtScaleAbs64f8u, 0
04628     };
04629 
04630     return cvtScaleAbsTab[depth];
04631 }
04632 
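/* The table above is indexed by source depth (CV_8U .. CV_64F); the
   trailing 0 entry makes an unsupported depth yield NULL, which the
   CV_Assert in convertScaleAbs() then catches. */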
04633 BinaryFunc getConvertFunc(int sdepth, int ddepth)
04634 {
04635     static BinaryFunc cvtTab[][8] =
04636     {
04637         {
04638             (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
04639             (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
04640             (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0
04641         },
04642         {
04643             (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
04644             (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
04645             (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0
04646         },
04647         {
04648             (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
04649             (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
04650             (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0
04651         },
04652         {
04653             (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
04654             (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
04655             (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0
04656         },
04657         {
04658             (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
04659             (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
04660             (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0
04661         },
04662         {
04663             (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
04664             (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
04665             (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0
04666         },
04667         {
04668             (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
04669             (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
04670             (BinaryFunc)(cvt64s), 0
04671         },
04672         {
04673             0, 0, 0, 0, 0, 0, 0, 0
04674         }
04675     };
04676 
04677     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
04678 }
04679 
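/* Both dispatch tables here are indexed as [destination depth][source
   depth].  In the unscaled table, diagonal entries whose source and
   destination have the same element size reuse the plain copy functions
   (cvt8u, cvt16u, cvt32s, cvt64s); that is why, for instance, the
   16s -> 16s slot holds cvt16u: both amount to a 2-byte-per-element copy. */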
04680 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
04681 {
04682     static BinaryFunc cvtScaleTab[][8] =
04683     {
04684         {
04685             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
04686             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
04687             (BinaryFunc)cvtScale64f8u, 0
04688         },
04689         {
04690             (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
04691             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
04692             (BinaryFunc)cvtScale64f8s, 0
04693         },
04694         {
04695             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
04696             (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
04697             (BinaryFunc)cvtScale64f16u, 0
04698         },
04699         {
04700             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
04701             (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
04702             (BinaryFunc)cvtScale64f16s, 0
04703         },
04704         {
04705             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
04706             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
04707             (BinaryFunc)cvtScale64f32s, 0
04708         },
04709         {
04710             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
04711             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
04712             (BinaryFunc)cvtScale64f32f, 0
04713         },
04714         {
04715             (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
04716             (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
04717             (BinaryFunc)cvtScale64f, 0
04718         },
04719         {
04720             0, 0, 0, 0, 0, 0, 0, 0
04721         }
04722     };
04723 
04724     return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
04725 }
04726 
04727 #ifdef HAVE_OPENCL
04728 
04729 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
04730 {
04731     const ocl::Device & d = ocl::Device::getDefault();
04732 
04733     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
04734     bool doubleSupport = d.doubleFPConfig() > 0;
04735     if (!doubleSupport && depth == CV_64F)
04736         return false;
04737 
04738     _dst.create(_src.size(), CV_8UC(cn));
04739     int kercn = 1;
04740     if (d.isIntel())
04741     {
04742         static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1};
04743         kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst,
04744                                               noArray(), noArray(), noArray(),
04745                                               noArray(), noArray(), noArray(),
04746                                               noArray(), ocl::OCL_VECTOR_MAX);
04747     }
04748     else
04749         kercn = ocl::predictOptimalVectorWidthMax(_src, _dst);
04750 
04751     int rowsPerWI = d.isIntel() ? 4 : 1;
04752     char cvt[2][50];
04753     int wdepth = std::max(depth, CV_32F);
04754     String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
04755                          " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
04756                          " -D workT1=%s -D rowsPerWI=%d%s",
04757                          ocl::typeToStr(CV_8UC(kercn)),
04758                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
04759                          ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
04760                          ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
04761                          ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
04762                          ocl::typeToStr(wdepth), rowsPerWI,
04763                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
04764     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt);
04765     if (k.empty())
04766         return false;
04767 
04768     UMat src = _src.getUMat();
04769     UMat dst = _dst.getUMat();
04770 
04771     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
04772             dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
04773 
04774     if (wdepth == CV_32F)
04775         k.args(srcarg, dstarg, (float)alpha, (float)beta);
04776     else if (wdepth == CV_64F)
04777         k.args(srcarg, dstarg, alpha, beta);
04778 
04779     size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
04780     return k.run(2, globalsize, NULL, false);
04781 }
04782 
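/* The OpenCL path builds the generic "KF" arithmetic kernel with
   OP_CONVERT_SCALE_ABS defined, processing kercn elements and rowsPerWI
   rows per work-item (4 rows on Intel devices).  Returning false at any
   stage makes the caller fall through to the CPU implementation. */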
04783 #endif
04784 
04785 }
04786 
04787 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
04788 {
04789 #ifdef HAVE_OPENCL
04790     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
04791                ocl_convertScaleAbs(_src, _dst, alpha, beta))
04792 #endif
04793 
04794     Mat src = _src.getMat();
04795     int cn = src.channels();
04796     double scale[] = {alpha, beta};
04797     _dst.create( src.dims, src.size, CV_8UC(cn) );
04798     Mat dst = _dst.getMat();
04799     BinaryFunc func = getCvtScaleAbsFunc(src.depth());
04800     CV_Assert( func != 0 );
04801 
04802     if( src.dims <= 2 )
04803     {
04804         Size sz = getContinuousSize(src, dst, cn);
04805         func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale );
04806     }
04807     else
04808     {
04809         const Mat* arrays[] = {&src, &dst, 0};
04810         uchar* ptrs[2];
04811         NAryMatIterator it(arrays, ptrs);
04812         Size sz((int)it.size*cn, 1);
04813 
04814         for( size_t i = 0; i < it.nplanes; i++, ++it )
04815             func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale );
04816     }
04817 }
04818 
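/* Usage sketch (illustrative): convertScaleAbs computes
   dst(i) = saturate_cast<uchar>(|alpha*src(i) + beta|) and always produces
   an 8-bit result, e.g. for visualizing a signed gradient:

       cv::Mat grad16s, grad8u;
       cv::Sobel(img, grad16s, CV_16S, 1, 0);     // img is any 8-bit input
       cv::convertScaleAbs(grad16s, grad8u);      // |gradient|, 8-bit
*/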
04819 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
04820 {
04821     bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
04822 
04823     if( _type < 0 )
04824         _type = _dst.fixedType() ? _dst.type() : type();
04825     else
04826         _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
04827 
04828     int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
04829     if( sdepth == ddepth && noScale )
04830     {
04831         copyTo(_dst);
04832         return;
04833     }
04834 
04835     Mat src = *this;
04836 
04837     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
04838     double scale[] = {alpha, beta};
04839     int cn = channels();
04840     CV_Assert( func != 0 );
04841 
04842     if( dims <= 2 )
04843     {
04844         _dst.create( size(), _type );
04845         Mat dst = _dst.getMat();
04846         Size sz = getContinuousSize(src, dst, cn);
04847         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
04848     }
04849     else
04850     {
04851         _dst.create( dims, size, _type );
04852         Mat dst = _dst.getMat();
04853         const Mat* arrays[] = {&src, &dst, 0};
04854         uchar* ptrs[2];
04855         NAryMatIterator it(arrays, ptrs);
04856         Size sz((int)(it.size*cn), 1);
04857 
04858         for( size_t i = 0; i < it.nplanes; i++, ++it )
04859             func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale);
04860     }
04861 }
04862 
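/* Usage sketch (illustrative): convertTo applies
   dst(i) = saturate_cast<ddepth>(alpha*src(i) + beta).  With alpha == 1 and
   beta == 0 the cheaper unscaled table is used, and a same-depth unscaled
   call degenerates to copyTo:

       cv::Mat u8, f32;
       u8.convertTo(f32, CV_32F, 1.0/255);   // bytes -> floats in [0, 1]
*/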
04863 /****************************************************************************************\
04864 *                                    LUT Transform                                       *
04865 \****************************************************************************************/
04866 
04867 namespace cv
04868 {
04869 
04870 template<typename T> static void
04871 LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
04872 {
04873     if( lutcn == 1 )
04874     {
04875         for( int i = 0; i < len*cn; i++ )
04876             dst[i] = lut[src[i]];
04877     }
04878     else
04879     {
04880         for( int i = 0; i < len*cn; i += cn )
04881             for( int k = 0; k < cn; k++ )
04882                 dst[i+k] = lut[src[i+k]*cn+k];
04883     }
04884 }
04885 
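/* With a single-channel table every channel of every pixel is looked up in
   the same 256-entry LUT; with a multi-channel table channel k uses its own
   interleaved sub-table, lut[value*cn + k]. */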
04886 static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
04887 {
04888     LUT8u_( src, lut, dst, len, cn, lutcn );
04889 }
04890 
04891 static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
04892 {
04893     LUT8u_( src, lut, dst, len, cn, lutcn );
04894 }
04895 
04896 static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
04897 {
04898     LUT8u_( src, lut, dst, len, cn, lutcn );
04899 }
04900 
04901 static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
04902 {
04903     LUT8u_( src, lut, dst, len, cn, lutcn );
04904 }
04905 
04906 static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
04907 {
04908     LUT8u_( src, lut, dst, len, cn, lutcn );
04909 }
04910 
04911 static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
04912 {
04913     LUT8u_( src, lut, dst, len, cn, lutcn );
04914 }
04915 
04916 static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
04917 {
04918     LUT8u_( src, lut, dst, len, cn, lutcn );
04919 }
04920 
04921 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
04922 
04923 static LUTFunc lutTab[] =
04924 {
04925     (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
04926     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
04927 };
04928 
04929 #ifdef HAVE_OPENCL
04930 
04931 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
04932 {
04933     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
04934 
04935     UMat src = _src.getUMat(), lut = _lut.getUMat();
04936     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
04937     UMat dst = _dst.getUMat();
04938     int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn;
04939 
04940     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
04941                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
04942                          ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
04943     if (k.empty())
04944         return false;
04945 
04946     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
04947         ocl::KernelArg::WriteOnly(dst, dcn, kercn));
04948 
04949     size_t globalSize[2] = { (size_t)dst.cols * dcn / kercn, ((size_t)dst.rows + 3) / 4 };
04950     return k.run(2, globalSize, NULL, false);
04951 }
04952 
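/* The OpenCL LUT kernel processes kercn pixels per work-item (capped at 4
   when the table has one channel) and four rows per work-item, which is
   what the global size computation above encodes. */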
04953 #endif
04954 
04955 #if defined(HAVE_IPP)
04956 namespace ipp {
04957 
04958 #if IPP_DISABLE_BLOCK // there are no performance benefits (PR #2653)
04959 class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
04960 {
04961 public:
04962     bool* ok;
04963     const Mat& src_;
04964     const Mat& lut_;
04965     Mat& dst_;
04966 
04967     typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
04968                           IppiSize roiSize, const void* pTable, int nBitSize);
04969     IppFn fn;
04970 
04971     int width;
04972 
04973     IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
04974         : ok(_ok), src_(src), lut_(lut), dst_(dst)
04975     {
04976         width = dst.cols * dst.channels();
04977 
04978         size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
04979 
04980         fn =
04981                 elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R :
04982                 elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R :
04983                 NULL;
04984 
04985         *ok = (fn != NULL);
04986     }
04987 
04988     void operator()( const cv::Range& range ) const
04989     {
04990         if (!*ok)
04991             return;
04992 
04993         const int row0 = range.start;
04994         const int row1 = range.end;
04995 
04996         Mat src = src_.rowRange(row0, row1);
04997         Mat dst = dst_.rowRange(row0, row1);
04998 
04999         IppiSize sz = { width, dst.rows };
05000 
05001         CV_DbgAssert(fn != NULL);
05002         if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0)
05003         {
05004             setIppErrorStatus();
05005             *ok = false;
05006         }
05007         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05008     }
05009 private:
05010     IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
05011     IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
05012 };
05013 #endif
05014 
05015 class IppLUTParallelBody_LUTCN : public ParallelLoopBody
05016 {
05017 public:
05018     bool *ok;
05019     const Mat& src_;
05020     const Mat& lut_;
05021     Mat& dst_;
05022 
05023     int lutcn;
05024 
05025     uchar* lutBuffer;
05026     uchar* lutTable[4];
05027 
05028     IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
05029         : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
05030     {
05031         lutcn = lut.channels();
05032         IppiSize sz256 = {256, 1};
05033 
05034         size_t elemSize1 = dst.elemSize1();
05035         CV_DbgAssert(elemSize1 == 1);
05036         lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
05037         lutTable[0] = lutBuffer + 0;
05038         lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
05039         lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
05040         lutTable[3] = lutBuffer + 3 * 256 * elemSize1;
05041 
05042         CV_DbgAssert(lutcn == 3 || lutcn == 4);
05043         if (lutcn == 3)
05044         {
05045             IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
05046             if (status < 0)
05047             {
05048                 setIppErrorStatus();
05049                 return;
05050             }
05051             CV_IMPL_ADD(CV_IMPL_IPP);
05052         }
05053         else if (lutcn == 4)
05054         {
05055             IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
05056             if (status < 0)
05057             {
05058                 setIppErrorStatus();
05059                 return;
05060             }
05061             CV_IMPL_ADD(CV_IMPL_IPP);
05062         }
05063 
05064         *ok = true;
05065     }
05066 
05067     ~IppLUTParallelBody_LUTCN()
05068     {
05069         if (lutBuffer != NULL)
05070             ippFree(lutBuffer);
05071         lutBuffer = NULL;
05072         lutTable[0] = NULL;
05073     }
05074 
05075     void operator()( const cv::Range& range ) const
05076     {
05077         if (!*ok)
05078             return;
05079 
05080         const int row0 = range.start;
05081         const int row1 = range.end;
05082 
05083         Mat src = src_.rowRange(row0, row1);
05084         Mat dst = dst_.rowRange(row0, row1);
05085 
05086         if (lutcn == 3)
05087         {
05088             if (ippiLUTPalette_8u_C3R(
05089                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
05090                     ippiSize(dst.size()), lutTable, 8) >= 0)
05091             {
05092                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05093                 return;
05094             }
05095         }
05096         else if (lutcn == 4)
05097         {
05098             if (ippiLUTPalette_8u_C4R(
05099                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
05100                     ippiSize(dst.size()), lutTable, 8) >= 0)
05101             {
05102                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
05103                 return;
05104             }
05105         }
05106         setIppErrorStatus();
05107         *ok = false;
05108     }
05109 private:
05110     IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
05111     IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
05112 };
05113 } // namespace ipp
05114 
05115 static bool ipp_lut(Mat &src, Mat &lut, Mat &dst)
05116 {
05117     int lutcn = lut.channels();
05118 
05119     if(src.dims > 2)
05120         return false;
05121 
05122     bool ok = false;
05123     Ptr<ParallelLoopBody> body;
05124 
05125     size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
05126 #if IPP_DISABLE_BLOCK // there are no performance benefits (PR #2653)
05127     if (lutcn == 1)
05128     {
05129         ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
05130         body.reset(p);
05131     }
05132     else
05133 #endif
05134     if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
05135     {
05136         ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
05137         body.reset(p);
05138     }
05139 
05140     if (body != NULL && ok)
05141     {
05142         Range all(0, dst.rows);
05143         if (dst.total()>>18)
05144             parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
05145         else
05146             (*body)(all);
05147         if (ok)
05148             return true;
05149     }
05150 
05151     return false;
05152 }
05153 #endif // IPP
05154 
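/* Both ipp_lut above and the cv::LUT fallback below go parallel only for
   images with more than 2^18 elements, passing a stripe-count hint of
   total/2^16 so each stripe covers roughly 64K elements. */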
05155 class LUTParallelBody : public ParallelLoopBody
05156 {
05157 public:
05158     bool* ok;
05159     const Mat& src_;
05160     const Mat& lut_;
05161     Mat& dst_;
05162 
05163     LUTFunc func;
05164 
05165     LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
05166         : ok(_ok), src_(src), lut_(lut), dst_(dst)
05167     {
05168         func = lutTab[lut.depth()];
05169         *ok = (func != NULL);
05170     }
05171 
05172     void operator()( const cv::Range& range ) const
05173     {
05174         CV_DbgAssert(*ok);
05175 
05176         const int row0 = range.start;
05177         const int row1 = range.end;
05178 
05179         Mat src = src_.rowRange(row0, row1);
05180         Mat dst = dst_.rowRange(row0, row1);
05181 
05182         int cn = src.channels();
05183         int lutcn = lut_.channels();
05184 
05185         const Mat* arrays[] = {&src, &dst, 0};
05186         uchar* ptrs[2];
05187         NAryMatIterator it(arrays, ptrs);
05188         int len = (int)it.size;
05189 
05190         for( size_t i = 0; i < it.nplanes; i++, ++it )
05191             func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
05192     }
05193 private:
05194     LUTParallelBody(const LUTParallelBody&);
05195     LUTParallelBody& operator=(const LUTParallelBody&);
05196 };
05197 
05198 }
05199 
05200 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
05201 {
05202     int cn = _src.channels(), depth = _src.depth();
05203     int lutcn = _lut.channels();
05204 
05205     CV_Assert( (lutcn == cn || lutcn == 1) &&
05206         _lut.total() == 256 && _lut.isContinuous() &&
05207         (depth == CV_8U || depth == CV_8S) );
05208 
05209 #ifdef HAVE_OPENCL
05210     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
05211                ocl_LUT(_src, _lut, _dst))
05212 #endif
05213 
05214     Mat src = _src.getMat(), lut = _lut.getMat();
05215     _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
05216     Mat dst = _dst.getMat();
05217 
05218     CV_IPP_RUN(_src.dims() <= 2, ipp_lut(src, lut, dst));
05219 
05220     if (_src.dims() <= 2)
05221     {
05222         bool ok = false;
05223         Ptr<ParallelLoopBody> body;
05224 
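        // Note: this guard is trivially true here (body is empty, ok is
        // false); it appears to be a leftover from an earlier revision that
        // tried an accelerated body before the generic one.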
05225         if (body == NULL || ok == false)
05226         {
05227             ok = false;
05228             ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
05229             body.reset(p);
05230         }
05231         if (body != NULL && ok)
05232         {
05233             Range all(0, dst.rows);
05234             if (dst.total()>>18)
05235                 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
05236             else
05237                 (*body)(all);
05238             if (ok)
05239                 return;
05240         }
05241     }
05242 
05243     LUTFunc func = lutTab[lut.depth()];
05244     CV_Assert( func != 0 );
05245 
05246     const Mat* arrays[] = {&src, &dst, 0};
05247     uchar* ptrs[2];
05248     NAryMatIterator it(arrays, ptrs);
05249     int len = (int)it.size;
05250 
05251     for( size_t i = 0; i < it.nplanes; i++, ++it )
05252         func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn);
05253 }
05254 
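/* Usage sketch (illustrative): a 256-entry table remaps every 8-bit value
   in a single pass, e.g. gamma correction:

       cv::Mat table(1, 256, CV_8U);
       for( int i = 0; i < 256; i++ )
           table.at<uchar>(i) = cv::saturate_cast<uchar>(
               std::pow(i/255.0, 1/2.2)*255.0);
       cv::LUT(src8u, table, dst);   // dst takes the depth of the table
*/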
05255 namespace cv {
05256 
05257 #ifdef HAVE_OPENCL
05258 
05259 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
05260                            double scale, double delta )
05261 {
05262     UMat src = _src.getUMat();
05263 
05264     if( _mask.empty() )
05265         src.convertTo( _dst, dtype, scale, delta );
05266     else if (src.channels() <= 4)
05267     {
05268         const ocl::Device & dev = ocl::Device::getDefault();
05269 
05270         int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
05271                 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
05272                 rowsPerWI = dev.isIntel() ? 4 : 1;
05273 
05274         float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
05275         bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
05276                 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
05277                 haveDelta = std::fabs(delta) > DBL_EPSILON,
05278                 doubleSupport = dev.doubleFPConfig() > 0;
05279 
05280         if (!haveScale && !haveDelta && stype == dtype)
05281         {
05282             _src.copyTo(_dst, _mask);
05283             return true;
05284         }
05285         if (haveZeroScale)
05286         {
05287             _dst.setTo(Scalar(delta), _mask);
05288             return true;
05289         }
05290 
05291         if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
05292             return false;
05293 
05294         char cvt[2][40];
05295         String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
05296                              " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
05297                              ocl::typeToStr(stype), ocl::typeToStr(dtype),
05298                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
05299                              rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
05300                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
05301                              doubleSupport ? " -D DOUBLE_SUPPORT" : "",
05302                              haveScale ? " -D HAVE_SCALE" : "",
05303                              haveDelta ? " -D HAVE_DELTA" : "",
05304                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
05305 
05306         ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
05307         if (k.empty())
05308             return false;
05309 
05310         UMat mask = _mask.getUMat(), dst = _dst.getUMat();
05311 
05312         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
05313                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
05314                 dstarg = ocl::KernelArg::ReadWrite(dst);
05315 
05316         if (haveScale)
05317         {
05318             if (haveDelta)
05319                 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
05320             else
05321                 k.args(srcarg, maskarg, dstarg, fscale);
05322         }
05323         else
05324         {
05325             if (haveDelta)
05326                 k.args(srcarg, maskarg, dstarg, fdelta);
05327             else
05328                 k.args(srcarg, maskarg, dstarg);
05329         }
05330 
05331         size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
05332         return k.run(2, globalsize, NULL, false);
05333     }
05334     else
05335     {
05336         UMat temp;
05337         src.convertTo( temp, dtype, scale, delta );
05338         temp.copyTo( _dst, _mask );
05339     }
05340 
05341     return true;
05342 }
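/* The masked OpenCL path handles at most four channels directly; wider
   types fall back to convert-then-masked-copy through a temporary UMat.  A
   zero scale short-circuits to _dst.setTo(delta, mask). */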
05343 
05344 #endif
05345 
05346 }
05347 
05348 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
05349                     int norm_type, int rtype, InputArray _mask )
05350 {
05351     double scale = 1, shift = 0;
05352     if( norm_type == CV_MINMAX )
05353     {
05354         double smin = 0, smax = 0;
05355         double dmin = MIN( a, b ), dmax = MAX( a, b );
05356         minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
05357         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
05358         shift = dmin - smin*scale;
05359     }
05360     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
05361     {
05362         scale = norm( _src, norm_type, _mask );
05363         scale = scale > DBL_EPSILON ? a/scale : 0.;
05364         shift = 0;
05365     }
05366     else
05367         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
05368 
05369     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
05370     if( rtype < 0 )
05371         rtype = _dst.fixedType() ? _dst.depth() : depth;
05372     _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));
05373 
05374 #ifdef HAVE_OPENCL
05375     CV_OCL_RUN(_dst.isUMat(),
05376                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
05377 #endif
05378 
05379     Mat src = _src.getMat(), dst = _dst.getMat();
05380     if( _mask.empty() )
05381         src.convertTo( dst, rtype, scale, shift );
05382     else
05383     {
05384         Mat temp;
05385         src.convertTo( temp, rtype, scale, shift );
05386         temp.copyTo( dst, _mask );
05387     }
05388 }
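/* Usage sketch (illustrative): with CV_MINMAX (the legacy spelling of
   NORM_MINMAX) the values are affinely stretched so that they span [a, b];
   with CV_L1/CV_L2/CV_C the array is scaled so the chosen norm equals a:

       cv::Mat vis;
       cv::normalize(depthMap, vis, 0, 255, cv::NORM_MINMAX, CV_8U);
*/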
05389 
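/* What follows are the legacy C-API wrappers (CV_IMPL): each converts its
   CvArr arguments with cvarrToMat and forwards to the corresponding cv::
   function; cvSplit and cvMerge fall back to mixChannels when only a
   subset of the planes is supplied. */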
05390 CV_IMPL void
05391 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
05392 {
05393     void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 };
05394     cv::Mat src = cv::cvarrToMat(srcarr);
05395     int i, j, nz = 0;
05396     for( i = 0; i < 4; i++ )
05397         nz += dptrs[i] != 0;
05398     CV_Assert( nz > 0 );
05399     std::vector<cv::Mat> dvec(nz);
05400     std::vector<int> pairs(nz*2);
05401 
05402     for( i = j = 0; i < 4; i++ )
05403     {
05404         if( dptrs[i] != 0 )
05405         {
05406             dvec[j] = cv::cvarrToMat(dptrs[i]);
05407             CV_Assert( dvec[j].size() == src.size() );
05408             CV_Assert( dvec[j].depth() == src.depth() );
05409             CV_Assert( dvec[j].channels() == 1 );
05410             CV_Assert( i < src.channels() );
05411             pairs[j*2] = i;
05412             pairs[j*2+1] = j;
05413             j++;
05414         }
05415     }
05416     if( nz == src.channels() )
05417         cv::split( src, dvec );
05418     else
05419     {
05420         cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz );
05421     }
05422 }
05423 
05424 
05425 CV_IMPL void
05426 cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2,
05427          const void* srcarr3, void* dstarr )
05428 {
05429     const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 };
05430     cv::Mat dst = cv::cvarrToMat(dstarr);
05431     int i, j, nz = 0;
05432     for( i = 0; i < 4; i++ )
05433         nz += sptrs[i] != 0;
05434     CV_Assert( nz > 0 );
05435     std::vector<cv::Mat> svec(nz);
05436     std::vector<int> pairs(nz*2);
05437 
05438     for( i = j = 0; i < 4; i++ )
05439     {
05440         if( sptrs[i] != 0 )
05441         {
05442             svec[j] = cv::cvarrToMat(sptrs[i]);
05443             CV_Assert( svec[j].size == dst.size &&
05444                 svec[j].depth() == dst.depth() &&
05445                 svec[j].channels() == 1 && i < dst.channels() );
05446             pairs[j*2] = j;
05447             pairs[j*2+1] = i;
05448             j++;
05449         }
05450     }
05451 
05452     if( nz == dst.channels() )
05453         cv::merge( svec, dst );
05454     else
05455     {
05456         cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz );
05457     }
05458 }
05459 
05460 
05461 CV_IMPL void
05462 cvMixChannels( const CvArr** src, int src_count,
05463                CvArr** dst, int dst_count,
05464                const int* from_to, int pair_count )
05465 {
05466     cv::AutoBuffer<cv::Mat> buf(src_count + dst_count);
05467 
05468     int i;
05469     for( i = 0; i < src_count; i++ )
05470         buf[i] = cv::cvarrToMat(src[i]);
05471     for( i = 0; i < dst_count; i++ )
05472         buf[i+src_count] = cv::cvarrToMat(dst[i]);
05473     cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count);
05474 }
05475 
05476 CV_IMPL void
05477 cvConvertScaleAbs( const void* srcarr, void* dstarr,
05478                    double scale, double shift )
05479 {
05480     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
05481     CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels()));
05482     cv::convertScaleAbs( src, dst, scale, shift );
05483 }
05484 
05485 CV_IMPL void
05486 cvConvertScale( const void* srcarr, void* dstarr,
05487                 double scale, double shift )
05488 {
05489     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
05490 
05491     CV_Assert( src.size == dst.size && src.channels() == dst.channels() );
05492     src.convertTo(dst, dst.type(), scale, shift);
05493 }
05494 
05495 CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr )
05496 {
05497     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr);
05498 
05499     CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) );
05500     cv::LUT( src, lut, dst );
05501 }
05502 
05503 CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
05504                           double a, double b, int norm_type, const CvArr* maskarr )
05505 {
05506     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
05507     if( maskarr )
05508         mask = cv::cvarrToMat(maskarr);
05509     CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
05510     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
05511 }
05512 
05513 /* End of file. */
05514