convert.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

#include "opencl_kernels_core.hpp"

#ifdef __APPLE__
#undef CV_NEON
#define CV_NEON 0
#endif


/****************************************************************************************\
*                                    split & merge                                       *
\****************************************************************************************/

typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);

static SplitFunc getSplitFunc(int depth)
{
    static SplitFunc splitTab[] =
    {
        (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
        (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
    };

    return splitTab[depth];
}

typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);

static MergeFunc getMergeFunc(int depth)
{
    static MergeFunc mergeTab[] =
    {
        (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
        (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
    };

    return mergeTab[depth];
}

void cv::split(const Mat& src, Mat* mv)
{
    int k, depth = src.depth(), cn = src.channels();
    if( cn == 1 )
    {
        src.copyTo(mv[0]);
        return;
    }

    SplitFunc func = getSplitFunc(depth);
    CV_Assert( func != 0 );

    int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
    int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);

    arrays[0] = &src;
    for( k = 0; k < cn; k++ )
    {
        mv[k].create(src.dims, src.size, depth);
        arrays[k+1] = &mv[k];
    }

    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( ptrs[0], &ptrs[1], bsz, cn );

            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( k = 0; k < cn; k++ )
                    ptrs[k+1] += bsz*esz1;
            }
        }
    }
}
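
/*
   Usage sketch for cv::split(const Mat&, Mat*) above: it copies each channel of an
   interleaved matrix into its own single-channel plane. The matrix name and pixel
   values below are illustrative only, not taken from this file:

       cv::Mat bgr(480, 640, CV_8UC3, cv::Scalar(10, 20, 30));
       cv::Mat planes[3];
       cv::split(bgr, planes);                     // planes[0..2] become CV_8UC1
       CV_Assert(planes[1].at<uchar>(0, 0) == 20); // green plane
*/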

#ifdef HAVE_OPENCL

namespace cv {

static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
{
    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    String dstargs, processelem, indexdecl;
    for (int i = 0; i < cn; ++i)
    {
        dstargs += format("DECLARE_DST_PARAM(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        processelem += format("PROCESS_ELEM(%d)", i);
    }

    ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
                  format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
                         " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
                         ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
                         processelem.c_str(), indexdecl.c_str()));
    if (k.empty())
        return false;

    Size size = _m.size();
    _mv.create(cn, 1, depth);
    for (int i = 0; i < cn; ++i)
        _mv.create(size, depth, i);

    std::vector<UMat> dst;
    _mv.getUMatVector(dst);

    int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
    for (int i = 0; i < cn; ++i)
        argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
    k.set(argidx, rowsPerWI);

    size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::split(InputArray _m, OutputArrayOfArrays _mv)
{
#ifdef HAVE_OPENCL
    CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
               ocl_split(_m, _mv))
#endif

    Mat m = _m.getMat();
    if( m.empty() )
    {
        _mv.release();
        return;
    }

    CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );

    int depth = m.depth(), cn = m.channels();
    _mv.create(cn, 1, depth);
    for (int i = 0; i < cn; ++i)
        _mv.create(m.dims, m.size.p, depth, i);

    std::vector<Mat> dst;
    _mv.getMatVector(dst);

    split(m, &dst[0]);
}
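
/*
   Dispatch note and sketch: the InputArray overload above only takes the OpenCL path
   (ocl_split) when the output container is a std::vector<cv::UMat> and the input has at
   most two dimensions; otherwise it falls back to the Mat* overload. Illustrative names,
   assuming OpenCL is available at runtime:

       cv::UMat ubgr = bgr.getUMat(cv::ACCESS_READ);
       std::vector<cv::UMat> uplanes;
       cv::split(ubgr, uplanes);   // may run the "split" OpenCL kernel instead of the CPU loop
*/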

void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
{
    CV_Assert( mv && n > 0 );

    int depth = mv[0].depth();
    bool allch1 = true;
    int k, cn = 0;
    size_t i;

    for( i = 0; i < n; i++ )
    {
        CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
        allch1 = allch1 && mv[i].channels() == 1;
        cn += mv[i].channels();
    }

    CV_Assert( 0 < cn && cn <= CV_CN_MAX );
    _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
    Mat dst = _dst.getMat();

    if( n == 1 )
    {
        mv[0].copyTo(dst);
        return;
    }

    if( !allch1 )
    {
        AutoBuffer<int> pairs(cn*2);
        int j, ni=0;

        for( i = 0, j = 0; i < n; i++, j += ni )
        {
            ni = mv[i].channels();
            for( k = 0; k < ni; k++ )
            {
                pairs[(j+k)*2] = j + k;
                pairs[(j+k)*2+1] = j + k;
            }
        }
        mixChannels( mv, n, &dst, 1, &pairs[0], cn );
        return;
    }

    size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
    int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);

    arrays[0] = &dst;
    for( k = 0; k < cn; k++ )
        arrays[k+1] = &mv[k];

    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
    MergeFunc func = getMergeFunc(depth);

    for( i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );

            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( int t = 0; t < cn; t++ )
                    ptrs[t+1] += bsz*esz1;
            }
        }
    }
}

#ifdef HAVE_OPENCL

namespace cv {

static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
{
    std::vector<UMat> src, ksrc;
    _mv.getUMatVector(src);
    CV_Assert(!src.empty());

    int type = src[0].type(), depth = CV_MAT_DEPTH(type),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
    Size size = src[0].size();

    for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
    {
        int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
                esz1 = CV_ELEM_SIZE1(idepth);
        if (src[i].dims > 2)
            return false;

        CV_Assert(size == src[i].size() && depth == idepth);

        for (int cn = 0; cn < icn; ++cn)
        {
            UMat tsrc = src[i];
            tsrc.offset += cn * esz1;
            ksrc.push_back(tsrc);
        }
    }
    int dcn = (int)ksrc.size();

    String srcargs, processelem, cndecl, indexdecl;
    for (int i = 0; i < dcn; ++i)
    {
        srcargs += format("DECLARE_SRC_PARAM(%d)", i);
        processelem += format("PROCESS_ELEM(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
    }

    ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
                  format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
                         " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
                         dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
                         indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
    if (k.empty())
        return false;

    _dst.create(size, CV_MAKE_TYPE(depth, dcn));
    UMat dst = _dst.getUMat();

    int argidx = 0;
    for (int i = 0; i < dcn; ++i)
        argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
    argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
    k.set(argidx, rowsPerWI);

    size_t globalsize[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
{
#ifdef HAVE_OPENCL
    CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
               ocl_merge(_mv, _dst))
#endif

    std::vector<Mat> mv;
    _mv.getMatVector(mv);
    merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
}
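
/*
   Usage sketch for cv::merge: it is the inverse of cv::split, interleaving single-channel
   planes of identical size and depth back into one multi-channel matrix. The names below
   are illustrative only:

       std::vector<cv::Mat> planes;
       cv::split(bgr, planes);          // three CV_8UC1 planes
       cv::Mat rebuilt;
       cv::merge(planes, rebuilt);      // CV_8UC3 again, element-wise equal to bgr
*/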

/****************************************************************************************\
*                       Generalized split/merge: mixing channels                         *
\****************************************************************************************/

namespace cv
{

template<typename T> static void
mixChannels_( const T** src, const int* sdelta,
              T** dst, const int* ddelta,
              int len, int npairs )
{
    int i, k;
    for( k = 0; k < npairs; k++ )
    {
        const T* s = src[k];
        T* d = dst[k];
        int ds = sdelta[k], dd = ddelta[k];
        if( s )
        {
            for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
            {
                T t0 = s[0], t1 = s[ds];
                d[0] = t0; d[dd] = t1;
            }
            if( i < len )
                d[0] = s[0];
        }
        else
        {
            for( i = 0; i <= len - 2; i += 2, d += dd*2 )
                d[0] = d[dd] = 0;
            if( i < len )
                d[0] = 0;
        }
    }
}


static void mixChannels8u( const uchar** src, const int* sdelta,
                           uchar** dst, const int* ddelta,
                           int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

static void mixChannels16u( const ushort** src, const int* sdelta,
                            ushort** dst, const int* ddelta,
                            int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

static void mixChannels32s( const int** src, const int* sdelta,
                            int** dst, const int* ddelta,
                            int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

static void mixChannels64s( const int64** src, const int* sdelta,
                            int64** dst, const int* ddelta,
                            int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
        uchar** dst, const int* ddelta, int len, int npairs );

static MixChannelsFunc getMixchFunc(int depth)
{
    static MixChannelsFunc mixchTab[] =
    {
        (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
        (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
        (MixChannelsFunc)mixChannels64s, 0
    };

    return mixchTab[depth];
}

}

void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
{
    if( npairs == 0 )
        return;
    CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );

    size_t i, j, k, esz1 = dst[0].elemSize1();
    int depth = dst[0].depth();

    AutoBuffer<uchar> buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
    const Mat** arrays = (const Mat**)(uchar*)buf;
    uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
    const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
    uchar** dsts = (uchar**)(srcs + npairs);
    int* tab = (int*)(dsts + npairs);
    int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;

    for( i = 0; i < nsrcs; i++ )
        arrays[i] = &src[i];
    for( i = 0; i < ndsts; i++ )
        arrays[i + nsrcs] = &dst[i];
    ptrs[nsrcs + ndsts] = 0;

    for( i = 0; i < npairs; i++ )
    {
        int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
        if( i0 >= 0 )
        {
            for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
                if( i0 < src[j].channels() )
                    break;
            CV_Assert(j < nsrcs && src[j].depth() == depth);
            tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
            sdelta[i] = src[j].channels();
        }
        else
        {
            tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
            sdelta[i] = 0;
        }

        for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
            if( i1 < dst[j].channels() )
                break;
        CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
        tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
        ddelta[i] = dst[j].channels();
    }

    NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
    int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
    MixChannelsFunc func = getMixchFunc(depth);

    for( i = 0; i < it.nplanes; i++, ++it )
    {
        for( k = 0; k < npairs; k++ )
        {
            srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
            dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
        }

        for( int t = 0; t < total; t += blocksize )
        {
            int bsz = std::min(total - t, blocksize);
            func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );

            if( t + blocksize < total )
                for( k = 0; k < npairs; k++ )
                {
                    srcs[k] += blocksize*sdelta[k]*esz1;
                    dsts[k] += blocksize*ddelta[k]*esz1;
                }
        }
    }
}

#ifdef HAVE_OPENCL

namespace cv {

static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
{
    int totalChannels = 0;
    for (size_t i = 0, size = um.size(); i < size; ++i)
    {
        int ccn = um[i].channels();
        totalChannels += ccn;

        if (totalChannels == cn)
        {
            idx = (int)(i + 1);
            cnidx = 0;
            return;
        }
        else if (totalChannels > cn)
        {
            idx = (int)i;
            cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
            return;
        }
    }

    idx = cnidx = -1;
}

static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
                            const int* fromTo, size_t npairs)
{
    std::vector<UMat> src, dst;
    _src.getUMatVector(src);
    _dst.getUMatVector(dst);

    size_t nsrc = src.size(), ndst = dst.size();
    CV_Assert(nsrc > 0 && ndst > 0);

    Size size = src[0].size();
    int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
        CV_Assert(src[i].size() == size && src[i].depth() == depth);
    for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
        CV_Assert(dst[i].size() == size && dst[i].depth() == depth);

    String declsrc, decldst, declproc, declcn, indexdecl;
    std::vector<UMat> srcargs(npairs), dstargs(npairs);

    for (size_t i = 0; i < npairs; ++i)
    {
        int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
        int src_idx, src_cnidx, dst_idx, dst_cnidx;

        getUMatIndex(src, scn, src_idx, src_cnidx);
        getUMatIndex(dst, dcn, dst_idx, dst_cnidx);

        CV_Assert(dst_idx >= 0 && src_idx >= 0);

        srcargs[i] = src[src_idx];
        srcargs[i].offset += src_cnidx * esz;

        dstargs[i] = dst[dst_idx];
        dstargs[i].offset += dst_cnidx * esz;

        declsrc += format("DECLARE_INPUT_MAT(%d)", i);
        decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        declproc += format("PROCESS_ELEM(%d)", i);
        declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
    }

    ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
                  format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
                         " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
                         ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
                         declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
    if (k.empty())
        return false;

    int argindex = 0;
    for (size_t i = 0; i < npairs; ++i)
        argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
    for (size_t i = 0; i < npairs; ++i)
        argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
    argindex = k.set(argindex, size.height);
    argindex = k.set(argindex, size.width);
    k.set(argindex, rowsPerWI);

    size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
                     const int* fromTo, size_t npairs)
{
    if (npairs == 0 || fromTo == NULL)
        return;

#ifdef HAVE_OPENCL
    CV_OCL_RUN(dst.isUMatVector(),
               ocl_mixChannels(src, dst, fromTo, npairs))
#endif

    bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
            src.kind() != _InputArray::STD_VECTOR_VECTOR &&
            src.kind() != _InputArray::STD_VECTOR_UMAT;
    bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
            dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
            dst.kind() != _InputArray::STD_VECTOR_UMAT;
    int i;
    int nsrc = src_is_mat ? 1 : (int)src.total();
    int ndst = dst_is_mat ? 1 : (int)dst.total();

    CV_Assert(nsrc > 0 && ndst > 0);
    cv::AutoBuffer<Mat> _buf(nsrc + ndst);
    Mat* buf = _buf;
    for( i = 0; i < nsrc; i++ )
        buf[i] = src.getMat(src_is_mat ? -1 : i);
    for( i = 0; i < ndst; i++ )
        buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
    mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
}
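
/*
   Usage sketch for cv::mixChannels: fromTo holds index pairs (source channel, destination
   channel), with channels numbered consecutively across all source (respectively destination)
   matrices; a negative source index fills the destination channel with zeros, as handled in
   the pair-table setup above. Destination matrices must be preallocated. For example,
   splitting a BGRA image into a BGR image plus a separate alpha plane (illustrative names):

       cv::Mat bgra(100, 100, CV_8UC4, cv::Scalar(255, 0, 0, 255));
       cv::Mat bgr(bgra.size(), CV_8UC3), alpha(bgra.size(), CV_8UC1);
       cv::Mat out[] = { bgr, alpha };
       int fromTo[] = { 0,0,  1,1,  2,2,  3,3 };   // B->B, G->G, R->R, A->alpha
       cv::mixChannels(&bgra, 1, out, 2, fromTo, 4);

   extractChannel() and insertChannel() further below are thin wrappers over the same call
   with a single { coi, 0 } or { 0, coi } pair.
*/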

void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
                     const std::vector<int>& fromTo)
{
    if (fromTo.empty())
        return;

#ifdef HAVE_OPENCL
    CV_OCL_RUN(dst.isUMatVector(),
               ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
#endif

    bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
            src.kind() != _InputArray::STD_VECTOR_VECTOR &&
            src.kind() != _InputArray::STD_VECTOR_UMAT;
    bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
            dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
            dst.kind() != _InputArray::STD_VECTOR_UMAT;
    int i;
    int nsrc = src_is_mat ? 1 : (int)src.total();
    int ndst = dst_is_mat ? 1 : (int)dst.total();

    CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
    cv::AutoBuffer<Mat> _buf(nsrc + ndst);
    Mat* buf = _buf;
    for( i = 0; i < nsrc; i++ )
        buf[i] = src.getMat(src_is_mat ? -1 : i);
    for( i = 0; i < ndst; i++ )
        buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
    mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
}

void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( 0 <= coi && coi < cn );
    int ch[] = { coi, 0 };

#ifdef HAVE_OPENCL
    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
    {
        UMat src = _src.getUMat();
        _dst.create(src.dims, &src.size[0], depth);
        UMat dst = _dst.getUMat();
        mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
        return;
    }
#endif

    Mat src = _src.getMat();
    _dst.create(src.dims, &src.size[0], depth);
    Mat dst = _dst.getMat();
    mixChannels(&src, 1, &dst, 1, ch, 1);
}

void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
    CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
    CV_Assert( 0 <= coi && coi < dcn && scn == 1 );

    int ch[] = { 0, coi };
    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
    {
        UMat src = _src.getUMat(), dst = _dst.getUMat();
        mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
        return;
    }

    Mat src = _src.getMat(), dst = _dst.getMat();
    mixChannels(&src, 1, &dst, 1, ch, 1);
}

/****************************************************************************************\
*                                convertScale[Abs]                                       *
\****************************************************************************************/

namespace cv
{

template<typename T, typename DT, typename WT>
struct cvtScaleAbs_SIMD
{
    int operator () (const T *, DT *, int, WT, WT) const
    {
        return 0;
    }
};

#if CV_SSE2

template <>
struct cvtScaleAbs_SIMD<uchar, uchar, float>
{
    int operator () (const uchar * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift =
_mm_set1_ps(shift), 00747 v_zero_f = _mm_setzero_ps(); 00748 __m128i v_zero_i = _mm_setzero_si128(); 00749 00750 for ( ; x <= width - 16; x += 16) 00751 { 00752 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); 00753 __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i); 00754 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift); 00755 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); 00756 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift); 00757 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); 00758 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift); 00759 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); 00760 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift); 00761 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); 00762 00763 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), 00764 _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); 00765 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); 00766 } 00767 } 00768 00769 return x; 00770 } 00771 }; 00772 00773 template <> 00774 struct cvtScaleAbs_SIMD<schar, uchar, float> 00775 { 00776 int operator () (const schar * src, uchar * dst, int width, 00777 float scale, float shift) const 00778 { 00779 int x = 0; 00780 00781 if (USE_SSE2) 00782 { 00783 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), 00784 v_zero_f = _mm_setzero_ps(); 00785 __m128i v_zero_i = _mm_setzero_si128(); 00786 00787 for ( ; x <= width - 16; x += 16) 00788 { 00789 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); 00790 __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8), 00791 v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8); 00792 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( 00793 _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); 00794 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); 00795 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( 00796 _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); 00797 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); 00798 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( 00799 _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); 00800 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); 00801 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( 00802 _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); 00803 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); 00804 00805 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), 00806 _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); 00807 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); 00808 } 00809 } 00810 00811 return x; 00812 } 00813 }; 00814 00815 template <> 00816 struct cvtScaleAbs_SIMD<ushort, uchar, float> 00817 { 00818 int operator () (const ushort * src, uchar * dst, int width, 00819 float scale, float shift) const 00820 { 00821 int x = 0; 00822 00823 if (USE_SSE2) 00824 { 00825 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), 00826 v_zero_f = _mm_setzero_ps(); 00827 
__m128i v_zero_i = _mm_setzero_si128(); 00828 00829 for ( ; x <= width - 8; x += 8) 00830 { 00831 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); 00832 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift); 00833 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); 00834 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift); 00835 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); 00836 00837 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i); 00838 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); 00839 } 00840 } 00841 00842 return x; 00843 } 00844 }; 00845 00846 template <> 00847 struct cvtScaleAbs_SIMD<short, uchar, float> 00848 { 00849 int operator () (const short * src, uchar * dst, int width, 00850 float scale, float shift) const 00851 { 00852 int x = 0; 00853 00854 if (USE_SSE2) 00855 { 00856 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), 00857 v_zero_f = _mm_setzero_ps(); 00858 __m128i v_zero_i = _mm_setzero_si128(); 00859 00860 for ( ; x <= width - 8; x += 8) 00861 { 00862 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); 00863 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift); 00864 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); 00865 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift); 00866 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); 00867 00868 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i); 00869 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); 00870 } 00871 } 00872 00873 return x; 00874 } 00875 }; 00876 00877 template <> 00878 struct cvtScaleAbs_SIMD<int, uchar, float> 00879 { 00880 int operator () (const int * src, uchar * dst, int width, 00881 float scale, float shift) const 00882 { 00883 int x = 0; 00884 00885 if (USE_SSE2) 00886 { 00887 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), 00888 v_zero_f = _mm_setzero_ps(); 00889 __m128i v_zero_i = _mm_setzero_si128(); 00890 00891 for ( ; x <= width - 8; x += 4) 00892 { 00893 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); 00894 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 00895 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); 00896 00897 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i); 00898 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); 00899 } 00900 } 00901 00902 return x; 00903 } 00904 }; 00905 00906 template <> 00907 struct cvtScaleAbs_SIMD<float, uchar, float> 00908 { 00909 int operator () (const float * src, uchar * dst, int width, 00910 float scale, float shift) const 00911 { 00912 int x = 0; 00913 00914 if (USE_SSE2) 00915 { 00916 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), 00917 v_zero_f = _mm_setzero_ps(); 00918 __m128i v_zero_i = _mm_setzero_si128(); 00919 00920 for ( ; x <= width - 8; x += 4) 00921 { 00922 __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift); 00923 v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst); 00924 00925 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i); 00926 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, 
v_zero_i)); 00927 } 00928 } 00929 00930 return x; 00931 } 00932 }; 00933 00934 template <> 00935 struct cvtScaleAbs_SIMD<double, uchar, float> 00936 { 00937 int operator () (const double * src, uchar * dst, int width, 00938 float scale, float shift) const 00939 { 00940 int x = 0; 00941 00942 if (USE_SSE2) 00943 { 00944 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), 00945 v_zero_f = _mm_setzero_ps(); 00946 __m128i v_zero_i = _mm_setzero_si128(); 00947 00948 for ( ; x <= width - 8; x += 8) 00949 { 00950 __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 00951 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 00952 __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 00953 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 00954 00955 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift); 00956 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); 00957 00958 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift); 00959 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); 00960 00961 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), 00962 _mm_cvtps_epi32(v_dst2)); 00963 00964 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); 00965 } 00966 } 00967 00968 return x; 00969 } 00970 }; 00971 00972 #elif CV_NEON 00973 00974 template <> 00975 struct cvtScaleAbs_SIMD<uchar, uchar, float> 00976 { 00977 int operator () (const uchar * src, uchar * dst, int width, 00978 float scale, float shift) const 00979 { 00980 int x = 0; 00981 float32x4_t v_shift = vdupq_n_f32(shift); 00982 00983 for ( ; x <= width - 16; x += 16) 00984 { 00985 uint8x16_t v_src = vld1q_u8(src + x); 00986 uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src)); 00987 00988 uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half)); 00989 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); 00990 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 00991 00992 v_quat = vmovl_u16(vget_high_u16(v_half)); 00993 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); 00994 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 00995 00996 v_half = vmovl_u8(vget_high_u8(v_src)); 00997 00998 v_quat = vmovl_u16(vget_low_u16(v_half)); 00999 float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); 01000 v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift)); 01001 01002 v_quat = vmovl_u16(vget_high_u16(v_half)); 01003 float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); 01004 v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); 01005 01006 uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 01007 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 01008 uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), 01009 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); 01010 01011 vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); 01012 } 01013 01014 return x; 01015 } 01016 }; 01017 01018 template <> 01019 struct cvtScaleAbs_SIMD<schar, uchar, float> 01020 { 01021 int operator () (const schar * src, uchar * dst, int width, 01022 float scale, float shift) const 01023 { 01024 int x = 0; 01025 float32x4_t v_shift = vdupq_n_f32(shift); 01026 01027 for ( ; x <= width - 16; x += 16) 01028 { 01029 int8x16_t v_src = vld1q_s8(src + x); 01030 int16x8_t v_half = vmovl_s8(vget_low_s8(v_src)); 01031 01032 int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half)); 01033 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); 01034 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 01035 01036 v_quat = 
vmovl_s16(vget_high_s16(v_half)); 01037 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); 01038 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 01039 01040 v_half = vmovl_s8(vget_high_s8(v_src)); 01041 01042 v_quat = vmovl_s16(vget_low_s16(v_half)); 01043 float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); 01044 v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift)); 01045 01046 v_quat = vmovl_s16(vget_high_s16(v_half)); 01047 float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); 01048 v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); 01049 01050 uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 01051 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 01052 uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), 01053 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); 01054 01055 vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); 01056 } 01057 01058 return x; 01059 } 01060 }; 01061 01062 template <> 01063 struct cvtScaleAbs_SIMD<ushort, uchar, float> 01064 { 01065 int operator () (const ushort * src, uchar * dst, int width, 01066 float scale, float shift) const 01067 { 01068 int x = 0; 01069 float32x4_t v_shift = vdupq_n_f32(shift); 01070 01071 for ( ; x <= width - 8; x += 8) 01072 { 01073 uint16x8_t v_src = vld1q_u16(src + x); 01074 01075 uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src)); 01076 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); 01077 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 01078 01079 v_half = vmovl_u16(vget_high_u16(v_src)); 01080 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); 01081 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 01082 01083 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 01084 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 01085 01086 vst1_u8(dst + x, vqmovn_u16(v_dst)); 01087 } 01088 01089 return x; 01090 } 01091 }; 01092 01093 template <> 01094 struct cvtScaleAbs_SIMD<short, uchar, float> 01095 { 01096 int operator () (const short * src, uchar * dst, int width, 01097 float scale, float shift) const 01098 { 01099 int x = 0; 01100 float32x4_t v_shift = vdupq_n_f32(shift); 01101 01102 for ( ; x <= width - 8; x += 8) 01103 { 01104 int16x8_t v_src = vld1q_s16(src + x); 01105 01106 int32x4_t v_half = vmovl_s16(vget_low_s16(v_src)); 01107 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); 01108 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 01109 01110 v_half = vmovl_s16(vget_high_s16(v_src)); 01111 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); 01112 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 01113 01114 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 01115 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 01116 01117 vst1_u8(dst + x, vqmovn_u16(v_dst)); 01118 } 01119 01120 return x; 01121 } 01122 }; 01123 01124 template <> 01125 struct cvtScaleAbs_SIMD<int, uchar, float> 01126 { 01127 int operator () (const int * src, uchar * dst, int width, 01128 float scale, float shift) const 01129 { 01130 int x = 0; 01131 float32x4_t v_shift = vdupq_n_f32(shift); 01132 01133 for ( ; x <= width - 8; x += 8) 01134 { 01135 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale); 01136 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 01137 uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); 01138 01139 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale); 01140 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 01141 
uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); 01142 01143 uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); 01144 vst1_u8(dst + x, vqmovn_u16(v_dst)); 01145 } 01146 01147 return x; 01148 } 01149 }; 01150 01151 template <> 01152 struct cvtScaleAbs_SIMD<float, uchar, float> 01153 { 01154 int operator () (const float * src, uchar * dst, int width, 01155 float scale, float shift) const 01156 { 01157 int x = 0; 01158 float32x4_t v_shift = vdupq_n_f32(shift); 01159 01160 for ( ; x <= width - 8; x += 8) 01161 { 01162 float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale); 01163 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 01164 uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); 01165 01166 float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale); 01167 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 01168 uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); 01169 01170 uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); 01171 vst1_u8(dst + x, vqmovn_u16(v_dst)); 01172 } 01173 01174 return x; 01175 } 01176 }; 01177 01178 #endif 01179 01180 template<typename T, typename DT, typename WT> static void 01181 cvtScaleAbs_( const T* src, size_t sstep, 01182 DT* dst, size_t dstep, Size size, 01183 WT scale, WT shift ) 01184 { 01185 sstep /= sizeof(src[0]); 01186 dstep /= sizeof(dst[0]); 01187 cvtScaleAbs_SIMD<T, DT, WT> vop; 01188 01189 for( ; size.height--; src += sstep, dst += dstep ) 01190 { 01191 int x = vop(src, dst, size.width, scale, shift); 01192 01193 #if CV_ENABLE_UNROLLED 01194 for( ; x <= size.width - 4; x += 4 ) 01195 { 01196 DT t0, t1; 01197 t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift)); 01198 t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift)); 01199 dst[x] = t0; dst[x+1] = t1; 01200 t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift)); 01201 t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift)); 01202 dst[x+2] = t0; dst[x+3] = t1; 01203 } 01204 #endif 01205 for( ; x < size.width; x++ ) 01206 dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift)); 01207 } 01208 } 01209 01210 template <typename T, typename DT, typename WT> 01211 struct cvtScale_SIMD 01212 { 01213 int operator () (const T *, DT *, int, WT, WT) const 01214 { 01215 return 0; 01216 } 01217 }; 01218 01219 #if CV_SSE2 01220 01221 // from uchar 01222 01223 template <> 01224 struct cvtScale_SIMD<uchar, uchar, float> 01225 { 01226 int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const 01227 { 01228 int x = 0; 01229 01230 if (!USE_SSE2) 01231 return x; 01232 01233 __m128i v_zero = _mm_setzero_si128(); 01234 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01235 01236 for ( ; x <= width - 8; x += 8) 01237 { 01238 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 01239 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01240 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01241 01242 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01243 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01244 01245 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01246 _mm_cvtps_epi32(v_dst_1)); 01247 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 01248 } 01249 01250 return x; 01251 } 01252 }; 01253 01254 template <> 01255 struct cvtScale_SIMD<uchar, schar, float> 01256 { 01257 int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const 01258 { 
01259 int x = 0; 01260 01261 if (!USE_SSE2) 01262 return x; 01263 01264 __m128i v_zero = _mm_setzero_si128(); 01265 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01266 01267 for ( ; x <= width - 8; x += 8) 01268 { 01269 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 01270 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01271 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01272 01273 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01274 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01275 01276 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01277 _mm_cvtps_epi32(v_dst_1)); 01278 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 01279 } 01280 01281 return x; 01282 } 01283 }; 01284 01285 #if CV_SSE4_1 01286 01287 template <> 01288 struct cvtScale_SIMD<uchar, ushort, float> 01289 { 01290 cvtScale_SIMD() 01291 { 01292 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 01293 } 01294 01295 int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const 01296 { 01297 int x = 0; 01298 01299 if (!haveSSE) 01300 return x; 01301 01302 __m128i v_zero = _mm_setzero_si128(); 01303 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01304 01305 for ( ; x <= width - 8; x += 8) 01306 { 01307 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 01308 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01309 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01310 01311 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01312 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01313 01314 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 01315 _mm_cvtps_epi32(v_dst_1)); 01316 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 01317 } 01318 01319 return x; 01320 } 01321 01322 bool haveSSE; 01323 }; 01324 01325 #endif 01326 01327 template <> 01328 struct cvtScale_SIMD<uchar, short, float> 01329 { 01330 int operator () (const uchar * src, short * dst, int width, float scale, float shift) const 01331 { 01332 int x = 0; 01333 01334 if (!USE_SSE2) 01335 return x; 01336 01337 __m128i v_zero = _mm_setzero_si128(); 01338 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01339 01340 for ( ; x <= width - 8; x += 8) 01341 { 01342 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 01343 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01344 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01345 01346 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01347 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01348 01349 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01350 _mm_cvtps_epi32(v_dst_1)); 01351 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 01352 } 01353 01354 return x; 01355 } 01356 }; 01357 01358 template <> 01359 struct cvtScale_SIMD<uchar, int, float> 01360 { 01361 int operator () (const uchar * src, int * dst, int width, float scale, float shift) const 01362 { 01363 int x = 0; 01364 01365 if (!USE_SSE2) 01366 return x; 01367 01368 __m128i v_zero = _mm_setzero_si128(); 01369 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01370 01371 for ( ; x <= width - 8; x += 8) 01372 { 01373 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + 
x)), v_zero); 01374 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01375 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01376 01377 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01378 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01379 01380 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 01381 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 01382 } 01383 01384 return x; 01385 } 01386 }; 01387 01388 template <> 01389 struct cvtScale_SIMD<uchar, float, float> 01390 { 01391 int operator () (const uchar * src, float * dst, int width, float scale, float shift) const 01392 { 01393 int x = 0; 01394 01395 if (!USE_SSE2) 01396 return x; 01397 01398 __m128i v_zero = _mm_setzero_si128(); 01399 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01400 01401 for ( ; x <= width - 8; x += 8) 01402 { 01403 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 01404 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01405 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01406 01407 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01408 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01409 01410 _mm_storeu_ps(dst + x, v_dst_0); 01411 _mm_storeu_ps(dst + x + 4, v_dst_1); 01412 } 01413 01414 return x; 01415 } 01416 }; 01417 01418 template <> 01419 struct cvtScale_SIMD<uchar, double, double> 01420 { 01421 int operator () (const uchar * src, double * dst, int width, double scale, double shift) const 01422 { 01423 int x = 0; 01424 01425 if (!USE_SSE2) 01426 return x; 01427 01428 __m128i v_zero = _mm_setzero_si128(); 01429 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 01430 01431 for ( ; x <= width - 8; x += 8) 01432 { 01433 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 01434 01435 __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); 01436 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 01437 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 01438 _mm_storeu_pd(dst + x, v_dst_0); 01439 _mm_storeu_pd(dst + x + 2, v_dst_1); 01440 01441 v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); 01442 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 01443 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 01444 _mm_storeu_pd(dst + x + 4, v_dst_0); 01445 _mm_storeu_pd(dst + x + 6, v_dst_1); 01446 } 01447 01448 return x; 01449 } 01450 }; 01451 01452 // from schar 01453 01454 template <> 01455 struct cvtScale_SIMD<schar, uchar, float> 01456 { 01457 int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const 01458 { 01459 int x = 0; 01460 01461 if (!USE_SSE2) 01462 return x; 01463 01464 __m128i v_zero = _mm_setzero_si128(); 01465 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01466 01467 for ( ; x <= width - 8; x += 8) 01468 { 01469 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 01470 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01471 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01472 01473 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01474 __m128 
v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01475 01476 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01477 _mm_cvtps_epi32(v_dst_1)); 01478 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 01479 } 01480 01481 return x; 01482 } 01483 }; 01484 01485 template <> 01486 struct cvtScale_SIMD<schar, schar, float> 01487 { 01488 int operator () (const schar * src, schar * dst, int width, float scale, float shift) const 01489 { 01490 int x = 0; 01491 01492 if (!USE_SSE2) 01493 return x; 01494 01495 __m128i v_zero = _mm_setzero_si128(); 01496 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01497 01498 for ( ; x <= width - 8; x += 8) 01499 { 01500 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 01501 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01502 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01503 01504 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01505 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01506 01507 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01508 _mm_cvtps_epi32(v_dst_1)); 01509 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 01510 } 01511 01512 return x; 01513 } 01514 }; 01515 01516 #if CV_SSE4_1 01517 01518 template <> 01519 struct cvtScale_SIMD<schar, ushort, float> 01520 { 01521 cvtScale_SIMD() 01522 { 01523 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 01524 } 01525 01526 int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const 01527 { 01528 int x = 0; 01529 01530 if (!haveSSE) 01531 return x; 01532 01533 __m128i v_zero = _mm_setzero_si128(); 01534 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01535 01536 for ( ; x <= width - 8; x += 8) 01537 { 01538 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 01539 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01540 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01541 01542 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01543 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01544 01545 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 01546 _mm_cvtps_epi32(v_dst_1)); 01547 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 01548 } 01549 01550 return x; 01551 } 01552 01553 bool haveSSE; 01554 }; 01555 01556 #endif 01557 01558 template <> 01559 struct cvtScale_SIMD<schar, short, float> 01560 { 01561 int operator () (const schar * src, short * dst, int width, float scale, float shift) const 01562 { 01563 int x = 0; 01564 01565 if (!USE_SSE2) 01566 return x; 01567 01568 __m128i v_zero = _mm_setzero_si128(); 01569 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01570 01571 for ( ; x <= width - 8; x += 8) 01572 { 01573 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 01574 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01575 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01576 01577 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01578 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01579 01580 __m128i v_dst = 
_mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01581 _mm_cvtps_epi32(v_dst_1)); 01582 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 01583 } 01584 01585 return x; 01586 } 01587 }; 01588 01589 template <> 01590 struct cvtScale_SIMD<schar, int, float> 01591 { 01592 int operator () (const schar * src, int * dst, int width, float scale, float shift) const 01593 { 01594 int x = 0; 01595 01596 if (!USE_SSE2) 01597 return x; 01598 01599 __m128i v_zero = _mm_setzero_si128(); 01600 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01601 01602 for ( ; x <= width - 8; x += 8) 01603 { 01604 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 01605 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01606 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01607 01608 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01609 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01610 01611 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 01612 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 01613 } 01614 01615 return x; 01616 } 01617 }; 01618 01619 template <> 01620 struct cvtScale_SIMD<schar, float, float> 01621 { 01622 int operator () (const schar * src, float * dst, int width, float scale, float shift) const 01623 { 01624 int x = 0; 01625 01626 if (!USE_SSE2) 01627 return x; 01628 01629 __m128i v_zero = _mm_setzero_si128(); 01630 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01631 01632 for ( ; x <= width - 8; x += 8) 01633 { 01634 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 01635 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01636 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01637 01638 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01639 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01640 01641 _mm_storeu_ps(dst + x, v_dst_0); 01642 _mm_storeu_ps(dst + x + 4, v_dst_1); 01643 } 01644 01645 return x; 01646 } 01647 }; 01648 01649 template <> 01650 struct cvtScale_SIMD<schar, double, double> 01651 { 01652 int operator () (const schar * src, double * dst, int width, double scale, double shift) const 01653 { 01654 int x = 0; 01655 01656 if (!USE_SSE2) 01657 return x; 01658 01659 __m128i v_zero = _mm_setzero_si128(); 01660 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 01661 01662 for ( ; x <= width - 8; x += 8) 01663 { 01664 __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))); 01665 v_src = _mm_srai_epi16(v_src, 8); 01666 01667 __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); 01668 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 01669 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 01670 _mm_storeu_pd(dst + x, v_dst_0); 01671 _mm_storeu_pd(dst + x + 2, v_dst_1); 01672 01673 v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); 01674 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 01675 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 01676 _mm_storeu_pd(dst + x + 4, v_dst_0); 01677 _mm_storeu_pd(dst + x + 6, v_dst_1); 01678 } 
01679 01680 return x; 01681 } 01682 }; 01683 01684 // from ushort 01685 01686 template <> 01687 struct cvtScale_SIMD<ushort, uchar, float> 01688 { 01689 int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const 01690 { 01691 int x = 0; 01692 01693 if (!USE_SSE2) 01694 return x; 01695 01696 __m128i v_zero = _mm_setzero_si128(); 01697 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01698 01699 for ( ; x <= width - 8; x += 8) 01700 { 01701 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01702 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01703 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01704 01705 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01706 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01707 01708 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01709 _mm_cvtps_epi32(v_dst_1)); 01710 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 01711 } 01712 01713 return x; 01714 } 01715 }; 01716 01717 template <> 01718 struct cvtScale_SIMD<ushort, schar, float> 01719 { 01720 int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const 01721 { 01722 int x = 0; 01723 01724 if (!USE_SSE2) 01725 return x; 01726 01727 __m128i v_zero = _mm_setzero_si128(); 01728 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01729 01730 for ( ; x <= width - 8; x += 8) 01731 { 01732 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01733 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01734 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01735 01736 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01737 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01738 01739 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01740 _mm_cvtps_epi32(v_dst_1)); 01741 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 01742 } 01743 01744 return x; 01745 } 01746 }; 01747 01748 #if CV_SSE4_1 01749 01750 template <> 01751 struct cvtScale_SIMD<ushort, ushort, float> 01752 { 01753 cvtScale_SIMD() 01754 { 01755 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 01756 } 01757 01758 int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const 01759 { 01760 int x = 0; 01761 01762 if (!haveSSE) 01763 return x; 01764 01765 __m128i v_zero = _mm_setzero_si128(); 01766 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01767 01768 for ( ; x <= width - 8; x += 8) 01769 { 01770 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01771 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01772 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01773 01774 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01775 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01776 01777 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 01778 _mm_cvtps_epi32(v_dst_1)); 01779 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 01780 } 01781 01782 return x; 01783 } 01784 01785 bool haveSSE; 01786 }; 01787 01788 #endif 01789 01790 template <> 01791 struct cvtScale_SIMD<ushort, short, float> 01792 { 01793 int operator () (const ushort * src, short * dst, int width, float scale, float shift) const 01794 { 01795 int x = 0; 01796 01797 if (!USE_SSE2) 01798 return x; 01799 01800 __m128i v_zero = _mm_setzero_si128(); 
01801 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01802 01803 for ( ; x <= width - 8; x += 8) 01804 { 01805 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01806 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01807 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01808 01809 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01810 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01811 01812 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01813 _mm_cvtps_epi32(v_dst_1)); 01814 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 01815 } 01816 01817 return x; 01818 } 01819 }; 01820 01821 template <> 01822 struct cvtScale_SIMD<ushort, int, float> 01823 { 01824 int operator () (const ushort * src, int * dst, int width, float scale, float shift) const 01825 { 01826 int x = 0; 01827 01828 if (!USE_SSE2) 01829 return x; 01830 01831 __m128i v_zero = _mm_setzero_si128(); 01832 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01833 01834 for ( ; x <= width - 8; x += 8) 01835 { 01836 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01837 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01838 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01839 01840 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01841 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01842 01843 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 01844 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 01845 } 01846 01847 return x; 01848 } 01849 }; 01850 01851 template <> 01852 struct cvtScale_SIMD<ushort, float, float> 01853 { 01854 int operator () (const ushort * src, float * dst, int width, float scale, float shift) const 01855 { 01856 int x = 0; 01857 01858 if (!USE_SSE2) 01859 return x; 01860 01861 __m128i v_zero = _mm_setzero_si128(); 01862 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01863 01864 for ( ; x <= width - 8; x += 8) 01865 { 01866 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01867 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 01868 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01869 01870 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 01871 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01872 01873 _mm_storeu_ps(dst + x, v_dst_0); 01874 _mm_storeu_ps(dst + x + 4, v_dst_1); 01875 } 01876 01877 return x; 01878 } 01879 }; 01880 01881 template <> 01882 struct cvtScale_SIMD<ushort, double, double> 01883 { 01884 int operator () (const ushort * src, double * dst, int width, double scale, double shift) const 01885 { 01886 int x = 0; 01887 01888 if (!USE_SSE2) 01889 return x; 01890 01891 __m128i v_zero = _mm_setzero_si128(); 01892 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 01893 01894 for ( ; x <= width - 8; x += 8) 01895 { 01896 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01897 01898 __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); 01899 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 01900 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 01901 _mm_storeu_pd(dst + x, v_dst_0); 01902 _mm_storeu_pd(dst + x + 2, v_dst_1); 01903 01904 v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); 01905 v_dst_0 = 
_mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 01906 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 01907 _mm_storeu_pd(dst + x + 4, v_dst_0); 01908 _mm_storeu_pd(dst + x + 6, v_dst_1); 01909 } 01910 01911 return x; 01912 } 01913 }; 01914 01915 // from short 01916 01917 template <> 01918 struct cvtScale_SIMD<short, uchar, float> 01919 { 01920 int operator () (const short * src, uchar * dst, int width, float scale, float shift) const 01921 { 01922 int x = 0; 01923 01924 if (!USE_SSE2) 01925 return x; 01926 01927 __m128i v_zero = _mm_setzero_si128(); 01928 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01929 01930 for ( ; x <= width - 8; x += 8) 01931 { 01932 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01933 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01934 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01935 01936 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01937 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01938 01939 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01940 _mm_cvtps_epi32(v_dst_1)); 01941 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 01942 } 01943 01944 return x; 01945 } 01946 }; 01947 01948 template <> 01949 struct cvtScale_SIMD<short, schar, float> 01950 { 01951 int operator () (const short * src, schar * dst, int width, float scale, float shift) const 01952 { 01953 int x = 0; 01954 01955 if (!USE_SSE2) 01956 return x; 01957 01958 __m128i v_zero = _mm_setzero_si128(); 01959 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01960 01961 for ( ; x <= width - 8; x += 8) 01962 { 01963 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 01964 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 01965 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01966 01967 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 01968 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 01969 01970 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 01971 _mm_cvtps_epi32(v_dst_1)); 01972 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 01973 } 01974 01975 return x; 01976 } 01977 }; 01978 01979 #if CV_SSE4_1 01980 01981 template <> 01982 struct cvtScale_SIMD<short, ushort, float> 01983 { 01984 cvtScale_SIMD() 01985 { 01986 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 01987 } 01988 01989 int operator () (const short * src, ushort * dst, int width, float scale, float shift) const 01990 { 01991 int x = 0; 01992 01993 if (!haveSSE) 01994 return x; 01995 01996 __m128i v_zero = _mm_setzero_si128(); 01997 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 01998 01999 for ( ; x <= width - 8; x += 8) 02000 { 02001 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02002 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 02003 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02004 02005 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 02006 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02007 02008 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 02009 _mm_cvtps_epi32(v_dst_1)); 02010 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02011 } 
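        // The CV_SSE4_1-guarded specialisations (all of them targeting ushort outputs)
        // exist because saturating a packed 32-bit result into unsigned 16-bit lanes
        // needs _mm_packus_epi32, which SSE2 does not provide. Support is probed once
        // in the constructor via checkHardwareSupport(CV_CPU_SSE4_1); when it is
        // absent, operator() returns 0 and the caller falls back to the scalar loop.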
02012 02013 return x; 02014 } 02015 02016 bool haveSSE; 02017 }; 02018 02019 #endif 02020 02021 template <> 02022 struct cvtScale_SIMD<short, short, float> 02023 { 02024 int operator () (const short * src, short * dst, int width, float scale, float shift) const 02025 { 02026 int x = 0; 02027 02028 if (!USE_SSE2) 02029 return x; 02030 02031 __m128i v_zero = _mm_setzero_si128(); 02032 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02033 02034 for ( ; x <= width - 8; x += 8) 02035 { 02036 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02037 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 02038 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02039 02040 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 02041 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02042 02043 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02044 _mm_cvtps_epi32(v_dst_1)); 02045 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02046 } 02047 02048 return x; 02049 } 02050 }; 02051 02052 template <> 02053 struct cvtScale_SIMD<short, int, float> 02054 { 02055 int operator () (const short * src, int * dst, int width, float scale, float shift) const 02056 { 02057 int x = 0; 02058 02059 if (!USE_SSE2) 02060 return x; 02061 02062 __m128i v_zero = _mm_setzero_si128(); 02063 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02064 02065 for ( ; x <= width - 8; x += 8) 02066 { 02067 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02068 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 02069 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02070 02071 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 02072 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02073 02074 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 02075 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 02076 } 02077 02078 return x; 02079 } 02080 }; 02081 02082 template <> 02083 struct cvtScale_SIMD<short, float, float> 02084 { 02085 int operator () (const short * src, float * dst, int width, float scale, float shift) const 02086 { 02087 int x = 0; 02088 02089 if (!USE_SSE2) 02090 return x; 02091 02092 __m128i v_zero = _mm_setzero_si128(); 02093 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02094 02095 for ( ; x <= width - 8; x += 8) 02096 { 02097 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02098 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 02099 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02100 02101 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 02102 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 02103 02104 _mm_storeu_ps(dst + x, v_dst_0); 02105 _mm_storeu_ps(dst + x + 4, v_dst_1); 02106 } 02107 02108 return x; 02109 } 02110 }; 02111 02112 template <> 02113 struct cvtScale_SIMD<short, double, double> 02114 { 02115 int operator () (const short * src, double * dst, int width, double scale, double shift) const 02116 { 02117 int x = 0; 02118 02119 if (!USE_SSE2) 02120 return x; 02121 02122 __m128i v_zero = _mm_setzero_si128(); 02123 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02124 02125 for ( ; x <= width - 8; x += 8) 02126 { 02127 __m128i v_src = 
_mm_loadu_si128((__m128i const *)(src + x)); 02128 02129 __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); 02130 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 02131 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 02132 _mm_storeu_pd(dst + x, v_dst_0); 02133 _mm_storeu_pd(dst + x + 2, v_dst_1); 02134 02135 v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); 02136 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 02137 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 02138 _mm_storeu_pd(dst + x + 4, v_dst_0); 02139 _mm_storeu_pd(dst + x + 6, v_dst_1); 02140 } 02141 02142 return x; 02143 } 02144 }; 02145 02146 // from int 02147 02148 template <> 02149 struct cvtScale_SIMD<int, uchar, float> 02150 { 02151 int operator () (const int * src, uchar * dst, int width, float scale, float shift) const 02152 { 02153 int x = 0; 02154 02155 if (!USE_SSE2) 02156 return x; 02157 02158 __m128i v_zero = _mm_setzero_si128(); 02159 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02160 02161 for ( ; x <= width - 8; x += 8) 02162 { 02163 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02164 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02165 02166 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 02167 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02168 02169 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02170 _mm_cvtps_epi32(v_dst_1)); 02171 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 02172 } 02173 02174 return x; 02175 } 02176 }; 02177 02178 template <> 02179 struct cvtScale_SIMD<int, schar, float> 02180 { 02181 int operator () (const int * src, schar * dst, int width, float scale, float shift) const 02182 { 02183 int x = 0; 02184 02185 if (!USE_SSE2) 02186 return x; 02187 02188 __m128i v_zero = _mm_setzero_si128(); 02189 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02190 02191 for ( ; x <= width - 8; x += 8) 02192 { 02193 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02194 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02195 02196 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 02197 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02198 02199 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02200 _mm_cvtps_epi32(v_dst_1)); 02201 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 02202 } 02203 02204 return x; 02205 } 02206 }; 02207 02208 #if CV_SSE4_1 02209 02210 template <> 02211 struct cvtScale_SIMD<int, ushort, float> 02212 { 02213 cvtScale_SIMD() 02214 { 02215 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 02216 } 02217 02218 int operator () (const int * src, ushort * dst, int width, float scale, float shift) const 02219 { 02220 int x = 0; 02221 02222 if (!haveSSE) 02223 return x; 02224 02225 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02226 02227 for ( ; x <= width - 8; x += 8) 02228 { 02229 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02230 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02231 02232 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 02233 __m128 v_dst_1 = 
_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02234 02235 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 02236 _mm_cvtps_epi32(v_dst_1)); 02237 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02238 } 02239 02240 return x; 02241 } 02242 02243 bool haveSSE; 02244 }; 02245 02246 #endif 02247 02248 template <> 02249 struct cvtScale_SIMD<int, short, float> 02250 { 02251 int operator () (const int * src, short * dst, int width, float scale, float shift) const 02252 { 02253 int x = 0; 02254 02255 if (!USE_SSE2) 02256 return x; 02257 02258 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02259 02260 for ( ; x <= width - 8; x += 8) 02261 { 02262 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02263 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02264 02265 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 02266 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 02267 02268 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02269 _mm_cvtps_epi32(v_dst_1)); 02270 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02271 } 02272 02273 return x; 02274 } 02275 }; 02276 02277 template <> 02278 struct cvtScale_SIMD<int, int, double> 02279 { 02280 int operator () (const int * src, int * dst, int width, double scale, double shift) const 02281 { 02282 int x = 0; 02283 02284 if (!USE_SSE2) 02285 return x; 02286 02287 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02288 02289 for ( ; x <= width - 4; x += 4) 02290 { 02291 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02292 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 02293 02294 v_src = _mm_srli_si128(v_src, 8); 02295 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 02296 02297 __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)), 02298 _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1))); 02299 02300 _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); 02301 } 02302 02303 return x; 02304 } 02305 }; 02306 02307 template <> 02308 struct cvtScale_SIMD<int, float, double> 02309 { 02310 int operator () (const int * src, float * dst, int width, double scale, double shift) const 02311 { 02312 int x = 0; 02313 02314 if (!USE_SSE2) 02315 return x; 02316 02317 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02318 02319 for ( ; x <= width - 4; x += 4) 02320 { 02321 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02322 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 02323 02324 v_src = _mm_srli_si128(v_src, 8); 02325 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 02326 02327 _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0), 02328 _mm_cvtpd_ps(v_dst_1))); 02329 } 02330 02331 return x; 02332 } 02333 }; 02334 02335 template <> 02336 struct cvtScale_SIMD<int, double, double> 02337 { 02338 int operator () (const int * src, double * dst, int width, double scale, double shift) const 02339 { 02340 int x = 0; 02341 02342 if (!USE_SSE2) 02343 return x; 02344 02345 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02346 02347 for ( ; x <= width - 4; x += 4) 02348 { 02349 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 02350 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 02351 02352 v_src = _mm_srli_si128(v_src, 8); 02353 __m128d v_dst_1 
= _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 02354 02355 _mm_storeu_pd(dst + x, v_dst_0); 02356 _mm_storeu_pd(dst + x + 2, v_dst_1); 02357 } 02358 02359 return x; 02360 } 02361 }; 02362 02363 // from float 02364 02365 template <> 02366 struct cvtScale_SIMD<float, uchar, float> 02367 { 02368 int operator () (const float * src, uchar * dst, int width, float scale, float shift) const 02369 { 02370 int x = 0; 02371 02372 if (!USE_SSE2) 02373 return x; 02374 02375 __m128i v_zero = _mm_setzero_si128(); 02376 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02377 02378 for ( ; x <= width - 8; x += 8) 02379 { 02380 __m128 v_src = _mm_loadu_ps(src + x); 02381 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02382 02383 v_src = _mm_loadu_ps(src + x + 4); 02384 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02385 02386 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02387 _mm_cvtps_epi32(v_dst_1)); 02388 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 02389 } 02390 02391 return x; 02392 } 02393 }; 02394 02395 template <> 02396 struct cvtScale_SIMD<float, schar, float> 02397 { 02398 int operator () (const float * src, schar * dst, int width, float scale, float shift) const 02399 { 02400 int x = 0; 02401 02402 if (!USE_SSE2) 02403 return x; 02404 02405 __m128i v_zero = _mm_setzero_si128(); 02406 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02407 02408 for ( ; x <= width - 8; x += 8) 02409 { 02410 __m128 v_src = _mm_loadu_ps(src + x); 02411 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02412 02413 v_src = _mm_loadu_ps(src + x + 4); 02414 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02415 02416 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02417 _mm_cvtps_epi32(v_dst_1)); 02418 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 02419 } 02420 02421 return x; 02422 } 02423 }; 02424 02425 #if CV_SSE4_1 02426 02427 template <> 02428 struct cvtScale_SIMD<float, ushort, float> 02429 { 02430 cvtScale_SIMD() 02431 { 02432 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 02433 } 02434 02435 int operator () (const float * src, ushort * dst, int width, float scale, float shift) const 02436 { 02437 int x = 0; 02438 02439 if (!haveSSE) 02440 return x; 02441 02442 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02443 02444 for ( ; x <= width - 8; x += 8) 02445 { 02446 __m128 v_src = _mm_loadu_ps(src + x); 02447 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02448 02449 v_src = _mm_loadu_ps(src + x + 4); 02450 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02451 02452 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 02453 _mm_cvtps_epi32(v_dst_1)); 02454 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02455 } 02456 02457 return x; 02458 } 02459 02460 bool haveSSE; 02461 }; 02462 02463 #endif 02464 02465 template <> 02466 struct cvtScale_SIMD<float, short, float> 02467 { 02468 int operator () (const float * src, short * dst, int width, float scale, float shift) const 02469 { 02470 int x = 0; 02471 02472 if (!USE_SSE2) 02473 return x; 02474 02475 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02476 02477 for ( ; x <= width - 8; x += 8) 02478 { 02479 __m128 v_src = _mm_loadu_ps(src + x); 02480 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02481 02482 v_src = _mm_loadu_ps(src + x + 4); 02483 __m128 v_dst_1 = 
_mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02484 02485 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02486 _mm_cvtps_epi32(v_dst_1)); 02487 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02488 } 02489 02490 return x; 02491 } 02492 }; 02493 02494 template <> 02495 struct cvtScale_SIMD<float, int, float> 02496 { 02497 int operator () (const float * src, int * dst, int width, float scale, float shift) const 02498 { 02499 int x = 0; 02500 02501 if (!USE_SSE2) 02502 return x; 02503 02504 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02505 02506 for ( ; x <= width - 8; x += 8) 02507 { 02508 __m128 v_src = _mm_loadu_ps(src + x); 02509 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02510 02511 v_src = _mm_loadu_ps(src + x + 4); 02512 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02513 02514 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 02515 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 02516 } 02517 02518 return x; 02519 } 02520 }; 02521 02522 template <> 02523 struct cvtScale_SIMD<float, float, float> 02524 { 02525 int operator () (const float * src, float * dst, int width, float scale, float shift) const 02526 { 02527 int x = 0; 02528 02529 if (!USE_SSE2) 02530 return x; 02531 02532 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02533 02534 for ( ; x <= width - 4; x += 4) 02535 { 02536 __m128 v_src = _mm_loadu_ps(src + x); 02537 __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02538 _mm_storeu_ps(dst + x, v_dst); 02539 } 02540 02541 return x; 02542 } 02543 }; 02544 02545 template <> 02546 struct cvtScale_SIMD<float, double, double> 02547 { 02548 int operator () (const float * src, double * dst, int width, double scale, double shift) const 02549 { 02550 int x = 0; 02551 02552 if (!USE_SSE2) 02553 return x; 02554 02555 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02556 02557 for ( ; x <= width - 4; x += 4) 02558 { 02559 __m128 v_src = _mm_loadu_ps(src + x); 02560 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); 02561 v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); 02562 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); 02563 02564 _mm_storeu_pd(dst + x, v_dst_0); 02565 _mm_storeu_pd(dst + x + 2, v_dst_1); 02566 } 02567 02568 return x; 02569 } 02570 }; 02571 02572 // from double 02573 02574 template <> 02575 struct cvtScale_SIMD<double, uchar, float> 02576 { 02577 int operator () (const double * src, uchar * dst, int width, float scale, float shift) const 02578 { 02579 int x = 0; 02580 02581 if (!USE_SSE2) 02582 return x; 02583 02584 __m128i v_zero = _mm_setzero_si128(); 02585 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02586 02587 for ( ; x <= width - 8; x += 8) 02588 { 02589 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 02590 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 02591 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02592 02593 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 02594 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 02595 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02596 02597 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02598 _mm_cvtps_epi32(v_dst_1)); 02599 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 02600 } 02601 02602 return x; 02603 } 02604 }; 02605 02606 template <> 02607 struct 
cvtScale_SIMD<double, schar, float> 02608 { 02609 int operator () (const double * src, schar * dst, int width, float scale, float shift) const 02610 { 02611 int x = 0; 02612 02613 if (!USE_SSE2) 02614 return x; 02615 02616 __m128i v_zero = _mm_setzero_si128(); 02617 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02618 02619 for ( ; x <= width - 8; x += 8) 02620 { 02621 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 02622 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 02623 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02624 02625 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 02626 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 02627 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02628 02629 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02630 _mm_cvtps_epi32(v_dst_1)); 02631 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 02632 } 02633 02634 return x; 02635 } 02636 }; 02637 02638 #if CV_SSE4_1 02639 02640 template <> 02641 struct cvtScale_SIMD<double, ushort, float> 02642 { 02643 cvtScale_SIMD() 02644 { 02645 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 02646 } 02647 02648 int operator () (const double * src, ushort * dst, int width, float scale, float shift) const 02649 { 02650 int x = 0; 02651 02652 if (!haveSSE) 02653 return x; 02654 02655 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02656 02657 for ( ; x <= width - 8; x += 8) 02658 { 02659 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 02660 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 02661 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02662 02663 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 02664 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 02665 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02666 02667 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 02668 _mm_cvtps_epi32(v_dst_1)); 02669 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02670 } 02671 02672 return x; 02673 } 02674 02675 bool haveSSE; 02676 }; 02677 02678 #endif 02679 02680 template <> 02681 struct cvtScale_SIMD<double, short, float> 02682 { 02683 int operator () (const double * src, short * dst, int width, float scale, float shift) const 02684 { 02685 int x = 0; 02686 02687 if (!USE_SSE2) 02688 return x; 02689 02690 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 02691 02692 for ( ; x <= width - 8; x += 8) 02693 { 02694 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 02695 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 02696 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02697 02698 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 02699 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 02700 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 02701 02702 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 02703 _mm_cvtps_epi32(v_dst_1)); 02704 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 02705 } 02706 02707 return x; 02708 } 02709 }; 02710 02711 template <> 02712 struct cvtScale_SIMD<double, int, double> 02713 { 02714 int operator () (const double * src, int * dst, int width, double scale, double shift) const 02715 { 02716 int x = 0; 02717 02718 if (!USE_SSE2) 02719 return x; 02720 02721 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02722 02723 for ( ; x <= width - 4; x += 4) 02724 { 02725 __m128d v_src = _mm_loadu_pd(src + x); 02726 __m128d v_dst0 
= _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 02727 02728 v_src = _mm_loadu_pd(src + x + 2); 02729 __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 02730 02731 __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)), 02732 _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1))); 02733 02734 _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); 02735 } 02736 02737 return x; 02738 } 02739 }; 02740 02741 template <> 02742 struct cvtScale_SIMD<double, float, double> 02743 { 02744 int operator () (const double * src, float * dst, int width, double scale, double shift) const 02745 { 02746 int x = 0; 02747 02748 if (!USE_SSE2) 02749 return x; 02750 02751 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02752 02753 for ( ; x <= width - 4; x += 4) 02754 { 02755 __m128d v_src = _mm_loadu_pd(src + x); 02756 __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 02757 02758 v_src = _mm_loadu_pd(src + x + 2); 02759 __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 02760 02761 __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0), 02762 _mm_cvtpd_ps(v_dst1)); 02763 02764 _mm_storeu_ps(dst + x, v_dst); 02765 } 02766 02767 return x; 02768 } 02769 }; 02770 02771 template <> 02772 struct cvtScale_SIMD<double, double, double> 02773 { 02774 int operator () (const double * src, double * dst, int width, double scale, double shift) const 02775 { 02776 int x = 0; 02777 02778 if (!USE_SSE2) 02779 return x; 02780 02781 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 02782 02783 for ( ; x <= width - 2; x += 2) 02784 { 02785 __m128d v_src = _mm_loadu_pd(src + x); 02786 __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 02787 _mm_storeu_pd(dst + x, v_dst); 02788 } 02789 02790 return x; 02791 } 02792 }; 02793 02794 #elif CV_NEON 02795 02796 // from uchar 02797 02798 template <> 02799 struct cvtScale_SIMD<uchar, uchar, float> 02800 { 02801 int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const 02802 { 02803 int x = 0; 02804 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02805 02806 for ( ; x <= width - 8; x += 8) 02807 { 02808 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 02809 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 02810 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 02811 02812 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 02813 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 02814 vst1_u8(dst + x, vqmovn_u16(v_dst)); 02815 } 02816 02817 return x; 02818 } 02819 }; 02820 02821 template <> 02822 struct cvtScale_SIMD<uchar, schar, float> 02823 { 02824 int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const 02825 { 02826 int x = 0; 02827 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02828 02829 for ( ; x <= width - 8; x += 8) 02830 { 02831 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 02832 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 02833 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 02834 02835 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 02836 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 02837 vst1_s8(dst + x, vqmovn_s16(v_dst)); 02838 } 02839 02840 return x; 02841 } 02842 }; 02843 02844 template <> 
02845 struct cvtScale_SIMD<uchar, ushort, float> 02846 { 02847 int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const 02848 { 02849 int x = 0; 02850 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02851 02852 for ( ; x <= width - 8; x += 8) 02853 { 02854 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 02855 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 02856 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 02857 02858 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 02859 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 02860 vst1q_u16(dst + x, v_dst); 02861 } 02862 02863 return x; 02864 } 02865 }; 02866 02867 template <> 02868 struct cvtScale_SIMD<uchar, short, float> 02869 { 02870 int operator () (const uchar * src, short * dst, int width, float scale, float shift) const 02871 { 02872 int x = 0; 02873 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02874 02875 for ( ; x <= width - 8; x += 8) 02876 { 02877 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 02878 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 02879 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 02880 02881 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 02882 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 02883 vst1q_s16(dst + x, v_dst); 02884 } 02885 02886 return x; 02887 } 02888 }; 02889 02890 template <> 02891 struct cvtScale_SIMD<uchar, int, float> 02892 { 02893 int operator () (const uchar * src, int * dst, int width, float scale, float shift) const 02894 { 02895 int x = 0; 02896 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02897 02898 for ( ; x <= width - 8; x += 8) 02899 { 02900 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 02901 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 02902 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 02903 02904 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); 02905 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); 02906 } 02907 02908 return x; 02909 } 02910 }; 02911 02912 template <> 02913 struct cvtScale_SIMD<uchar, float, float> 02914 { 02915 int operator () (const uchar * src, float * dst, int width, float scale, float shift) const 02916 { 02917 int x = 0; 02918 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02919 02920 for ( ; x <= width - 8; x += 8) 02921 { 02922 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 02923 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); 02924 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); 02925 } 02926 02927 return x; 02928 } 02929 }; 02930 02931 // from schar 02932 02933 template <> 02934 struct cvtScale_SIMD<schar, uchar, float> 02935 { 02936 int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const 02937 { 02938 int x = 0; 02939 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02940 02941 for ( ; x <= width - 8; x += 8) 02942 { 02943 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 02944 float32x4_t v_dst1 = 
vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 02945 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 02946 02947 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 02948 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 02949 vst1_u8(dst + x, vqmovn_u16(v_dst)); 02950 } 02951 02952 return x; 02953 } 02954 }; 02955 02956 template <> 02957 struct cvtScale_SIMD<schar, schar, float> 02958 { 02959 int operator () (const schar * src, schar * dst, int width, float scale, float shift) const 02960 { 02961 int x = 0; 02962 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02963 02964 for ( ; x <= width - 8; x += 8) 02965 { 02966 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 02967 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 02968 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 02969 02970 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 02971 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 02972 vst1_s8(dst + x, vqmovn_s16(v_dst)); 02973 } 02974 02975 return x; 02976 } 02977 }; 02978 02979 template <> 02980 struct cvtScale_SIMD<schar, ushort, float> 02981 { 02982 int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const 02983 { 02984 int x = 0; 02985 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 02986 02987 for ( ; x <= width - 8; x += 8) 02988 { 02989 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 02990 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 02991 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 02992 02993 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 02994 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 02995 vst1q_u16(dst + x, v_dst); 02996 } 02997 02998 return x; 02999 } 03000 }; 03001 03002 template <> 03003 struct cvtScale_SIMD<schar, short, float> 03004 { 03005 int operator () (const schar * src, short * dst, int width, float scale, float shift) const 03006 { 03007 int x = 0; 03008 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03009 03010 for ( ; x <= width - 8; x += 8) 03011 { 03012 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 03013 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 03014 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 03015 03016 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03017 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03018 vst1q_s16(dst + x, v_dst); 03019 } 03020 03021 return x; 03022 } 03023 }; 03024 03025 template <> 03026 struct cvtScale_SIMD<schar, int, float> 03027 { 03028 int operator () (const schar * src, int * dst, int width, float scale, float shift) const 03029 { 03030 int x = 0; 03031 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03032 03033 for ( ; x <= width - 8; x += 8) 03034 { 03035 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 03036 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 03037 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 03038 03039 vst1q_s32(dst + x, 
cv_vrndq_s32_f32(v_dst1)); 03040 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); 03041 } 03042 03043 return x; 03044 } 03045 }; 03046 03047 template <> 03048 struct cvtScale_SIMD<schar, float, float> 03049 { 03050 int operator () (const schar * src, float * dst, int width, float scale, float shift) const 03051 { 03052 int x = 0; 03053 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03054 03055 for ( ; x <= width - 8; x += 8) 03056 { 03057 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 03058 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); 03059 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); 03060 } 03061 03062 return x; 03063 } 03064 }; 03065 03066 // from ushort 03067 03068 template <> 03069 struct cvtScale_SIMD<ushort, uchar, float> 03070 { 03071 int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const 03072 { 03073 int x = 0; 03074 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03075 03076 for ( ; x <= width - 8; x += 8) 03077 { 03078 uint16x8_t v_src = vld1q_u16(src + x); 03079 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 03080 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 03081 03082 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03083 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03084 vst1_u8(dst + x, vqmovn_u16(v_dst)); 03085 } 03086 03087 return x; 03088 } 03089 }; 03090 03091 template <> 03092 struct cvtScale_SIMD<ushort, schar, float> 03093 { 03094 int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const 03095 { 03096 int x = 0; 03097 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03098 03099 for ( ; x <= width - 8; x += 8) 03100 { 03101 uint16x8_t v_src = vld1q_u16(src + x); 03102 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 03103 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 03104 03105 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03106 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03107 vst1_s8(dst + x, vqmovn_s16(v_dst)); 03108 } 03109 03110 return x; 03111 } 03112 }; 03113 03114 template <> 03115 struct cvtScale_SIMD<ushort, ushort, float> 03116 { 03117 int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const 03118 { 03119 int x = 0; 03120 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03121 03122 for ( ; x <= width - 8; x += 8) 03123 { 03124 uint16x8_t v_src = vld1q_u16(src + x); 03125 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 03126 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 03127 03128 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03129 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03130 vst1q_u16(dst + x, v_dst); 03131 } 03132 03133 return x; 03134 } 03135 }; 03136 03137 template <> 03138 struct cvtScale_SIMD<ushort, short, float> 03139 { 03140 int operator () (const ushort * src, short * dst, int width, float scale, float shift) const 03141 { 03142 int x = 0; 03143 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = 
vdupq_n_f32(scale); 03144 03145 for ( ; x <= width - 8; x += 8) 03146 { 03147 uint16x8_t v_src = vld1q_u16(src + x); 03148 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 03149 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 03150 03151 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03152 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03153 vst1q_s16(dst + x, v_dst); 03154 } 03155 03156 return x; 03157 } 03158 }; 03159 03160 template <> 03161 struct cvtScale_SIMD<ushort, int, float> 03162 { 03163 int operator () (const ushort * src, int * dst, int width, float scale, float shift) const 03164 { 03165 int x = 0; 03166 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03167 03168 for ( ; x <= width - 8; x += 8) 03169 { 03170 uint16x8_t v_src = vld1q_u16(src + x); 03171 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 03172 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 03173 03174 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); 03175 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); 03176 } 03177 03178 return x; 03179 } 03180 }; 03181 03182 template <> 03183 struct cvtScale_SIMD<ushort, float, float> 03184 { 03185 int operator () (const ushort * src, float * dst, int width, float scale, float shift) const 03186 { 03187 int x = 0; 03188 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03189 03190 for ( ; x <= width - 8; x += 8) 03191 { 03192 uint16x8_t v_src = vld1q_u16(src + x); 03193 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); 03194 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); 03195 } 03196 03197 return x; 03198 } 03199 }; 03200 03201 // from short 03202 03203 template <> 03204 struct cvtScale_SIMD<short, uchar, float> 03205 { 03206 int operator () (const short * src, uchar * dst, int width, float scale, float shift) const 03207 { 03208 int x = 0; 03209 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03210 03211 for ( ; x <= width - 8; x += 8) 03212 { 03213 int16x8_t v_src = vld1q_s16(src + x); 03214 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 03215 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 03216 03217 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03218 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03219 vst1_u8(dst + x, vqmovn_u16(v_dst)); 03220 } 03221 03222 return x; 03223 } 03224 }; 03225 03226 template <> 03227 struct cvtScale_SIMD<short, schar, float> 03228 { 03229 int operator () (const short * src, schar * dst, int width, float scale, float shift) const 03230 { 03231 int x = 0; 03232 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03233 03234 for ( ; x <= width - 8; x += 8) 03235 { 03236 int16x8_t v_src = vld1q_s16(src + x); 03237 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 03238 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 03239 03240 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03241 
vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03242 vst1_s8(dst + x, vqmovn_s16(v_dst)); 03243 } 03244 03245 return x; 03246 } 03247 }; 03248 03249 template <> 03250 struct cvtScale_SIMD<short, ushort, float> 03251 { 03252 int operator () (const short * src, ushort * dst, int width, float scale, float shift) const 03253 { 03254 int x = 0; 03255 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03256 03257 for ( ; x <= width - 8; x += 8) 03258 { 03259 int16x8_t v_src = vld1q_s16(src + x); 03260 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 03261 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 03262 03263 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03264 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03265 vst1q_u16(dst + x, v_dst); 03266 } 03267 03268 return x; 03269 } 03270 }; 03271 03272 template <> 03273 struct cvtScale_SIMD<short, float, float> 03274 { 03275 int operator () (const short * src, float * dst, int width, float scale, float shift) const 03276 { 03277 int x = 0; 03278 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03279 03280 for ( ; x <= width - 8; x += 8) 03281 { 03282 int16x8_t v_src = vld1q_s16(src + x); 03283 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); 03284 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); 03285 } 03286 03287 return x; 03288 } 03289 }; 03290 03291 // from int 03292 03293 template <> 03294 struct cvtScale_SIMD<int, uchar, float> 03295 { 03296 int operator () (const int * src, uchar * dst, int width, float scale, float shift) const 03297 { 03298 int x = 0; 03299 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03300 03301 for ( ; x <= width - 8; x += 8) 03302 { 03303 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 03304 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 03305 03306 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03307 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03308 vst1_u8(dst + x, vqmovn_u16(v_dst)); 03309 } 03310 03311 return x; 03312 } 03313 }; 03314 03315 template <> 03316 struct cvtScale_SIMD<int, schar, float> 03317 { 03318 int operator () (const int * src, schar * dst, int width, float scale, float shift) const 03319 { 03320 int x = 0; 03321 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03322 03323 for ( ; x <= width - 8; x += 8) 03324 { 03325 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 03326 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 03327 03328 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03329 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03330 vst1_s8(dst + x, vqmovn_s16(v_dst)); 03331 } 03332 03333 return x; 03334 } 03335 }; 03336 03337 template <> 03338 struct cvtScale_SIMD<int, ushort, float> 03339 { 03340 int operator () (const int * src, ushort * dst, int width, float scale, float shift) const 03341 { 03342 int x = 0; 03343 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03344 03345 for ( ; x <= width - 8; x += 8) 03346 { 03347 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), 
v_shift); 03348 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 03349 03350 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03351 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03352 vst1q_u16(dst + x, v_dst); 03353 } 03354 03355 return x; 03356 } 03357 }; 03358 03359 template <> 03360 struct cvtScale_SIMD<int, short, float> 03361 { 03362 int operator () (const int * src, short * dst, int width, float scale, float shift) const 03363 { 03364 int x = 0; 03365 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03366 03367 for ( ; x <= width - 8; x += 8) 03368 { 03369 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 03370 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 03371 03372 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03373 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03374 vst1q_s16(dst + x, v_dst); 03375 } 03376 03377 return x; 03378 } 03379 }; 03380 03381 // from float 03382 03383 template <> 03384 struct cvtScale_SIMD<float, uchar, float> 03385 { 03386 int operator () (const float * src, uchar * dst, int width, float scale, float shift) const 03387 { 03388 int x = 0; 03389 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03390 03391 for ( ; x <= width - 8; x += 8) 03392 { 03393 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 03394 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 03395 03396 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03397 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03398 vst1_u8(dst + x, vqmovn_u16(v_dst)); 03399 } 03400 03401 return x; 03402 } 03403 }; 03404 03405 template <> 03406 struct cvtScale_SIMD<float, schar, float> 03407 { 03408 int operator () (const float * src, schar * dst, int width, float scale, float shift) const 03409 { 03410 int x = 0; 03411 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03412 03413 for ( ; x <= width - 8; x += 8) 03414 { 03415 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 03416 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 03417 03418 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03419 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03420 vst1_s8(dst + x, vqmovn_s16(v_dst)); 03421 } 03422 03423 return x; 03424 } 03425 }; 03426 03427 template <> 03428 struct cvtScale_SIMD<float, ushort, float> 03429 { 03430 int operator () (const float * src, ushort * dst, int width, float scale, float shift) const 03431 { 03432 int x = 0; 03433 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03434 03435 for ( ; x <= width - 8; x += 8) 03436 { 03437 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 03438 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 03439 03440 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 03441 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 03442 vst1q_u16(dst + x, v_dst); 03443 } 03444 03445 return x; 03446 } 03447 }; 03448 03449 template <> 03450 struct cvtScale_SIMD<float, short, float> 03451 { 03452 int operator () (const float * src, short * dst, int width, float scale, float shift) const 03453 { 03454 int x = 0; 03455 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03456 
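        // The CV_NEON kernels mirror the SSE versions: vld1q/vmovl widen the input,
        // vmulq_f32 + vaddq_f32 apply the scale and shift, cv_vrndq_s32_f32 and
        // cv_vrndq_u32_f32 (the rounding float-to-int helpers used throughout this
        // file) convert back, and vqmovn performs the saturating narrow before the
        // store. There is no runtime capability check here because CV_NEON is
        // selected at compile time (#elif CV_NEON above).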
03457 for ( ; x <= width - 8; x += 8) 03458 { 03459 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 03460 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 03461 03462 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 03463 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 03464 vst1q_s16(dst + x, v_dst); 03465 } 03466 03467 return x; 03468 } 03469 }; 03470 03471 template <> 03472 struct cvtScale_SIMD<float, int, float> 03473 { 03474 int operator () (const float * src, int * dst, int width, float scale, float shift) const 03475 { 03476 int x = 0; 03477 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03478 03479 for ( ; x <= width - 4; x += 4) 03480 vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); 03481 03482 return x; 03483 } 03484 }; 03485 03486 template <> 03487 struct cvtScale_SIMD<float, float, float> 03488 { 03489 int operator () (const float * src, float * dst, int width, float scale, float shift) const 03490 { 03491 int x = 0; 03492 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 03493 03494 for ( ; x <= width - 4; x += 4) 03495 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)); 03496 03497 return x; 03498 } 03499 }; 03500 03501 #endif 03502 03503 template<typename T, typename DT, typename WT> static void 03504 cvtScale_( const T* src, size_t sstep, 03505 DT* dst, size_t dstep, Size size, 03506 WT scale, WT shift ) 03507 { 03508 sstep /= sizeof(src[0]); 03509 dstep /= sizeof(dst[0]); 03510 03511 cvtScale_SIMD<T, DT, WT> vop; 03512 03513 for( ; size.height--; src += sstep, dst += dstep ) 03514 { 03515 int x = vop(src, dst, size.width, scale, shift); 03516 03517 #if CV_ENABLE_UNROLLED 03518 for( ; x <= size.width - 4; x += 4 ) 03519 { 03520 DT t0, t1; 03521 t0 = saturate_cast<DT>(src[x]*scale + shift); 03522 t1 = saturate_cast<DT>(src[x+1]*scale + shift); 03523 dst[x] = t0; dst[x+1] = t1; 03524 t0 = saturate_cast<DT>(src[x+2]*scale + shift); 03525 t1 = saturate_cast<DT>(src[x+3]*scale + shift); 03526 dst[x+2] = t0; dst[x+3] = t1; 03527 } 03528 #endif 03529 03530 for( ; x < size.width; x++ ) 03531 dst[x] = saturate_cast<DT>(src[x]*scale + shift); 03532 } 03533 } 03534 03535 //vz optimized template specialization 03536 template<> void 03537 cvtScale_<short, short, float>( const short* src, size_t sstep, 03538 short* dst, size_t dstep, Size size, 03539 float scale, float shift ) 03540 { 03541 sstep /= sizeof(src[0]); 03542 dstep /= sizeof(dst[0]); 03543 03544 for( ; size.height--; src += sstep, dst += dstep ) 03545 { 03546 int x = 0; 03547 #if CV_SSE2 03548 if(USE_SSE2) 03549 { 03550 __m128 scale128 = _mm_set1_ps (scale); 03551 __m128 shift128 = _mm_set1_ps (shift); 03552 for(; x <= size.width - 8; x += 8 ) 03553 { 03554 __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); 03555 __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); 03556 __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); 03557 __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); 03558 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); 03559 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); 03560 r0 = _mm_cvtps_epi32(rf0); 03561 r1 = _mm_cvtps_epi32(rf1); 03562 r0 = _mm_packs_epi32(r0, r1); 03563 _mm_storeu_si128((__m128i*)(dst + x), r0); 03564 } 03565 } 03566 #elif CV_NEON 03567 float32x4_t v_shift = vdupq_n_f32(shift); 03568 for(; x <= size.width - 8; x 
+= 8 ) 03569 { 03570 int16x8_t v_src = vld1q_s16(src + x); 03571 float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); 03572 float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); 03573 03574 v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); 03575 v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); 03576 03577 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)), 03578 vqmovn_s32(cv_vrndq_s32_f32(v_tmp2)))); 03579 } 03580 #endif 03581 03582 for(; x < size.width; x++ ) 03583 dst[x] = saturate_cast<short>(src[x]*scale + shift); 03584 } 03585 } 03586 03587 template<> void 03588 cvtScale_<short, int, float>( const short* src, size_t sstep, 03589 int* dst, size_t dstep, Size size, 03590 float scale, float shift ) 03591 { 03592 sstep /= sizeof(src[0]); 03593 dstep /= sizeof(dst[0]); 03594 03595 for( ; size.height--; src += sstep, dst += dstep ) 03596 { 03597 int x = 0; 03598 03599 #if CV_AVX2 03600 if (USE_AVX2) 03601 { 03602 __m256 scale256 = _mm256_set1_ps(scale); 03603 __m256 shift256 = _mm256_set1_ps(shift); 03604 const int shuffle = 0xD8; 03605 03606 for ( ; x <= size.width - 16; x += 16) 03607 { 03608 __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); 03609 v_src = _mm256_permute4x64_epi64(v_src, shuffle); 03610 __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); 03611 __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); 03612 __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); 03613 __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); 03614 _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); 03615 _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); 03616 } 03617 } 03618 #endif 03619 #if CV_SSE2 03620 if (USE_SSE2)//~5X 03621 { 03622 __m128 scale128 = _mm_set1_ps (scale); 03623 __m128 shift128 = _mm_set1_ps (shift); 03624 for(; x <= size.width - 8; x += 8 ) 03625 { 03626 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x)); 03627 03628 __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); 03629 __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16)); 03630 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); 03631 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); 03632 03633 _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0)); 03634 _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1)); 03635 } 03636 } 03637 #elif CV_NEON 03638 float32x4_t v_shift = vdupq_n_f32(shift); 03639 for(; x <= size.width - 8; x += 8 ) 03640 { 03641 int16x8_t v_src = vld1q_s16(src + x); 03642 float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); 03643 float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); 03644 03645 v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); 03646 v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); 03647 03648 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1)); 03649 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2)); 03650 } 03651 #endif 03652 03653 for(; x < size.width; x++ ) 03654 dst[x] = saturate_cast<int>(src[x]*scale + shift); 03655 } 03656 } 03657 03658 template <typename T, typename DT> 03659 struct Cvt_SIMD 03660 { 03661 int operator() (const T *, DT *, int) const 03662 { 03663 return 0; 03664 } 03665 }; 03666 03667 #if CV_SSE2 03668 03669 // from double 03670 03671 template <> 03672 struct Cvt_SIMD<double, uchar> 03673 { 03674 int 
operator() (const double * src, uchar * dst, int width) const 03675 { 03676 int x = 0; 03677 03678 if (!USE_SSE2) 03679 return x; 03680 03681 for ( ; x <= width - 8; x += 8) 03682 { 03683 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 03684 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 03685 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 03686 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 03687 03688 v_src0 = _mm_movelh_ps(v_src0, v_src1); 03689 v_src1 = _mm_movelh_ps(v_src2, v_src3); 03690 03691 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), 03692 _mm_cvtps_epi32(v_src1)); 03693 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst)); 03694 } 03695 03696 return x; 03697 } 03698 }; 03699 03700 template <> 03701 struct Cvt_SIMD<double, schar> 03702 { 03703 int operator() (const double * src, schar * dst, int width) const 03704 { 03705 int x = 0; 03706 03707 if (!USE_SSE2) 03708 return x; 03709 03710 for ( ; x <= width - 8; x += 8) 03711 { 03712 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 03713 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 03714 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 03715 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 03716 03717 v_src0 = _mm_movelh_ps(v_src0, v_src1); 03718 v_src1 = _mm_movelh_ps(v_src2, v_src3); 03719 03720 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), 03721 _mm_cvtps_epi32(v_src1)); 03722 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst)); 03723 } 03724 03725 return x; 03726 } 03727 }; 03728 03729 #if CV_SSE4_1 03730 03731 template <> 03732 struct Cvt_SIMD<double, ushort> 03733 { 03734 bool haveSIMD; 03735 Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } 03736 03737 int operator() (const double * src, ushort * dst, int width) const 03738 { 03739 int x = 0; 03740 03741 if (!haveSIMD) 03742 return x; 03743 03744 for ( ; x <= width - 8; x += 8) 03745 { 03746 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 03747 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 03748 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 03749 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 03750 03751 v_src0 = _mm_movelh_ps(v_src0, v_src1); 03752 v_src1 = _mm_movelh_ps(v_src2, v_src3); 03753 03754 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), 03755 _mm_cvtps_epi32(v_src1)); 03756 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 03757 } 03758 03759 return x; 03760 } 03761 }; 03762 03763 #endif // CV_SSE4_1 03764 03765 template <> 03766 struct Cvt_SIMD<double, short> 03767 { 03768 int operator() (const double * src, short * dst, int width) const 03769 { 03770 int x = 0; 03771 03772 if (!USE_SSE2) 03773 return x; 03774 03775 for ( ; x <= width - 8; x += 8) 03776 { 03777 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 03778 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 03779 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 03780 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 03781 03782 v_src0 = _mm_movelh_ps(v_src0, v_src1); 03783 v_src1 = _mm_movelh_ps(v_src2, v_src3); 03784 03785 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), 03786 _mm_cvtps_epi32(v_src1)); 03787 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 03788 } 03789 03790 return x; 03791 } 03792 }; 03793 03794 template <> 03795 struct Cvt_SIMD<double, int> 03796 { 03797 int operator() (const double * src, int * dst, int width) const 03798 { 03799 int x = 0; 03800 03801 if (!USE_SSE2) 03802 
return x; 03803 03804 for ( ; x <= width - 4; x += 4) 03805 { 03806 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 03807 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 03808 v_src0 = _mm_movelh_ps(v_src0, v_src1); 03809 03810 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0)); 03811 } 03812 03813 return x; 03814 } 03815 }; 03816 03817 template <> 03818 struct Cvt_SIMD<double, float> 03819 { 03820 int operator() (const double * src, float * dst, int width) const 03821 { 03822 int x = 0; 03823 03824 if (!USE_SSE2) 03825 return x; 03826 03827 for ( ; x <= width - 4; x += 4) 03828 { 03829 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 03830 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 03831 03832 _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1)); 03833 } 03834 03835 return x; 03836 } 03837 }; 03838 03839 03840 #elif CV_NEON 03841 03842 // from uchar 03843 03844 template <> 03845 struct Cvt_SIMD<uchar, schar> 03846 { 03847 int operator() (const uchar * src, schar * dst, int width) const 03848 { 03849 int x = 0; 03850 03851 for ( ; x <= width - 8; x += 8) 03852 vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))))); 03853 03854 return x; 03855 } 03856 }; 03857 03858 03859 template <> 03860 struct Cvt_SIMD<uchar, ushort> 03861 { 03862 int operator() (const uchar * src, ushort * dst, int width) const 03863 { 03864 int x = 0; 03865 03866 for ( ; x <= width - 8; x += 8) 03867 vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x))); 03868 03869 return x; 03870 } 03871 }; 03872 03873 template <> 03874 struct Cvt_SIMD<uchar, short> 03875 { 03876 int operator() (const uchar * src, short * dst, int width) const 03877 { 03878 int x = 0; 03879 03880 for ( ; x <= width - 8; x += 8) 03881 vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))); 03882 03883 return x; 03884 } 03885 }; 03886 03887 template <> 03888 struct Cvt_SIMD<uchar, int> 03889 { 03890 int operator() (const uchar * src, int * dst, int width) const 03891 { 03892 int x = 0; 03893 03894 for ( ; x <= width - 8; x += 8) 03895 { 03896 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 03897 vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); 03898 vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); 03899 } 03900 03901 return x; 03902 } 03903 }; 03904 03905 template <> 03906 struct Cvt_SIMD<uchar, float> 03907 { 03908 int operator() (const uchar * src, float * dst, int width) const 03909 { 03910 int x = 0; 03911 03912 for ( ; x <= width - 8; x += 8) 03913 { 03914 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 03915 vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); 03916 vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); 03917 } 03918 03919 return x; 03920 } 03921 }; 03922 03923 // from schar 03924 03925 template <> 03926 struct Cvt_SIMD<schar, uchar> 03927 { 03928 int operator() (const schar * src, uchar * dst, int width) const 03929 { 03930 int x = 0; 03931 03932 for ( ; x <= width - 8; x += 8) 03933 vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x)))); 03934 03935 return x; 03936 } 03937 }; 03938 03939 template <> 03940 struct Cvt_SIMD<schar, short> 03941 { 03942 int operator() (const schar * src, short * dst, int width) const 03943 { 03944 int x = 0; 03945 03946 for ( ; x <= width - 8; x += 8) 03947 vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x))); 03948 03949 return x; 03950 } 03951 }; 03952 03953 template <> 03954 struct Cvt_SIMD<schar, ushort> 03955 { 03956 int operator() 
(const schar * src, ushort * dst, int width) const 03957 { 03958 int x = 0; 03959 03960 for ( ; x <= width - 8; x += 8) 03961 { 03962 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 03963 vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))), 03964 vqmovun_s32(vmovl_s16(vget_high_s16(v_src))))); 03965 } 03966 03967 return x; 03968 } 03969 }; 03970 03971 03972 template <> 03973 struct Cvt_SIMD<schar, int> 03974 { 03975 int operator() (const schar * src, int * dst, int width) const 03976 { 03977 int x = 0; 03978 03979 for ( ; x <= width - 8; x += 8) 03980 { 03981 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 03982 vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); 03983 vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); 03984 } 03985 03986 return x; 03987 } 03988 }; 03989 03990 template <> 03991 struct Cvt_SIMD<schar, float> 03992 { 03993 int operator() (const schar * src, float * dst, int width) const 03994 { 03995 int x = 0; 03996 03997 for ( ; x <= width - 8; x += 8) 03998 { 03999 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 04000 vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); 04001 vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); 04002 } 04003 04004 return x; 04005 } 04006 }; 04007 04008 // from ushort 04009 04010 template <> 04011 struct Cvt_SIMD<ushort, uchar> 04012 { 04013 int operator() (const ushort * src, uchar * dst, int width) const 04014 { 04015 int x = 0; 04016 04017 for ( ; x <= width - 16; x += 16) 04018 { 04019 uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); 04020 vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2))); 04021 } 04022 04023 return x; 04024 } 04025 }; 04026 04027 template <> 04028 struct Cvt_SIMD<ushort, schar> 04029 { 04030 int operator() (const ushort * src, schar * dst, int width) const 04031 { 04032 int x = 0; 04033 04034 for ( ; x <= width - 16; x += 16) 04035 { 04036 uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); 04037 int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1))); 04038 int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1))); 04039 int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2))); 04040 int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2))); 04041 04042 vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))), 04043 vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21))))); 04044 } 04045 04046 return x; 04047 } 04048 }; 04049 04050 template <> 04051 struct Cvt_SIMD<ushort, short> 04052 { 04053 int operator() (const ushort * src, short * dst, int width) const 04054 { 04055 int x = 0; 04056 04057 for ( ; x <= width - 8; x += 8) 04058 { 04059 uint16x8_t v_src = vld1q_u16(src + x); 04060 int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))); 04061 int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))); 04062 04063 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); 04064 } 04065 04066 return x; 04067 } 04068 }; 04069 04070 template <> 04071 struct Cvt_SIMD<ushort, int> 04072 { 04073 int operator() (const ushort * src, int * dst, int width) const 04074 { 04075 int x = 0; 04076 04077 for ( ; x <= width - 8; x += 8) 04078 { 04079 uint16x8_t v_src = vld1q_u16(src + x); 04080 vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); 04081 vst1q_s32(dst + x + 4, 
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); 04082 } 04083 04084 return x; 04085 } 04086 }; 04087 04088 template <> 04089 struct Cvt_SIMD<ushort, float> 04090 { 04091 int operator() (const ushort * src, float * dst, int width) const 04092 { 04093 int x = 0; 04094 04095 for ( ; x <= width - 8; x += 8) 04096 { 04097 uint16x8_t v_src = vld1q_u16(src + x); 04098 vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); 04099 vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); 04100 } 04101 04102 return x; 04103 } 04104 }; 04105 04106 // from short 04107 04108 template <> 04109 struct Cvt_SIMD<short, uchar> 04110 { 04111 int operator() (const short * src, uchar * dst, int width) const 04112 { 04113 int x = 0; 04114 04115 for ( ; x <= width - 16; x += 16) 04116 { 04117 int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); 04118 vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2))); 04119 } 04120 04121 return x; 04122 } 04123 }; 04124 04125 template <> 04126 struct Cvt_SIMD<short, schar> 04127 { 04128 int operator() (const short * src, schar * dst, int width) const 04129 { 04130 int x = 0; 04131 04132 for ( ; x <= width - 16; x += 16) 04133 { 04134 int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); 04135 vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2))); 04136 } 04137 04138 return x; 04139 } 04140 }; 04141 04142 template <> 04143 struct Cvt_SIMD<short, ushort> 04144 { 04145 int operator() (const short * src, ushort * dst, int width) const 04146 { 04147 int x = 0; 04148 04149 for ( ; x <= width - 8; x += 8) 04150 { 04151 int16x8_t v_src = vld1q_s16(src + x); 04152 uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src))); 04153 uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src))); 04154 vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); 04155 } 04156 04157 return x; 04158 } 04159 }; 04160 04161 template <> 04162 struct Cvt_SIMD<short, int> 04163 { 04164 int operator() (const short * src, int * dst, int width) const 04165 { 04166 int x = 0; 04167 04168 for ( ; x <= width - 8; x += 8) 04169 { 04170 int16x8_t v_src = vld1q_s16(src + x); 04171 vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); 04172 vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); 04173 } 04174 04175 return x; 04176 } 04177 }; 04178 04179 template <> 04180 struct Cvt_SIMD<short, float> 04181 { 04182 int operator() (const short * src, float * dst, int width) const 04183 { 04184 int x = 0; 04185 04186 for ( ; x <= width - 8; x += 8) 04187 { 04188 int16x8_t v_src = vld1q_s16(src + x); 04189 vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); 04190 vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); 04191 } 04192 04193 return x; 04194 } 04195 }; 04196 04197 // from int 04198 04199 template <> 04200 struct Cvt_SIMD<int, uchar> 04201 { 04202 int operator() (const int * src, uchar * dst, int width) const 04203 { 04204 int x = 0; 04205 04206 for ( ; x <= width - 16; x += 16) 04207 { 04208 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 04209 int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); 04210 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); 04211 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4))); 04212 vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); 04213 } 04214 04215 return x; 04216 } 04217 }; 04218 04219 template <> 04220 struct 
Cvt_SIMD<int, schar> 04221 { 04222 int operator() (const int * src, schar * dst, int width) const 04223 { 04224 int x = 0; 04225 04226 for ( ; x <= width - 16; x += 16) 04227 { 04228 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 04229 int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); 04230 int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); 04231 int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); 04232 vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); 04233 } 04234 04235 return x; 04236 } 04237 }; 04238 04239 04240 template <> 04241 struct Cvt_SIMD<int, ushort> 04242 { 04243 int operator() (const int * src, ushort * dst, int width) const 04244 { 04245 int x = 0; 04246 04247 for ( ; x <= width - 8; x += 8) 04248 { 04249 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 04250 vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); 04251 } 04252 04253 return x; 04254 } 04255 }; 04256 04257 template <> 04258 struct Cvt_SIMD<int, short> 04259 { 04260 int operator() (const int * src, short * dst, int width) const 04261 { 04262 int x = 0; 04263 04264 for ( ; x <= width - 8; x += 8) 04265 { 04266 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 04267 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); 04268 } 04269 04270 return x; 04271 } 04272 }; 04273 04274 template <> 04275 struct Cvt_SIMD<int, float> 04276 { 04277 int operator() (const int * src, float * dst, int width) const 04278 { 04279 int x = 0; 04280 04281 for ( ; x <= width - 4; x += 4) 04282 vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x))); 04283 04284 return x; 04285 } 04286 }; 04287 04288 // from float 04289 04290 template <> 04291 struct Cvt_SIMD<float, uchar> 04292 { 04293 int operator() (const float * src, uchar * dst, int width) const 04294 { 04295 int x = 0; 04296 04297 for ( ; x <= width - 16; x += 16) 04298 { 04299 uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); 04300 uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); 04301 uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8)); 04302 uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12)); 04303 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); 04304 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4))); 04305 vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); 04306 } 04307 04308 return x; 04309 } 04310 }; 04311 04312 template <> 04313 struct Cvt_SIMD<float, schar> 04314 { 04315 int operator() (const float * src, schar * dst, int width) const 04316 { 04317 int x = 0; 04318 04319 for ( ; x <= width - 16; x += 16) 04320 { 04321 int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x)); 04322 int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4)); 04323 int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8)); 04324 int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12)); 04325 int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); 04326 int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); 04327 vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); 04328 } 04329 04330 return x; 04331 } 04332 }; 04333 04334 04335 template <> 04336 struct Cvt_SIMD<float, ushort> 04337 { 04338 int operator() (const float * src, ushort * dst, int width) const 04339 { 04340 int x = 0; 04341 04342 for ( ; x <= width - 8; x += 8) 04343 { 
04344 uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); 04345 uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); 04346 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); 04347 } 04348 04349 return x; 04350 } 04351 }; 04352 04353 template <> 04354 struct Cvt_SIMD<float, int> 04355 { 04356 int operator() (const float * src, int * dst, int width) const 04357 { 04358 int x = 0; 04359 04360 for ( ; x <= width - 4; x += 4) 04361 vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x))); 04362 04363 return x; 04364 } 04365 }; 04366 04367 #endif 04368 04369 template<typename T, typename DT> static void 04370 cvt_( const T* src, size_t sstep, 04371 DT* dst, size_t dstep, Size size ) 04372 { 04373 sstep /= sizeof(src[0]); 04374 dstep /= sizeof(dst[0]); 04375 Cvt_SIMD<T, DT> vop; 04376 04377 for( ; size.height--; src += sstep, dst += dstep ) 04378 { 04379 int x = vop(src, dst, size.width); 04380 #if CV_ENABLE_UNROLLED 04381 for( ; x <= size.width - 4; x += 4 ) 04382 { 04383 DT t0, t1; 04384 t0 = saturate_cast<DT>(src[x]); 04385 t1 = saturate_cast<DT>(src[x+1]); 04386 dst[x] = t0; dst[x+1] = t1; 04387 t0 = saturate_cast<DT>(src[x+2]); 04388 t1 = saturate_cast<DT>(src[x+3]); 04389 dst[x+2] = t0; dst[x+3] = t1; 04390 } 04391 #endif 04392 for( ; x < size.width; x++ ) 04393 dst[x] = saturate_cast<DT>(src[x]); 04394 } 04395 } 04396 04397 //vz optimized template specialization, test Core_ConvertScale/ElemWiseTest 04398 template<> void 04399 cvt_<float, short>( const float* src, size_t sstep, 04400 short* dst, size_t dstep, Size size ) 04401 { 04402 sstep /= sizeof(src[0]); 04403 dstep /= sizeof(dst[0]); 04404 04405 for( ; size.height--; src += sstep, dst += dstep ) 04406 { 04407 int x = 0; 04408 #if CV_SSE2 04409 if(USE_SSE2) 04410 { 04411 for( ; x <= size.width - 8; x += 8 ) 04412 { 04413 __m128 src128 = _mm_loadu_ps (src + x); 04414 __m128i src_int128 = _mm_cvtps_epi32 (src128); 04415 04416 src128 = _mm_loadu_ps (src + x + 4); 04417 __m128i src1_int128 = _mm_cvtps_epi32 (src128); 04418 04419 src1_int128 = _mm_packs_epi32(src_int128, src1_int128); 04420 _mm_storeu_si128((__m128i*)(dst + x),src1_int128); 04421 } 04422 } 04423 #elif CV_NEON 04424 for( ; x <= size.width - 8; x += 8 ) 04425 { 04426 float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4); 04427 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)), 04428 vqmovn_s32(cv_vrndq_s32_f32(v_src2))); 04429 vst1q_s16(dst + x, v_dst); 04430 } 04431 #endif 04432 for( ; x < size.width; x++ ) 04433 dst[x] = saturate_cast<short>(src[x]); 04434 } 04435 04436 } 04437 04438 04439 template<typename T> static void 04440 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) 04441 { 04442 sstep /= sizeof(src[0]); 04443 dstep /= sizeof(dst[0]); 04444 04445 for( ; size.height--; src += sstep, dst += dstep ) 04446 memcpy(dst, src, size.width*sizeof(src[0])); 04447 } 04448 04449 #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \ 04450 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04451 dtype* dst, size_t dstep, Size size, double* scale) \ 04452 { \ 04453 tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ 04454 } 04455 04456 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ 04457 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04458 dtype* dst, size_t dstep, Size size, double* scale) \ 04459 { \ 04460 cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], 
(wtype)scale[1]); \ 04461 } 04462 04463 #if defined(HAVE_IPP) 04464 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ 04465 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04466 dtype* dst, size_t dstep, Size size, double*) \ 04467 { \ 04468 CV_IPP_RUN(src && dst, ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0)\ 04469 cvt_(src, sstep, dst, dstep, size); \ 04470 } 04471 04472 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \ 04473 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04474 dtype* dst, size_t dstep, Size size, double*) \ 04475 { \ 04476 CV_IPP_RUN(src && dst, ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0)\ 04477 cvt_(src, sstep, dst, dstep, size); \ 04478 } 04479 #else 04480 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ 04481 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04482 dtype* dst, size_t dstep, Size size, double*) \ 04483 { \ 04484 cvt_(src, sstep, dst, dstep, size); \ 04485 } 04486 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F 04487 #endif 04488 04489 #define DEF_CVT_FUNC(suffix, stype, dtype) \ 04490 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04491 dtype* dst, size_t dstep, Size size, double*) \ 04492 { \ 04493 cvt_(src, sstep, dst, dstep, size); \ 04494 } 04495 04496 #define DEF_CPY_FUNC(suffix, stype) \ 04497 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 04498 stype* dst, size_t dstep, Size size, double*) \ 04499 { \ 04500 cpy_(src, sstep, dst, dstep, size); \ 04501 } 04502 04503 04504 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float) 04505 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float) 04506 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float) 04507 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float) 04508 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float) 04509 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float) 04510 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float) 04511 04512 DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) 04513 DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) 04514 DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) 04515 DEF_CVT_SCALE_FUNC(16s8u, short, uchar, float) 04516 DEF_CVT_SCALE_FUNC(32s8u, int, uchar, float) 04517 DEF_CVT_SCALE_FUNC(32f8u, float, uchar, float) 04518 DEF_CVT_SCALE_FUNC(64f8u, double, uchar, float) 04519 04520 DEF_CVT_SCALE_FUNC(8u8s, uchar, schar, float) 04521 DEF_CVT_SCALE_FUNC(8s, schar, schar, float) 04522 DEF_CVT_SCALE_FUNC(16u8s, ushort, schar, float) 04523 DEF_CVT_SCALE_FUNC(16s8s, short, schar, float) 04524 DEF_CVT_SCALE_FUNC(32s8s, int, schar, float) 04525 DEF_CVT_SCALE_FUNC(32f8s, float, schar, float) 04526 DEF_CVT_SCALE_FUNC(64f8s, double, schar, float) 04527 04528 DEF_CVT_SCALE_FUNC(8u16u, uchar, ushort, float) 04529 DEF_CVT_SCALE_FUNC(8s16u, schar, ushort, float) 04530 DEF_CVT_SCALE_FUNC(16u, ushort, ushort, float) 04531 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float) 04532 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float) 04533 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float) 04534 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float) 04535 04536 DEF_CVT_SCALE_FUNC(8u16s, uchar, short, float) 04537 DEF_CVT_SCALE_FUNC(8s16s, schar, short, float) 04538 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float) 04539 
DEF_CVT_SCALE_FUNC(16s, short, short, float) 04540 DEF_CVT_SCALE_FUNC(32s16s, int, short, float) 04541 DEF_CVT_SCALE_FUNC(32f16s, float, short, float) 04542 DEF_CVT_SCALE_FUNC(64f16s, double, short, float) 04543 04544 DEF_CVT_SCALE_FUNC(8u32s, uchar, int, float) 04545 DEF_CVT_SCALE_FUNC(8s32s, schar, int, float) 04546 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float) 04547 DEF_CVT_SCALE_FUNC(16s32s, short, int, float) 04548 DEF_CVT_SCALE_FUNC(32s, int, int, double) 04549 DEF_CVT_SCALE_FUNC(32f32s, float, int, float) 04550 DEF_CVT_SCALE_FUNC(64f32s, double, int, double) 04551 04552 DEF_CVT_SCALE_FUNC(8u32f, uchar, float, float) 04553 DEF_CVT_SCALE_FUNC(8s32f, schar, float, float) 04554 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float) 04555 DEF_CVT_SCALE_FUNC(16s32f, short, float, float) 04556 DEF_CVT_SCALE_FUNC(32s32f, int, float, double) 04557 DEF_CVT_SCALE_FUNC(32f, float, float, float) 04558 DEF_CVT_SCALE_FUNC(64f32f, double, float, double) 04559 04560 DEF_CVT_SCALE_FUNC(8u64f, uchar, double, double) 04561 DEF_CVT_SCALE_FUNC(8s64f, schar, double, double) 04562 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double) 04563 DEF_CVT_SCALE_FUNC(16s64f, short, double, double) 04564 DEF_CVT_SCALE_FUNC(32s64f, int, double, double) 04565 DEF_CVT_SCALE_FUNC(32f64f, float, double, double) 04566 DEF_CVT_SCALE_FUNC(64f, double, double, double) 04567 04568 DEF_CPY_FUNC(8u, uchar) 04569 DEF_CVT_FUNC_F(8s8u, schar, uchar, 8s8u_C1Rs) 04570 DEF_CVT_FUNC_F(16u8u, ushort, uchar, 16u8u_C1R) 04571 DEF_CVT_FUNC_F(16s8u, short, uchar, 16s8u_C1R) 04572 DEF_CVT_FUNC_F(32s8u, int, uchar, 32s8u_C1R) 04573 DEF_CVT_FUNC_F2(32f8u, float, uchar, 32f8u_C1RSfs) 04574 DEF_CVT_FUNC(64f8u, double, uchar) 04575 04576 DEF_CVT_FUNC_F2(8u8s, uchar, schar, 8u8s_C1RSfs) 04577 DEF_CVT_FUNC_F2(16u8s, ushort, schar, 16u8s_C1RSfs) 04578 DEF_CVT_FUNC_F2(16s8s, short, schar, 16s8s_C1RSfs) 04579 DEF_CVT_FUNC_F(32s8s, int, schar, 32s8s_C1R) 04580 DEF_CVT_FUNC_F2(32f8s, float, schar, 32f8s_C1RSfs) 04581 DEF_CVT_FUNC(64f8s, double, schar) 04582 04583 DEF_CVT_FUNC_F(8u16u, uchar, ushort, 8u16u_C1R) 04584 DEF_CVT_FUNC_F(8s16u, schar, ushort, 8s16u_C1Rs) 04585 DEF_CPY_FUNC(16u, ushort) 04586 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs) 04587 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs) 04588 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs) 04589 DEF_CVT_FUNC(64f16u, double, ushort) 04590 04591 DEF_CVT_FUNC_F(8u16s, uchar, short, 8u16s_C1R) 04592 DEF_CVT_FUNC_F(8s16s, schar, short, 8s16s_C1R) 04593 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs) 04594 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs) 04595 DEF_CVT_FUNC(32f16s, float, short) 04596 DEF_CVT_FUNC(64f16s, double, short) 04597 04598 DEF_CVT_FUNC_F(8u32s, uchar, int, 8u32s_C1R) 04599 DEF_CVT_FUNC_F(8s32s, schar, int, 8s32s_C1R) 04600 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R) 04601 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R) 04602 DEF_CPY_FUNC(32s, int) 04603 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs) 04604 DEF_CVT_FUNC(64f32s, double, int) 04605 04606 DEF_CVT_FUNC_F(8u32f, uchar, float, 8u32f_C1R) 04607 DEF_CVT_FUNC_F(8s32f, schar, float, 8s32f_C1R) 04608 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R) 04609 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R) 04610 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R) 04611 DEF_CVT_FUNC(64f32f, double, float) 04612 04613 DEF_CVT_FUNC(8u64f, uchar, double) 04614 DEF_CVT_FUNC(8s64f, schar, double) 04615 DEF_CVT_FUNC(16u64f, ushort, double) 04616 DEF_CVT_FUNC(16s64f, short, double) 04617 DEF_CVT_FUNC(32s64f, 
int, double) 04618 DEF_CVT_FUNC(32f64f, float, double) 04619 DEF_CPY_FUNC(64s, int64) 04620 04621 static BinaryFunc getCvtScaleAbsFunc(int depth) 04622 { 04623 static BinaryFunc cvtScaleAbsTab[] = 04624 { 04625 (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u, 04626 (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u, 04627 (BinaryFunc)cvtScaleAbs64f8u, 0 04628 }; 04629 04630 return cvtScaleAbsTab[depth]; 04631 } 04632 04633 BinaryFunc getConvertFunc(int sdepth, int ddepth) 04634 { 04635 static BinaryFunc cvtTab[][8] = 04636 { 04637 { 04638 (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u), 04639 (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u), 04640 (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 04641 }, 04642 { 04643 (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s), 04644 (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s), 04645 (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 04646 }, 04647 { 04648 (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u, 04649 (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u), 04650 (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 04651 }, 04652 { 04653 (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s), 04654 (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s), 04655 (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 04656 }, 04657 { 04658 (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s), 04659 (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s), 04660 (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 04661 }, 04662 { 04663 (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f), 04664 (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s, 04665 (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 04666 }, 04667 { 04668 (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f), 04669 (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f), 04670 (BinaryFunc)(cvt64s), 0 04671 }, 04672 { 04673 0, 0, 0, 0, 0, 0, 0, 0 04674 } 04675 }; 04676 04677 return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; 04678 } 04679 04680 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) 04681 { 04682 static BinaryFunc cvtScaleTab[][8] = 04683 { 04684 { 04685 (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), 04686 (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), 04687 (BinaryFunc)cvtScale64f8u, 0 04688 }, 04689 { 04690 (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), 04691 (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), 04692 (BinaryFunc)cvtScale64f8s, 0 04693 }, 04694 { 04695 (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), 
(BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), 04696 (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), 04697 (BinaryFunc)cvtScale64f16u, 0 04698 }, 04699 { 04700 (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), 04701 (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), 04702 (BinaryFunc)cvtScale64f16s, 0 04703 }, 04704 { 04705 (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), 04706 (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), 04707 (BinaryFunc)cvtScale64f32s, 0 04708 }, 04709 { 04710 (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), 04711 (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), 04712 (BinaryFunc)cvtScale64f32f, 0 04713 }, 04714 { 04715 (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, 04716 (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, 04717 (BinaryFunc)cvtScale64f, 0 04718 }, 04719 { 04720 0, 0, 0, 0, 0, 0, 0, 0 04721 } 04722 }; 04723 04724 return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; 04725 } 04726 04727 #ifdef HAVE_OPENCL 04728 04729 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) 04730 { 04731 const ocl::Device & d = ocl::Device::getDefault(); 04732 04733 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 04734 bool doubleSupport = d.doubleFPConfig() > 0; 04735 if (!doubleSupport && depth == CV_64F) 04736 return false; 04737 04738 _dst.create(_src.size(), CV_8UC(cn)); 04739 int kercn = 1; 04740 if (d.isIntel()) 04741 { 04742 static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1}; 04743 kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst, 04744 noArray(), noArray(), noArray(), 04745 noArray(), noArray(), noArray(), 04746 noArray(), ocl::OCL_VECTOR_MAX); 04747 } 04748 else 04749 kercn = ocl::predictOptimalVectorWidthMax(_src, _dst); 04750 04751 int rowsPerWI = d.isIntel() ? 4 : 1; 04752 char cvt[2][50]; 04753 int wdepth = std::max(depth, CV_32F); 04754 String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" 04755 " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s" 04756 " -D workT1=%s -D rowsPerWI=%d%s", 04757 ocl::typeToStr(CV_8UC(kercn)), 04758 ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), 04759 ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth, 04760 ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), 04761 ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]), 04762 ocl::typeToStr(wdepth), rowsPerWI, 04763 doubleSupport ? 
" -D DOUBLE_SUPPORT" : ""); 04764 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt); 04765 if (k.empty()) 04766 return false; 04767 04768 UMat src = _src.getUMat(); 04769 UMat dst = _dst.getUMat(); 04770 04771 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 04772 dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); 04773 04774 if (wdepth == CV_32F) 04775 k.args(srcarg, dstarg, (float)alpha, (float)beta); 04776 else if (wdepth == CV_64F) 04777 k.args(srcarg, dstarg, alpha, beta); 04778 04779 size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; 04780 return k.run(2, globalsize, NULL, false); 04781 } 04782 04783 #endif 04784 04785 } 04786 04787 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) 04788 { 04789 #ifdef HAVE_OPENCL 04790 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 04791 ocl_convertScaleAbs(_src, _dst, alpha, beta)) 04792 #endif 04793 04794 Mat src = _src.getMat(); 04795 int cn = src.channels(); 04796 double scale[] = {alpha, beta}; 04797 _dst.create( src.dims, src.size, CV_8UC(cn) ); 04798 Mat dst = _dst.getMat(); 04799 BinaryFunc func = getCvtScaleAbsFunc(src.depth()); 04800 CV_Assert( func != 0 ); 04801 04802 if( src.dims <= 2 ) 04803 { 04804 Size sz = getContinuousSize(src, dst, cn); 04805 func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale ); 04806 } 04807 else 04808 { 04809 const Mat* arrays[] = {&src, &dst, 0}; 04810 uchar* ptrs[2]; 04811 NAryMatIterator it(arrays, ptrs); 04812 Size sz((int)it.size*cn, 1); 04813 04814 for( size_t i = 0; i < it.nplanes; i++, ++it ) 04815 func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale ); 04816 } 04817 } 04818 04819 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const 04820 { 04821 bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON; 04822 04823 if( _type < 0 ) 04824 _type = _dst.fixedType() ? _dst.type() : type(); 04825 else 04826 _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels()); 04827 04828 int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type); 04829 if( sdepth == ddepth && noScale ) 04830 { 04831 copyTo(_dst); 04832 return; 04833 } 04834 04835 Mat src = *this; 04836 04837 BinaryFunc func = noScale ? 
getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth); 04838 double scale[] = {alpha, beta}; 04839 int cn = channels(); 04840 CV_Assert( func != 0 ); 04841 04842 if( dims <= 2 ) 04843 { 04844 _dst.create( size(), _type ); 04845 Mat dst = _dst.getMat(); 04846 Size sz = getContinuousSize(src, dst, cn); 04847 func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale ); 04848 } 04849 else 04850 { 04851 _dst.create( dims, size, _type ); 04852 Mat dst = _dst.getMat(); 04853 const Mat* arrays[] = {&src, &dst, 0}; 04854 uchar* ptrs[2]; 04855 NAryMatIterator it(arrays, ptrs); 04856 Size sz((int)(it.size*cn), 1); 04857 04858 for( size_t i = 0; i < it.nplanes; i++, ++it ) 04859 func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale); 04860 } 04861 } 04862 04863 /****************************************************************************************\ 04864 * LUT Transform * 04865 \****************************************************************************************/ 04866 04867 namespace cv 04868 { 04869 04870 template<typename T> static void 04871 LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn ) 04872 { 04873 if( lutcn == 1 ) 04874 { 04875 for( int i = 0; i < len*cn; i++ ) 04876 dst[i] = lut[src[i]]; 04877 } 04878 else 04879 { 04880 for( int i = 0; i < len*cn; i += cn ) 04881 for( int k = 0; k < cn; k++ ) 04882 dst[i+k] = lut[src[i+k]*cn+k]; 04883 } 04884 } 04885 04886 static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn ) 04887 { 04888 LUT8u_( src, lut, dst, len, cn, lutcn ); 04889 } 04890 04891 static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn ) 04892 { 04893 LUT8u_( src, lut, dst, len, cn, lutcn ); 04894 } 04895 04896 static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn ) 04897 { 04898 LUT8u_( src, lut, dst, len, cn, lutcn ); 04899 } 04900 04901 static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn ) 04902 { 04903 LUT8u_( src, lut, dst, len, cn, lutcn ); 04904 } 04905 04906 static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn ) 04907 { 04908 LUT8u_( src, lut, dst, len, cn, lutcn ); 04909 } 04910 04911 static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn ) 04912 { 04913 LUT8u_( src, lut, dst, len, cn, lutcn ); 04914 } 04915 04916 static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn ) 04917 { 04918 LUT8u_( src, lut, dst, len, cn, lutcn ); 04919 } 04920 04921 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn ); 04922 04923 static LUTFunc lutTab[] = 04924 { 04925 (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s, 04926 (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0 04927 }; 04928 04929 #ifdef HAVE_OPENCL 04930 04931 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst) 04932 { 04933 int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth(); 04934 04935 UMat src = _src.getUMat(), lut = _lut.getUMat(); 04936 _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn)); 04937 UMat dst = _dst.getUMat(); 04938 int kercn = lcn == 1 ? 
std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn; 04939 04940 ocl::Kernel k("LUT", ocl::core::lut_oclsrc, 04941 format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn, 04942 ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth))); 04943 if (k.empty()) 04944 return false; 04945 04946 k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut), 04947 ocl::KernelArg::WriteOnly(dst, dcn, kercn)); 04948 04949 size_t globalSize[2] = { (size_t)dst.cols * dcn / kercn, ((size_t)dst.rows + 3) / 4 }; 04950 return k.run(2, globalSize, NULL, false); 04951 } 04952 04953 #endif 04954 04955 #if defined(HAVE_IPP) 04956 namespace ipp { 04957 04958 #if IPP_DISABLE_BLOCK // there are no performance benefits (PR #2653) 04959 class IppLUTParallelBody_LUTC1 : public ParallelLoopBody 04960 { 04961 public: 04962 bool* ok; 04963 const Mat& src_; 04964 const Mat& lut_; 04965 Mat& dst_; 04966 04967 typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep, 04968 IppiSize roiSize, const void* pTable, int nBitSize); 04969 IppFn fn; 04970 04971 int width; 04972 04973 IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok) 04974 : ok(_ok), src_(src), lut_(lut), dst_(dst) 04975 { 04976 width = dst.cols * dst.channels(); 04977 04978 size_t elemSize1 = CV_ELEM_SIZE1(dst.depth()); 04979 04980 fn = 04981 elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R : 04982 elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R : 04983 NULL; 04984 04985 *ok = (fn != NULL); 04986 } 04987 04988 void operator()( const cv::Range& range ) const 04989 { 04990 if (!*ok) 04991 return; 04992 04993 const int row0 = range.start; 04994 const int row1 = range.end; 04995 04996 Mat src = src_.rowRange(row0, row1); 04997 Mat dst = dst_.rowRange(row0, row1); 04998 04999 IppiSize sz = { width, dst.rows }; 05000 05001 CV_DbgAssert(fn != NULL); 05002 if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0) 05003 { 05004 setIppErrorStatus(); 05005 *ok = false; 05006 } 05007 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 05008 } 05009 private: 05010 IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&); 05011 IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&); 05012 }; 05013 #endif 05014 05015 class IppLUTParallelBody_LUTCN : public ParallelLoopBody 05016 { 05017 public: 05018 bool *ok; 05019 const Mat& src_; 05020 const Mat& lut_; 05021 Mat& dst_; 05022 05023 int lutcn; 05024 05025 uchar* lutBuffer; 05026 uchar* lutTable[4]; 05027 05028 IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok) 05029 : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL) 05030 { 05031 lutcn = lut.channels(); 05032 IppiSize sz256 = {256, 1}; 05033 05034 size_t elemSize1 = dst.elemSize1(); 05035 CV_DbgAssert(elemSize1 == 1); 05036 lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4); 05037 lutTable[0] = lutBuffer + 0; 05038 lutTable[1] = lutBuffer + 1 * 256 * elemSize1; 05039 lutTable[2] = lutBuffer + 2 * 256 * elemSize1; 05040 lutTable[3] = lutBuffer + 3 * 256 * elemSize1; 05041 05042 CV_DbgAssert(lutcn == 3 || lutcn == 4); 05043 if (lutcn == 3) 05044 { 05045 IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256); 05046 if (status < 0) 05047 { 05048 setIppErrorStatus(); 05049 return; 05050 } 05051 CV_IMPL_ADD(CV_IMPL_IPP); 05052 } 05053 else if (lutcn == 4) 05054 { 05055 IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256); 05056 if 
(status < 0) 05057 { 05058 setIppErrorStatus(); 05059 return; 05060 } 05061 CV_IMPL_ADD(CV_IMPL_IPP); 05062 } 05063 05064 *ok = true; 05065 } 05066 05067 ~IppLUTParallelBody_LUTCN() 05068 { 05069 if (lutBuffer != NULL) 05070 ippFree(lutBuffer); 05071 lutBuffer = NULL; 05072 lutTable[0] = NULL; 05073 } 05074 05075 void operator()( const cv::Range& range ) const 05076 { 05077 if (!*ok) 05078 return; 05079 05080 const int row0 = range.start; 05081 const int row1 = range.end; 05082 05083 Mat src = src_.rowRange(row0, row1); 05084 Mat dst = dst_.rowRange(row0, row1); 05085 05086 if (lutcn == 3) 05087 { 05088 if (ippiLUTPalette_8u_C3R( 05089 src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0], 05090 ippiSize(dst.size()), lutTable, 8) >= 0) 05091 { 05092 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 05093 return; 05094 } 05095 } 05096 else if (lutcn == 4) 05097 { 05098 if (ippiLUTPalette_8u_C4R( 05099 src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0], 05100 ippiSize(dst.size()), lutTable, 8) >= 0) 05101 { 05102 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 05103 return; 05104 } 05105 } 05106 setIppErrorStatus(); 05107 *ok = false; 05108 } 05109 private: 05110 IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&); 05111 IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&); 05112 }; 05113 } // namespace ipp 05114 05115 static bool ipp_lut(Mat &src, Mat &lut, Mat &dst) 05116 { 05117 int lutcn = lut.channels(); 05118 05119 if(src.dims > 2) 05120 return false; 05121 05122 bool ok = false; 05123 Ptr<ParallelLoopBody> body; 05124 05125 size_t elemSize1 = CV_ELEM_SIZE1(dst.depth()); 05126 #if IPP_DISABLE_BLOCK // there are no performance benefits (PR #2653) 05127 if (lutcn == 1) 05128 { 05129 ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok); 05130 body.reset(p); 05131 } 05132 else 05133 #endif 05134 if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1) 05135 { 05136 ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok); 05137 body.reset(p); 05138 } 05139 05140 if (body != NULL && ok) 05141 { 05142 Range all(0, dst.rows); 05143 if (dst.total()>>18) 05144 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16)); 05145 else 05146 (*body)(all); 05147 if (ok) 05148 return true; 05149 } 05150 05151 return false; 05152 } 05153 #endif // IPP 05154 05155 class LUTParallelBody : public ParallelLoopBody 05156 { 05157 public: 05158 bool* ok; 05159 const Mat& src_; 05160 const Mat& lut_; 05161 Mat& dst_; 05162 05163 LUTFunc func; 05164 05165 LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok) 05166 : ok(_ok), src_(src), lut_(lut), dst_(dst) 05167 { 05168 func = lutTab[lut.depth()]; 05169 *ok = (func != NULL); 05170 } 05171 05172 void operator()( const cv::Range& range ) const 05173 { 05174 CV_DbgAssert(*ok); 05175 05176 const int row0 = range.start; 05177 const int row1 = range.end; 05178 05179 Mat src = src_.rowRange(row0, row1); 05180 Mat dst = dst_.rowRange(row0, row1); 05181 05182 int cn = src.channels(); 05183 int lutcn = lut_.channels(); 05184 05185 const Mat* arrays[] = {&src, &dst, 0}; 05186 uchar* ptrs[2]; 05187 NAryMatIterator it(arrays, ptrs); 05188 int len = (int)it.size; 05189 05190 for( size_t i = 0; i < it.nplanes; i++, ++it ) 05191 func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn); 05192 } 05193 private: 05194 LUTParallelBody(const LUTParallelBody&); 05195 LUTParallelBody& operator=(const LUTParallelBody&); 05196 }; 05197 05198 } 05199 05200 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst ) 
05201 { 05202 int cn = _src.channels(), depth = _src.depth(); 05203 int lutcn = _lut.channels(); 05204 05205 CV_Assert( (lutcn == cn || lutcn == 1) && 05206 _lut.total() == 256 && _lut.isContinuous() && 05207 (depth == CV_8U || depth == CV_8S) ); 05208 05209 #ifdef HAVE_OPENCL 05210 CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, 05211 ocl_LUT(_src, _lut, _dst)) 05212 #endif 05213 05214 Mat src = _src.getMat(), lut = _lut.getMat(); 05215 _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn)); 05216 Mat dst = _dst.getMat(); 05217 05218 CV_IPP_RUN(_src.dims() <= 2, ipp_lut(src, lut, dst)); 05219 05220 if (_src.dims() <= 2) 05221 { 05222 bool ok = false; 05223 Ptr<ParallelLoopBody> body; 05224 05225 if (body == NULL || ok == false) 05226 { 05227 ok = false; 05228 ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok); 05229 body.reset(p); 05230 } 05231 if (body != NULL && ok) 05232 { 05233 Range all(0, dst.rows); 05234 if (dst.total()>>18) 05235 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16)); 05236 else 05237 (*body)(all); 05238 if (ok) 05239 return; 05240 } 05241 } 05242 05243 LUTFunc func = lutTab[lut.depth()]; 05244 CV_Assert( func != 0 ); 05245 05246 const Mat* arrays[] = {&src, &dst, 0}; 05247 uchar* ptrs[2]; 05248 NAryMatIterator it(arrays, ptrs); 05249 int len = (int)it.size; 05250 05251 for( size_t i = 0; i < it.nplanes; i++, ++it ) 05252 func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn); 05253 } 05254 05255 namespace cv { 05256 05257 #ifdef HAVE_OPENCL 05258 05259 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype, 05260 double scale, double delta ) 05261 { 05262 UMat src = _src.getUMat(); 05263 05264 if( _mask.empty() ) 05265 src.convertTo( _dst, dtype, scale, delta ); 05266 else if (src.channels() <= 4) 05267 { 05268 const ocl::Device & dev = ocl::Device::getDefault(); 05269 05270 int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), 05271 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)), 05272 rowsPerWI = dev.isIntel() ? 4 : 1; 05273 05274 float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta); 05275 bool haveScale = std::fabs(scale - 1) > DBL_EPSILON, 05276 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON), 05277 haveDelta = std::fabs(delta) > DBL_EPSILON, 05278 doubleSupport = dev.doubleFPConfig() > 0; 05279 05280 if (!haveScale && !haveDelta && stype == dtype) 05281 { 05282 _src.copyTo(_dst, _mask); 05283 return true; 05284 } 05285 if (haveZeroScale) 05286 { 05287 _dst.setTo(Scalar(delta), _mask); 05288 return true; 05289 } 05290 05291 if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport) 05292 return false; 05293 05294 char cvt[2][40]; 05295 String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d" 05296 " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s", 05297 ocl::typeToStr(stype), ocl::typeToStr(dtype), 05298 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn, 05299 rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), 05300 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), 05301 doubleSupport ? " -D DOUBLE_SUPPORT" : "", 05302 haveScale ? " -D HAVE_SCALE" : "", 05303 haveDelta ? 
" -D HAVE_DELTA" : "", 05304 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth)); 05305 05306 ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts); 05307 if (k.empty()) 05308 return false; 05309 05310 UMat mask = _mask.getUMat(), dst = _dst.getUMat(); 05311 05312 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 05313 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), 05314 dstarg = ocl::KernelArg::ReadWrite(dst); 05315 05316 if (haveScale) 05317 { 05318 if (haveDelta) 05319 k.args(srcarg, maskarg, dstarg, fscale, fdelta); 05320 else 05321 k.args(srcarg, maskarg, dstarg, fscale); 05322 } 05323 else 05324 { 05325 if (haveDelta) 05326 k.args(srcarg, maskarg, dstarg, fdelta); 05327 else 05328 k.args(srcarg, maskarg, dstarg); 05329 } 05330 05331 size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; 05332 return k.run(2, globalsize, NULL, false); 05333 } 05334 else 05335 { 05336 UMat temp; 05337 src.convertTo( temp, dtype, scale, delta ); 05338 temp.copyTo( _dst, _mask ); 05339 } 05340 05341 return true; 05342 } 05343 05344 #endif 05345 05346 } 05347 05348 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b, 05349 int norm_type, int rtype, InputArray _mask ) 05350 { 05351 double scale = 1, shift = 0; 05352 if( norm_type == CV_MINMAX ) 05353 { 05354 double smin = 0, smax = 0; 05355 double dmin = MIN( a, b ), dmax = MAX( a, b ); 05356 minMaxLoc( _src, &smin, &smax, 0, 0, _mask ); 05357 scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0); 05358 shift = dmin - smin*scale; 05359 } 05360 else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C ) 05361 { 05362 scale = norm( _src, norm_type, _mask ); 05363 scale = scale > DBL_EPSILON ? a/scale : 0.; 05364 shift = 0; 05365 } 05366 else 05367 CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" ); 05368 05369 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 05370 if( rtype < 0 ) 05371 rtype = _dst.fixedType() ? 
_dst.depth() : depth; 05372 _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn)); 05373 05374 #ifdef HAVE_OPENCL 05375 CV_OCL_RUN(_dst.isUMat(), 05376 ocl_normalize(_src, _dst, _mask, rtype, scale, shift)) 05377 #endif 05378 05379 Mat src = _src.getMat(), dst = _dst.getMat(); 05380 if( _mask.empty() ) 05381 src.convertTo( dst, rtype, scale, shift ); 05382 else 05383 { 05384 Mat temp; 05385 src.convertTo( temp, rtype, scale, shift ); 05386 temp.copyTo( dst, _mask ); 05387 } 05388 } 05389 05390 CV_IMPL void 05391 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 ) 05392 { 05393 void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 }; 05394 cv::Mat src = cv::cvarrToMat(srcarr); 05395 int i, j, nz = 0; 05396 for( i = 0; i < 4; i++ ) 05397 nz += dptrs[i] != 0; 05398 CV_Assert( nz > 0 ); 05399 std::vector<cv::Mat> dvec(nz); 05400 std::vector<int> pairs(nz*2); 05401 05402 for( i = j = 0; i < 4; i++ ) 05403 { 05404 if( dptrs[i] != 0 ) 05405 { 05406 dvec[j] = cv::cvarrToMat(dptrs[i]); 05407 CV_Assert( dvec[j].size() == src.size() ); 05408 CV_Assert( dvec[j].depth() == src.depth() ); 05409 CV_Assert( dvec[j].channels() == 1 ); 05410 CV_Assert( i < src.channels() ); 05411 pairs[j*2] = i; 05412 pairs[j*2+1] = j; 05413 j++; 05414 } 05415 } 05416 if( nz == src.channels() ) 05417 cv::split( src, dvec ); 05418 else 05419 { 05420 cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz ); 05421 } 05422 } 05423 05424 05425 CV_IMPL void 05426 cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2, 05427 const void* srcarr3, void* dstarr ) 05428 { 05429 const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 }; 05430 cv::Mat dst = cv::cvarrToMat(dstarr); 05431 int i, j, nz = 0; 05432 for( i = 0; i < 4; i++ ) 05433 nz += sptrs[i] != 0; 05434 CV_Assert( nz > 0 ); 05435 std::vector<cv::Mat> svec(nz); 05436 std::vector<int> pairs(nz*2); 05437 05438 for( i = j = 0; i < 4; i++ ) 05439 { 05440 if( sptrs[i] != 0 ) 05441 { 05442 svec[j] = cv::cvarrToMat(sptrs[i]); 05443 CV_Assert( svec[j].size == dst.size && 05444 svec[j].depth() == dst.depth() && 05445 svec[j].channels() == 1 && i < dst.channels() ); 05446 pairs[j*2] = j; 05447 pairs[j*2+1] = i; 05448 j++; 05449 } 05450 } 05451 05452 if( nz == dst.channels() ) 05453 cv::merge( svec, dst ); 05454 else 05455 { 05456 cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz ); 05457 } 05458 } 05459 05460 05461 CV_IMPL void 05462 cvMixChannels( const CvArr** src, int src_count, 05463 CvArr** dst, int dst_count, 05464 const int* from_to, int pair_count ) 05465 { 05466 cv::AutoBuffer<cv::Mat> buf(src_count + dst_count); 05467 05468 int i; 05469 for( i = 0; i < src_count; i++ ) 05470 buf[i] = cv::cvarrToMat(src[i]); 05471 for( i = 0; i < dst_count; i++ ) 05472 buf[i+src_count] = cv::cvarrToMat(dst[i]); 05473 cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count); 05474 } 05475 05476 CV_IMPL void 05477 cvConvertScaleAbs( const void* srcarr, void* dstarr, 05478 double scale, double shift ) 05479 { 05480 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 05481 CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels())); 05482 cv::convertScaleAbs( src, dst, scale, shift ); 05483 } 05484 05485 CV_IMPL void 05486 cvConvertScale( const void* srcarr, void* dstarr, 05487 double scale, double shift ) 05488 { 05489 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 05490 05491 CV_Assert( src.size == dst.size && src.channels() == dst.channels() ); 05492 
src.convertTo(dst, dst.type(), scale, shift); 05493 } 05494 05495 CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr ) 05496 { 05497 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr); 05498 05499 CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) ); 05500 cv::LUT( src, lut, dst ); 05501 } 05502 05503 CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr, 05504 double a, double b, int norm_type, const CvArr* maskarr ) 05505 { 05506 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask; 05507 if( maskarr ) 05508 mask = cv::cvarrToMat(maskarr); 05509 CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() ); 05510 cv::normalize( src, dst, a, b, norm_type, dst.type(), mask ); 05511 } 05512 05513 /* End of file. */ 05514
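The listing ends with the legacy C API wrappers. As a quick orientation aid (not part of convert.cpp), the sketch below shows how the public entry points implemented in this file are typically called from user code. It is a minimal, illustrative example only: it assumes the standard OpenCV core header is available, and the helper name convert_examples and the data/table values are arbitrary.

#include <opencv2/core.hpp>

// Illustrative usage only; not part of the original file. Exercises the
// entry points implemented above: Mat::convertTo, convertScaleAbs, LUT
// and normalize.
static void convert_examples()
{
    cv::Mat src(4, 4, CV_32FC1);
    cv::randu(src, cv::Scalar::all(0.0), cv::Scalar::all(1.0)); // arbitrary test data

    // Mat::convertTo dispatches to the cvt_/cvtScale_ kernels above:
    // dst = saturate_cast<uchar>(src*255 + 0)
    cv::Mat dst8u;
    src.convertTo(dst8u, CV_8U, 255.0, 0.0);

    // convertScaleAbs dispatches to the cvtScaleAbs_ kernels:
    // dst = saturate_cast<uchar>(|src*alpha + beta|), always 8-bit output
    cv::Mat absScaled;
    cv::convertScaleAbs(src, absScaled, 255.0, 0.0);

    // LUT expects a continuous 256-entry table; here a simple inversion,
    // applied per element by the LUT8u_ kernels.
    cv::Mat lut(1, 256, CV_8U);
    for (int i = 0; i < 256; i++)
        lut.at<uchar>(i) = (uchar)(255 - i);
    cv::Mat inverted;
    cv::LUT(dst8u, lut, inverted);

    // normalize with NORM_MINMAX reduces to a convertTo with a scale and
    // shift computed from the source min/max (see cv::normalize above).
    cv::Mat norm01;
    cv::normalize(src, norm01, 0.0, 1.0, cv::NORM_MINMAX, CV_32F);
}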
