Fork of gr-peach-opencv-project-sd-card by
arithm_simd.hpp
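The listing below is the OpenCV core header arithm_simd.hpp bundled with this project. It builds per-type SIMD helpers in layers: register load/store functors (VLoadStore64/128/256), element-wise arithmetic functors (VAdd, VSub, VMin, VMax, VAbsDiff and the bitwise ops), and row-level helpers (Cmp_SIMD, Mul_SIMD, Div_SIMD, Recip_SIMD, AddWeighted_SIMD) that process as many elements as the available instruction set allows and return the index at which they stopped. A minimal sketch of how the low-level functors are meant to compose on a 128-bit path follows; the wrapper vBinaryOpSketch is an illustrative assumption, not part of the header.

#if CV_SSE2 || CV_NEON
// Illustrative only: combine a load/store functor with an arithmetic functor,
// processing whole 128-bit registers and returning how many elements were done.
template <typename T, template<typename> class VOp>
static int vBinaryOpSketch(const T* src1, const T* src2, T* dst, int width)
{
    typedef VLoadStore128<T> VLS;
    VOp<T> op;
    const int step = (int)(16 / sizeof(T));   // elements per 128-bit register
    int x = 0;
    for( ; x <= width - step; x += step )
        VLS::store(dst + x, op(VLS::load(src1 + x), VLS::load(src2 + x)));
    return x;                                 // caller handles the scalar tail [x, width)
}
#endif

For example, vBinaryOpSketch<uchar, VAdd>(a, b, c, n) performs a saturating 8-bit add over the first n - (n % 16) elements; the remainder is left to the caller's scalar loop.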
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_ARITHM_SIMD_HPP__
#define __OPENCV_ARITHM_SIMD_HPP__

namespace cv {

struct NOP {};

#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif


#if CV_SSE2 || CV_NEON

#define FUNCTOR_TEMPLATE(name) \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
#endif

#endif

#if CV_AVX2

#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                          \
    struct name<template_arg>{                                                           \
        typedef register_type reg_type;                                                  \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }      \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }      \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)              \
    template<>                                                      \
    struct name<template_arg>                                       \
    {                                                               \
        VLoadStore256<template_arg>::reg_type operator()(           \
            const VLoadStore256<template_arg>::reg_type & a,        \
            const VLoadStore256<template_arg>::reg_type & b) const  \
        {                                                           \
            body;                                                   \
        }                                                           \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)              \
    template<>                                                      \
    struct name<template_arg>                                       \
    {                                                               \
        VLoadStore256<template_arg>::reg_type operator()(           \
            const VLoadStore256<template_arg>::reg_type & a,        \
            const VLoadStore256<template_arg>::reg_type & ) const   \
        {                                                           \
            body;                                                   \
        }                                                           \
    }

FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE(     VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps );
FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd );

FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256);
FUNCTOR_LOADSTORE(     VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps );
FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b));
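// Illustrative expansion (not part of the original source): the invocation
// FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8(a, b)) above
// produces, roughly:
//
//     template<> struct VAdd<uchar>
//     {
//         VLoadStore256<uchar>::reg_type operator()(
//             const VLoadStore256<uchar>::reg_type & a,
//             const VLoadStore256<uchar>::reg_type & b) const
//         { return _mm256_adds_epu8(a, b); }
//     };
//
// i.e. a stateless functor over whole 256-bit registers. Saturating adds
// (_mm256_adds_*) are used for the 8/16-bit types; the 32/64-bit types that
// follow use plain wrapping/IEEE adds.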
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); 00136 FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); 00137 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); 00138 00139 FUNCTOR_TEMPLATE(VSub); 00140 FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); 00141 FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); 00142 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); 00143 FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); 00144 FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); 00145 FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); 00146 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); 00147 00148 FUNCTOR_TEMPLATE(VMin); 00149 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); 00150 FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); 00151 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); 00152 FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); 00153 FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); 00154 FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); 00155 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); 00156 00157 FUNCTOR_TEMPLATE(VMax); 00158 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); 00159 FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); 00160 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); 00161 FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); 00162 FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); 00163 FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); 00164 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); 00165 00166 00167 static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 00168 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; 00169 static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, 00170 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; 00171 00172 FUNCTOR_TEMPLATE(VAbsDiff); 00173 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, 00174 return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); 00175 ); 00176 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, 00177 __m256i d = _mm256_subs_epi8(a, b); 00178 __m256i m = _mm256_cmpgt_epi8(b, a); 00179 return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); 00180 ); 00181 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, 00182 return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); 00183 ); 00184 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, 00185 __m256i M = _mm256_max_epi16(a, b); 00186 __m256i m = _mm256_min_epi16(a, b); 00187 return _mm256_subs_epi16(M, m); 00188 ); 00189 FUNCTOR_CLOSURE_2arg(VAbsDiff, int, 00190 __m256i d = _mm256_sub_epi32(a, b); 00191 __m256i m = _mm256_cmpgt_epi32(b, a); 00192 return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); 00193 ); 00194 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, 00195 return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); 00196 ); 00197 FUNCTOR_CLOSURE_2arg(VAbsDiff, double, 00198 return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); 00199 ); 00200 00201 FUNCTOR_TEMPLATE(VAnd); 00202 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); 00203 FUNCTOR_TEMPLATE(VOr); 00204 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); 00205 FUNCTOR_TEMPLATE(VXor); 00206 
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); 00207 FUNCTOR_TEMPLATE(VNot); 00208 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); 00209 00210 #elif CV_SSE2 00211 00212 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ 00213 template <> \ 00214 struct name<template_arg>{ \ 00215 typedef register_type reg_type; \ 00216 static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ 00217 static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ 00218 } 00219 00220 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ 00221 template <> \ 00222 struct name<template_arg>{ \ 00223 typedef register_type reg_type; \ 00224 static reg_type load(const template_arg * p) { return load_body (p); } \ 00225 static void store(template_arg * p, reg_type v) { store_body (p, v); } \ 00226 } 00227 00228 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ 00229 template<> \ 00230 struct name<template_arg> \ 00231 { \ 00232 VLoadStore128<template_arg>::reg_type operator()( \ 00233 const VLoadStore128<template_arg>::reg_type & a, \ 00234 const VLoadStore128<template_arg>::reg_type & b) const \ 00235 { \ 00236 body; \ 00237 } \ 00238 } 00239 00240 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ 00241 template<> \ 00242 struct name<template_arg> \ 00243 { \ 00244 VLoadStore128<template_arg>::reg_type operator()( \ 00245 const VLoadStore128<template_arg>::reg_type & a, \ 00246 const VLoadStore128<template_arg>::reg_type & ) const \ 00247 { \ 00248 body; \ 00249 } \ 00250 } 00251 00252 FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); 00253 FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); 00254 FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); 00255 FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); 00256 FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); 00257 FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); 00258 FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); 00259 00260 FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 00261 FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 00262 FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 00263 FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 00264 00265 FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); 00266 FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); 00267 FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); 00268 00269 FUNCTOR_TEMPLATE(VAdd); 00270 FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); 00271 FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); 00272 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); 00273 FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); 00274 FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); 00275 FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); 00276 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); 00277 00278 
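// Commentary (added, not in the original source): plain SSE2 has no
// _mm_min_epi8/_mm_max_epi8 or _mm_min_epi32/_mm_max_epi32 (those arrive with
// SSE4.1), so the VMin/VMax specializations a few lines below emulate them:
// with m = _mm_cmpgt_epi{8,32}(a, b), the identity min(a, b) = a ^ ((a ^ b) & m)
// selects the smaller operand, and the unsigned 16-bit variants are built from
// saturating subtract/add (_mm_subs_epu16 / _mm_adds_epu16).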
FUNCTOR_TEMPLATE(VSub); 00279 FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); 00280 FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); 00281 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); 00282 FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); 00283 FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); 00284 FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); 00285 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); 00286 00287 FUNCTOR_TEMPLATE(VMin); 00288 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); 00289 FUNCTOR_CLOSURE_2arg(VMin, schar, 00290 __m128i m = _mm_cmpgt_epi8(a, b); 00291 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 00292 ); 00293 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); 00294 FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); 00295 FUNCTOR_CLOSURE_2arg(VMin, int, 00296 __m128i m = _mm_cmpgt_epi32(a, b); 00297 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 00298 ); 00299 FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); 00300 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); 00301 00302 FUNCTOR_TEMPLATE(VMax); 00303 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); 00304 FUNCTOR_CLOSURE_2arg(VMax, schar, 00305 __m128i m = _mm_cmpgt_epi8(b, a); 00306 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 00307 ); 00308 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); 00309 FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); 00310 FUNCTOR_CLOSURE_2arg(VMax, int, 00311 __m128i m = _mm_cmpgt_epi32(b, a); 00312 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 00313 ); 00314 FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); 00315 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); 00316 00317 00318 static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; 00319 static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; 00320 00321 FUNCTOR_TEMPLATE(VAbsDiff); 00322 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, 00323 return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); 00324 ); 00325 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, 00326 __m128i d = _mm_subs_epi8(a, b); 00327 __m128i m = _mm_cmpgt_epi8(b, a); 00328 return _mm_subs_epi8(_mm_xor_si128(d, m), m); 00329 ); 00330 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, 00331 return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); 00332 ); 00333 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, 00334 __m128i M = _mm_max_epi16(a, b); 00335 __m128i m = _mm_min_epi16(a, b); 00336 return _mm_subs_epi16(M, m); 00337 ); 00338 FUNCTOR_CLOSURE_2arg(VAbsDiff, int, 00339 __m128i d = _mm_sub_epi32(a, b); 00340 __m128i m = _mm_cmpgt_epi32(b, a); 00341 return _mm_sub_epi32(_mm_xor_si128(d, m), m); 00342 ); 00343 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, 00344 return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); 00345 ); 00346 FUNCTOR_CLOSURE_2arg(VAbsDiff, double, 00347 return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); 00348 ); 00349 00350 FUNCTOR_TEMPLATE(VAnd); 00351 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); 00352 FUNCTOR_TEMPLATE(VOr); 00353 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); 00354 FUNCTOR_TEMPLATE(VXor); 00355 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); 
00356 FUNCTOR_TEMPLATE(VNot); 00357 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); 00358 #endif 00359 00360 #if CV_NEON 00361 00362 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ 00363 template <> \ 00364 struct name<template_arg>{ \ 00365 typedef register_type reg_type; \ 00366 static reg_type load(const template_arg * p) { return load_body (p);}; \ 00367 static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ 00368 } 00369 00370 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ 00371 template<> \ 00372 struct name<template_arg> \ 00373 { \ 00374 VLoadStore128<template_arg>::reg_type operator()( \ 00375 VLoadStore128<template_arg>::reg_type a, \ 00376 VLoadStore128<template_arg>::reg_type b) const \ 00377 { \ 00378 return body; \ 00379 }; \ 00380 } 00381 00382 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ 00383 template<> \ 00384 struct name<template_arg> \ 00385 { \ 00386 VLoadStore128<template_arg>::reg_type operator()( \ 00387 VLoadStore128<template_arg>::reg_type a, \ 00388 VLoadStore128<template_arg>::reg_type ) const \ 00389 { \ 00390 return body; \ 00391 }; \ 00392 } 00393 00394 FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); 00395 FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); 00396 FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); 00397 FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); 00398 FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); 00399 FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); 00400 00401 FUNCTOR_TEMPLATE(VAdd); 00402 FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); 00403 FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); 00404 FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); 00405 FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); 00406 FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); 00407 FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); 00408 00409 FUNCTOR_TEMPLATE(VSub); 00410 FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); 00411 FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); 00412 FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); 00413 FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); 00414 FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); 00415 FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); 00416 00417 FUNCTOR_TEMPLATE(VMin); 00418 FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); 00419 FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); 00420 FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); 00421 FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); 00422 FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); 00423 FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); 00424 00425 FUNCTOR_TEMPLATE(VMax); 00426 FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); 00427 FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); 00428 FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); 00429 FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); 00430 FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); 00431 FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); 00432 00433 FUNCTOR_TEMPLATE(VAbsDiff); 00434 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); 00435 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); 00436 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); 00437 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); 00438 
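// Commentary (added): for the signed 8/16-bit VAbsDiff cases above,
// vqabs(vqsub(a, b)) saturates rather than wraps, so e.g. |127 - (-128)|
// yields 127. This matches the saturate_cast<> behaviour of OpenCV's scalar
// absdiff path for schar/short.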
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); 00439 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); 00440 00441 FUNCTOR_TEMPLATE(VAnd); 00442 FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); 00443 FUNCTOR_TEMPLATE(VOr); 00444 FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); 00445 FUNCTOR_TEMPLATE(VXor); 00446 FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); 00447 FUNCTOR_TEMPLATE(VNot); 00448 FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); 00449 #endif 00450 00451 00452 template <typename T> 00453 struct Cmp_SIMD 00454 { 00455 explicit Cmp_SIMD(int) 00456 { 00457 } 00458 00459 int operator () (const T *, const T *, uchar *, int) const 00460 { 00461 return 0; 00462 } 00463 }; 00464 00465 #if CV_NEON 00466 00467 template <> 00468 struct Cmp_SIMD<schar> 00469 { 00470 explicit Cmp_SIMD(int code_) : 00471 code(code_) 00472 { 00473 // CV_Assert(code == CMP_GT || code == CMP_LE || 00474 // code == CMP_EQ || code == CMP_NE); 00475 00476 v_mask = vdupq_n_u8(255); 00477 } 00478 00479 int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const 00480 { 00481 int x = 0; 00482 00483 if (code == CMP_GT) 00484 for ( ; x <= width - 16; x += 16) 00485 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); 00486 else if (code == CMP_LE) 00487 for ( ; x <= width - 16; x += 16) 00488 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); 00489 else if (code == CMP_EQ) 00490 for ( ; x <= width - 16; x += 16) 00491 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); 00492 else if (code == CMP_NE) 00493 for ( ; x <= width - 16; x += 16) 00494 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); 00495 00496 return x; 00497 } 00498 00499 int code; 00500 uint8x16_t v_mask; 00501 }; 00502 00503 template <> 00504 struct Cmp_SIMD<ushort> 00505 { 00506 explicit Cmp_SIMD(int code_) : 00507 code(code_) 00508 { 00509 // CV_Assert(code == CMP_GT || code == CMP_LE || 00510 // code == CMP_EQ || code == CMP_NE); 00511 00512 v_mask = vdup_n_u8(255); 00513 } 00514 00515 int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const 00516 { 00517 int x = 0; 00518 00519 if (code == CMP_GT) 00520 for ( ; x <= width - 8; x += 8) 00521 { 00522 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 00523 vst1_u8(dst + x, vmovn_u16(v_dst)); 00524 } 00525 else if (code == CMP_LE) 00526 for ( ; x <= width - 8; x += 8) 00527 { 00528 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 00529 vst1_u8(dst + x, vmovn_u16(v_dst)); 00530 } 00531 else if (code == CMP_EQ) 00532 for ( ; x <= width - 8; x += 8) 00533 { 00534 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 00535 vst1_u8(dst + x, vmovn_u16(v_dst)); 00536 } 00537 else if (code == CMP_NE) 00538 for ( ; x <= width - 8; x += 8) 00539 { 00540 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 00541 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); 00542 } 00543 00544 return x; 00545 } 00546 00547 int code; 00548 uint8x8_t v_mask; 00549 }; 00550 00551 template <> 00552 struct Cmp_SIMD<int> 00553 { 00554 explicit Cmp_SIMD(int code_) : 00555 code(code_) 00556 { 00557 // CV_Assert(code == CMP_GT || code == CMP_LE || 00558 // code == CMP_EQ || code == CMP_NE); 00559 00560 v_mask = vdup_n_u8(255); 00561 } 00562 00563 int operator () (const int * src1, const int * src2, uchar * dst, int width) const 00564 { 00565 int x = 0; 00566 00567 if (code == CMP_GT) 
00568 for ( ; x <= width - 8; x += 8) 00569 { 00570 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 00571 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 00572 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 00573 } 00574 else if (code == CMP_LE) 00575 for ( ; x <= width - 8; x += 8) 00576 { 00577 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 00578 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 00579 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 00580 } 00581 else if (code == CMP_EQ) 00582 for ( ; x <= width - 8; x += 8) 00583 { 00584 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 00585 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 00586 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 00587 } 00588 else if (code == CMP_NE) 00589 for ( ; x <= width - 8; x += 8) 00590 { 00591 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 00592 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 00593 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); 00594 vst1_u8(dst + x, veor_u8(v_dst, v_mask)); 00595 } 00596 00597 return x; 00598 } 00599 00600 int code; 00601 uint8x8_t v_mask; 00602 }; 00603 00604 template <> 00605 struct Cmp_SIMD<float> 00606 { 00607 explicit Cmp_SIMD(int code_) : 00608 code(code_) 00609 { 00610 // CV_Assert(code == CMP_GT || code == CMP_LE || 00611 // code == CMP_EQ || code == CMP_NE); 00612 00613 v_mask = vdup_n_u8(255); 00614 } 00615 00616 int operator () (const float * src1, const float * src2, uchar * dst, int width) const 00617 { 00618 int x = 0; 00619 00620 if (code == CMP_GT) 00621 for ( ; x <= width - 8; x += 8) 00622 { 00623 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 00624 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 00625 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 00626 } 00627 else if (code == CMP_LE) 00628 for ( ; x <= width - 8; x += 8) 00629 { 00630 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 00631 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 00632 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 00633 } 00634 else if (code == CMP_EQ) 00635 for ( ; x <= width - 8; x += 8) 00636 { 00637 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 00638 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 00639 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 00640 } 00641 else if (code == CMP_NE) 00642 for ( ; x <= width - 8; x += 8) 00643 { 00644 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 00645 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 00646 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); 00647 vst1_u8(dst + x, veor_u8(v_dst, v_mask)); 00648 } 00649 00650 return x; 00651 } 00652 00653 int code; 00654 uint8x8_t v_mask; 00655 }; 00656 00657 #elif CV_SSE2 00658 00659 template <> 00660 struct Cmp_SIMD<schar> 00661 { 00662 explicit Cmp_SIMD(int code_) : 00663 code(code_) 00664 { 00665 // CV_Assert(code == CMP_GT || code == CMP_LE || 00666 // code == 
CMP_EQ || code == CMP_NE); 00667 00668 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 00669 00670 v_mask = _mm_set1_epi8(-1); 00671 } 00672 00673 int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const 00674 { 00675 int x = 0; 00676 00677 if (!haveSSE) 00678 return x; 00679 00680 if (code == CMP_GT) 00681 for ( ; x <= width - 16; x += 16) 00682 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 00683 _mm_loadu_si128((const __m128i *)(src2 + x)))); 00684 else if (code == CMP_LE) 00685 for ( ; x <= width - 16; x += 16) 00686 { 00687 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 00688 _mm_loadu_si128((const __m128i *)(src2 + x))); 00689 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); 00690 } 00691 else if (code == CMP_EQ) 00692 for ( ; x <= width - 16; x += 16) 00693 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 00694 _mm_loadu_si128((const __m128i *)(src2 + x)))); 00695 else if (code == CMP_NE) 00696 for ( ; x <= width - 16; x += 16) 00697 { 00698 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 00699 _mm_loadu_si128((const __m128i *)(src2 + x))); 00700 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); 00701 } 00702 00703 return x; 00704 } 00705 00706 int code; 00707 __m128i v_mask; 00708 bool haveSSE; 00709 }; 00710 00711 template <> 00712 struct Cmp_SIMD<int> 00713 { 00714 explicit Cmp_SIMD(int code_) : 00715 code(code_) 00716 { 00717 // CV_Assert(code == CMP_GT || code == CMP_LE || 00718 // code == CMP_EQ || code == CMP_NE); 00719 00720 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 00721 00722 v_mask = _mm_set1_epi32(0xffffffff); 00723 } 00724 00725 int operator () (const int * src1, const int * src2, uchar * dst, int width) const 00726 { 00727 int x = 0; 00728 00729 if (!haveSSE) 00730 return x; 00731 00732 if (code == CMP_GT) 00733 for ( ; x <= width - 8; x += 8) 00734 { 00735 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 00736 _mm_loadu_si128((const __m128i *)(src2 + x))); 00737 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 00738 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 00739 00740 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); 00741 } 00742 else if (code == CMP_LE) 00743 for ( ; x <= width - 8; x += 8) 00744 { 00745 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 00746 _mm_loadu_si128((const __m128i *)(src2 + x))); 00747 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 00748 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 00749 00750 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); 00751 } 00752 else if (code == CMP_EQ) 00753 for ( ; x <= width - 8; x += 8) 00754 { 00755 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 00756 _mm_loadu_si128((const __m128i *)(src2 + x))); 00757 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 00758 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 00759 00760 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); 00761 } 00762 else if (code == CMP_NE) 00763 for ( ; x <= width - 8; x += 8) 00764 { 00765 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const 
__m128i *)(src1 + x)), 00766 _mm_loadu_si128((const __m128i *)(src2 + x))); 00767 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 00768 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 00769 00770 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); 00771 } 00772 00773 return x; 00774 } 00775 00776 int code; 00777 __m128i v_mask; 00778 bool haveSSE; 00779 }; 00780 00781 #endif 00782 00783 00784 template <typename T, typename WT> 00785 struct Mul_SIMD 00786 { 00787 int operator() (const T *, const T *, T *, int, WT) const 00788 { 00789 return 0; 00790 } 00791 }; 00792 00793 #if CV_NEON 00794 00795 template <> 00796 struct Mul_SIMD<uchar, float> 00797 { 00798 int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const 00799 { 00800 int x = 0; 00801 00802 if( scale == 1.0f ) 00803 for ( ; x <= width - 8; x += 8) 00804 { 00805 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); 00806 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); 00807 00808 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 00809 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 00810 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 00811 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 00812 00813 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 00814 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 00815 vst1_u8(dst + x, vqmovn_u16(v_dst)); 00816 } 00817 else 00818 { 00819 float32x4_t v_scale = vdupq_n_f32(scale); 00820 for ( ; x <= width - 8; x += 8) 00821 { 00822 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); 00823 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); 00824 00825 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 00826 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 00827 v_dst1 = vmulq_f32(v_dst1, v_scale); 00828 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 00829 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 00830 v_dst2 = vmulq_f32(v_dst2, v_scale); 00831 00832 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 00833 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 00834 vst1_u8(dst + x, vqmovn_u16(v_dst)); 00835 } 00836 } 00837 00838 return x; 00839 } 00840 }; 00841 00842 template <> 00843 struct Mul_SIMD<schar, float> 00844 { 00845 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const 00846 { 00847 int x = 0; 00848 00849 if( scale == 1.0f ) 00850 for ( ; x <= width - 8; x += 8) 00851 { 00852 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); 00853 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); 00854 00855 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 00856 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 00857 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 00858 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 00859 00860 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 00861 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 00862 vst1_s8(dst + x, vqmovn_s16(v_dst)); 00863 } 00864 else 00865 { 00866 float32x4_t v_scale = vdupq_n_f32(scale); 00867 for ( ; x <= width - 8; x += 8) 00868 { 00869 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); 00870 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); 00871 00872 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 00873 
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 00874 v_dst1 = vmulq_f32(v_dst1, v_scale); 00875 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 00876 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 00877 v_dst2 = vmulq_f32(v_dst2, v_scale); 00878 00879 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 00880 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 00881 vst1_s8(dst + x, vqmovn_s16(v_dst)); 00882 } 00883 } 00884 00885 return x; 00886 } 00887 }; 00888 00889 template <> 00890 struct Mul_SIMD<ushort, float> 00891 { 00892 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const 00893 { 00894 int x = 0; 00895 00896 if( scale == 1.0f ) 00897 for ( ; x <= width - 8; x += 8) 00898 { 00899 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); 00900 00901 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 00902 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 00903 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 00904 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 00905 00906 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 00907 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 00908 vst1q_u16(dst + x, v_dst); 00909 } 00910 else 00911 { 00912 float32x4_t v_scale = vdupq_n_f32(scale); 00913 for ( ; x <= width - 8; x += 8) 00914 { 00915 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); 00916 00917 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 00918 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 00919 v_dst1 = vmulq_f32(v_dst1, v_scale); 00920 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 00921 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 00922 v_dst2 = vmulq_f32(v_dst2, v_scale); 00923 00924 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 00925 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 00926 vst1q_u16(dst + x, v_dst); 00927 } 00928 } 00929 00930 return x; 00931 } 00932 }; 00933 00934 template <> 00935 struct Mul_SIMD<short, float> 00936 { 00937 int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const 00938 { 00939 int x = 0; 00940 00941 if( scale == 1.0f ) 00942 for ( ; x <= width - 8; x += 8) 00943 { 00944 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); 00945 00946 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 00947 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 00948 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 00949 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 00950 00951 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 00952 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 00953 vst1q_s16(dst + x, v_dst); 00954 } 00955 else 00956 { 00957 float32x4_t v_scale = vdupq_n_f32(scale); 00958 for ( ; x <= width - 8; x += 8) 00959 { 00960 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); 00961 00962 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 00963 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 00964 v_dst1 = vmulq_f32(v_dst1, v_scale); 00965 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 00966 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 00967 v_dst2 = vmulq_f32(v_dst2, v_scale); 00968 00969 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 00970 
vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 00971 vst1q_s16(dst + x, v_dst); 00972 } 00973 } 00974 00975 return x; 00976 } 00977 }; 00978 00979 template <> 00980 struct Mul_SIMD<float, float> 00981 { 00982 int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const 00983 { 00984 int x = 0; 00985 00986 if( scale == 1.0f ) 00987 for ( ; x <= width - 8; x += 8) 00988 { 00989 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 00990 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 00991 vst1q_f32(dst + x, v_dst1); 00992 vst1q_f32(dst + x + 4, v_dst2); 00993 } 00994 else 00995 { 00996 float32x4_t v_scale = vdupq_n_f32(scale); 00997 for ( ; x <= width - 8; x += 8) 00998 { 00999 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 01000 v_dst1 = vmulq_f32(v_dst1, v_scale); 01001 01002 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 01003 v_dst2 = vmulq_f32(v_dst2, v_scale); 01004 01005 vst1q_f32(dst + x, v_dst1); 01006 vst1q_f32(dst + x + 4, v_dst2); 01007 } 01008 } 01009 01010 return x; 01011 } 01012 }; 01013 01014 #elif CV_SSE2 01015 01016 #if CV_SSE4_1 01017 01018 template <> 01019 struct Mul_SIMD<ushort, float> 01020 { 01021 Mul_SIMD() 01022 { 01023 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 01024 } 01025 01026 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const 01027 { 01028 int x = 0; 01029 01030 if (!haveSSE) 01031 return x; 01032 01033 __m128i v_zero = _mm_setzero_si128(); 01034 01035 if( scale != 1.0f ) 01036 { 01037 __m128 v_scale = _mm_set1_ps(scale); 01038 for ( ; x <= width - 8; x += 8) 01039 { 01040 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); 01041 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); 01042 01043 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), 01044 _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); 01045 v_dst1 = _mm_mul_ps(v_dst1, v_scale); 01046 01047 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), 01048 _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); 01049 v_dst2 = _mm_mul_ps(v_dst2, v_scale); 01050 01051 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 01052 _mm_storeu_si128((__m128i *)(dst + x), v_dsti); 01053 } 01054 } 01055 01056 return x; 01057 } 01058 01059 bool haveSSE; 01060 }; 01061 01062 #endif 01063 01064 template <> 01065 struct Mul_SIMD<schar, float> 01066 { 01067 Mul_SIMD() 01068 { 01069 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 01070 } 01071 01072 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const 01073 { 01074 int x = 0; 01075 01076 if (!haveSSE) 01077 return x; 01078 01079 __m128i v_zero = _mm_setzero_si128(); 01080 01081 if( scale == 1.0f ) 01082 for ( ; x <= width - 8; x += 8) 01083 { 01084 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); 01085 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); 01086 01087 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); 01088 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); 01089 01090 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), 01091 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); 01092 01093 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), 01094 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); 01095 01096 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 01097 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); 01098 } 01099 else 01100 { 01101 __m128 v_scale = _mm_set1_ps(scale); 01102 for ( ; x <= width - 8; x += 8) 01103 { 01104 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); 01105 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); 01106 01107 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); 01108 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); 01109 01110 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), 01111 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); 01112 v_dst1 = _mm_mul_ps(v_dst1, v_scale); 01113 01114 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), 01115 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); 01116 v_dst2 = _mm_mul_ps(v_dst2, v_scale); 01117 01118 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 01119 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); 01120 } 01121 } 01122 01123 return x; 01124 } 01125 01126 bool haveSSE; 01127 }; 01128 01129 template <> 01130 struct Mul_SIMD<short, float> 01131 { 01132 Mul_SIMD() 01133 { 01134 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 01135 } 01136 01137 int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const 01138 { 01139 int x = 0; 01140 01141 if (!haveSSE) 01142 return x; 01143 01144 __m128i v_zero = _mm_setzero_si128(); 01145 01146 if( scale != 1.0f ) 01147 { 01148 __m128 v_scale = _mm_set1_ps(scale); 01149 for ( ; x <= width - 8; x += 8) 01150 { 01151 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); 01152 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); 01153 01154 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), 01155 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); 01156 v_dst1 = _mm_mul_ps(v_dst1, v_scale); 01157 01158 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), 01159 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); 01160 v_dst2 = _mm_mul_ps(v_dst2, v_scale); 01161 01162 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 01163 _mm_storeu_si128((__m128i *)(dst + x), v_dsti); 01164 } 01165 } 01166 01167 return x; 01168 } 01169 01170 bool haveSSE; 01171 }; 01172 01173 #endif 01174 01175 template <typename T> 01176 struct Div_SIMD 01177 { 01178 int operator() (const T *, const T *, T *, int, double) const 01179 { 01180 return 0; 01181 } 01182 }; 01183 01184 template <typename T> 01185 struct Recip_SIMD 01186 { 01187 int operator() (const T *, T *, int, double) const 01188 { 01189 return 0; 01190 } 01191 }; 01192 01193 01194 #if CV_SIMD128 01195 01196 template <> 01197 struct Div_SIMD<uchar> 01198 { 01199 bool haveSIMD; 01200 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01201 01202 int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const 01203 { 01204 int x = 0; 01205 01206 if (!haveSIMD) 01207 return x; 01208 01209 v_float32x4 v_scale = v_setall_f32((float)scale); 01210 
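        // Commentary (added, not in the original source): v_zero below serves both as
        // the comparison operand and as the substituted value:
        // res = v_select(v_src2 == v_zero, v_zero, res) forces the result to 0 wherever
        // the divisor is 0, the same x/0 == 0 convention used by OpenCV's scalar divide path.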
v_uint16x8 v_zero = v_setzero_u16(); 01211 01212 for ( ; x <= width - 8; x += 8) 01213 { 01214 v_uint16x8 v_src1 = v_load_expand(src1 + x); 01215 v_uint16x8 v_src2 = v_load_expand(src2 + x); 01216 01217 v_uint32x4 t0, t1, t2, t3; 01218 v_expand(v_src1, t0, t1); 01219 v_expand(v_src2, t2, t3); 01220 01221 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 01222 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 01223 01224 v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); 01225 v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); 01226 01227 f0 = f0 * v_scale / f2; 01228 f1 = f1 * v_scale / f3; 01229 01230 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01231 v_uint16x8 res = v_pack_u(i0, i1); 01232 01233 res = v_select(v_src2 == v_zero, v_zero, res); 01234 v_pack_store(dst + x, res); 01235 } 01236 01237 return x; 01238 } 01239 }; 01240 01241 01242 template <> 01243 struct Div_SIMD<schar> 01244 { 01245 bool haveSIMD; 01246 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01247 01248 int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const 01249 { 01250 int x = 0; 01251 01252 if (!haveSIMD) 01253 return x; 01254 01255 v_float32x4 v_scale = v_setall_f32((float)scale); 01256 v_int16x8 v_zero = v_setzero_s16(); 01257 01258 for ( ; x <= width - 8; x += 8) 01259 { 01260 v_int16x8 v_src1 = v_load_expand(src1 + x); 01261 v_int16x8 v_src2 = v_load_expand(src2 + x); 01262 01263 v_int32x4 t0, t1, t2, t3; 01264 v_expand(v_src1, t0, t1); 01265 v_expand(v_src2, t2, t3); 01266 01267 v_float32x4 f0 = v_cvt_f32(t0); 01268 v_float32x4 f1 = v_cvt_f32(t1); 01269 01270 v_float32x4 f2 = v_cvt_f32(t2); 01271 v_float32x4 f3 = v_cvt_f32(t3); 01272 01273 f0 = f0 * v_scale / f2; 01274 f1 = f1 * v_scale / f3; 01275 01276 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01277 v_int16x8 res = v_pack(i0, i1); 01278 01279 res = v_select(v_src2 == v_zero, v_zero, res); 01280 v_pack_store(dst + x, res); 01281 } 01282 01283 return x; 01284 } 01285 }; 01286 01287 01288 template <> 01289 struct Div_SIMD<ushort> 01290 { 01291 bool haveSIMD; 01292 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01293 01294 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const 01295 { 01296 int x = 0; 01297 01298 if (!haveSIMD) 01299 return x; 01300 01301 v_float32x4 v_scale = v_setall_f32((float)scale); 01302 v_uint16x8 v_zero = v_setzero_u16(); 01303 01304 for ( ; x <= width - 8; x += 8) 01305 { 01306 v_uint16x8 v_src1 = v_load(src1 + x); 01307 v_uint16x8 v_src2 = v_load(src2 + x); 01308 01309 v_uint32x4 t0, t1, t2, t3; 01310 v_expand(v_src1, t0, t1); 01311 v_expand(v_src2, t2, t3); 01312 01313 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 01314 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 01315 01316 v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); 01317 v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); 01318 01319 f0 = f0 * v_scale / f2; 01320 f1 = f1 * v_scale / f3; 01321 01322 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01323 v_uint16x8 res = v_pack_u(i0, i1); 01324 01325 res = v_select(v_src2 == v_zero, v_zero, res); 01326 v_store(dst + x, res); 01327 } 01328 01329 return x; 01330 } 01331 }; 01332 01333 template <> 01334 struct Div_SIMD<short> 01335 { 01336 bool haveSIMD; 01337 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01338 01339 int operator() (const short * 
src1, const short * src2, short * dst, int width, double scale) const 01340 { 01341 int x = 0; 01342 01343 if (!haveSIMD) 01344 return x; 01345 01346 v_float32x4 v_scale = v_setall_f32((float)scale); 01347 v_int16x8 v_zero = v_setzero_s16(); 01348 01349 for ( ; x <= width - 8; x += 8) 01350 { 01351 v_int16x8 v_src1 = v_load(src1 + x); 01352 v_int16x8 v_src2 = v_load(src2 + x); 01353 01354 v_int32x4 t0, t1, t2, t3; 01355 v_expand(v_src1, t0, t1); 01356 v_expand(v_src2, t2, t3); 01357 01358 v_float32x4 f0 = v_cvt_f32(t0); 01359 v_float32x4 f1 = v_cvt_f32(t1); 01360 01361 v_float32x4 f2 = v_cvt_f32(t2); 01362 v_float32x4 f3 = v_cvt_f32(t3); 01363 01364 f0 = f0 * v_scale / f2; 01365 f1 = f1 * v_scale / f3; 01366 01367 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01368 v_int16x8 res = v_pack(i0, i1); 01369 01370 res = v_select(v_src2 == v_zero, v_zero, res); 01371 v_store(dst + x, res); 01372 } 01373 01374 return x; 01375 } 01376 }; 01377 01378 template <> 01379 struct Div_SIMD<int> 01380 { 01381 bool haveSIMD; 01382 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01383 01384 int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const 01385 { 01386 int x = 0; 01387 01388 if (!haveSIMD) 01389 return x; 01390 01391 v_float32x4 v_scale = v_setall_f32((float)scale); 01392 v_int32x4 v_zero = v_setzero_s32(); 01393 01394 for ( ; x <= width - 8; x += 8) 01395 { 01396 v_int32x4 t0 = v_load(src1 + x); 01397 v_int32x4 t1 = v_load(src1 + x + 4); 01398 v_int32x4 t2 = v_load(src2 + x); 01399 v_int32x4 t3 = v_load(src2 + x + 4); 01400 01401 v_float32x4 f0 = v_cvt_f32(t0); 01402 v_float32x4 f1 = v_cvt_f32(t1); 01403 v_float32x4 f2 = v_cvt_f32(t2); 01404 v_float32x4 f3 = v_cvt_f32(t3); 01405 01406 f0 = f0 * v_scale / f2; 01407 f1 = f1 * v_scale / f3; 01408 01409 v_int32x4 res0 = v_round(f0), res1 = v_round(f1); 01410 01411 res0 = v_select(t2 == v_zero, v_zero, res0); 01412 res1 = v_select(t3 == v_zero, v_zero, res1); 01413 v_store(dst + x, res0); 01414 v_store(dst + x + 4, res1); 01415 } 01416 01417 return x; 01418 } 01419 }; 01420 01421 01422 template <> 01423 struct Div_SIMD<float> 01424 { 01425 bool haveSIMD; 01426 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01427 01428 int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const 01429 { 01430 int x = 0; 01431 01432 if (!haveSIMD) 01433 return x; 01434 01435 v_float32x4 v_scale = v_setall_f32((float)scale); 01436 v_float32x4 v_zero = v_setzero_f32(); 01437 01438 for ( ; x <= width - 8; x += 8) 01439 { 01440 v_float32x4 f0 = v_load(src1 + x); 01441 v_float32x4 f1 = v_load(src1 + x + 4); 01442 v_float32x4 f2 = v_load(src2 + x); 01443 v_float32x4 f3 = v_load(src2 + x + 4); 01444 01445 v_float32x4 res0 = f0 * v_scale / f2; 01446 v_float32x4 res1 = f1 * v_scale / f3; 01447 01448 res0 = v_select(f2 == v_zero, v_zero, res0); 01449 res1 = v_select(f3 == v_zero, v_zero, res1); 01450 01451 v_store(dst + x, res0); 01452 v_store(dst + x + 4, res1); 01453 } 01454 01455 return x; 01456 } 01457 }; 01458 01459 01460 ///////////////////////// RECIPROCAL ////////////////////// 01461 01462 template <> 01463 struct Recip_SIMD<uchar> 01464 { 01465 bool haveSIMD; 01466 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01467 01468 int operator() (const uchar * src2, uchar * dst, int width, double scale) const 01469 { 01470 int x = 0; 01471 01472 if 
(!haveSIMD) 01473 return x; 01474 01475 v_float32x4 v_scale = v_setall_f32((float)scale); 01476 v_uint16x8 v_zero = v_setzero_u16(); 01477 01478 for ( ; x <= width - 8; x += 8) 01479 { 01480 v_uint16x8 v_src2 = v_load_expand(src2 + x); 01481 01482 v_uint32x4 t0, t1; 01483 v_expand(v_src2, t0, t1); 01484 01485 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 01486 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 01487 01488 f0 = v_scale / f0; 01489 f1 = v_scale / f1; 01490 01491 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01492 v_uint16x8 res = v_pack_u(i0, i1); 01493 01494 res = v_select(v_src2 == v_zero, v_zero, res); 01495 v_pack_store(dst + x, res); 01496 } 01497 01498 return x; 01499 } 01500 }; 01501 01502 01503 template <> 01504 struct Recip_SIMD<schar> 01505 { 01506 bool haveSIMD; 01507 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01508 01509 int operator() (const schar * src2, schar * dst, int width, double scale) const 01510 { 01511 int x = 0; 01512 01513 if (!haveSIMD) 01514 return x; 01515 01516 v_float32x4 v_scale = v_setall_f32((float)scale); 01517 v_int16x8 v_zero = v_setzero_s16(); 01518 01519 for ( ; x <= width - 8; x += 8) 01520 { 01521 v_int16x8 v_src2 = v_load_expand(src2 + x); 01522 01523 v_int32x4 t0, t1; 01524 v_expand(v_src2, t0, t1); 01525 01526 v_float32x4 f0 = v_cvt_f32(t0); 01527 v_float32x4 f1 = v_cvt_f32(t1); 01528 01529 f0 = v_scale / f0; 01530 f1 = v_scale / f1; 01531 01532 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01533 v_int16x8 res = v_pack(i0, i1); 01534 01535 res = v_select(v_src2 == v_zero, v_zero, res); 01536 v_pack_store(dst + x, res); 01537 } 01538 01539 return x; 01540 } 01541 }; 01542 01543 01544 template <> 01545 struct Recip_SIMD<ushort> 01546 { 01547 bool haveSIMD; 01548 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01549 01550 int operator() (const ushort * src2, ushort * dst, int width, double scale) const 01551 { 01552 int x = 0; 01553 01554 if (!haveSIMD) 01555 return x; 01556 01557 v_float32x4 v_scale = v_setall_f32((float)scale); 01558 v_uint16x8 v_zero = v_setzero_u16(); 01559 01560 for ( ; x <= width - 8; x += 8) 01561 { 01562 v_uint16x8 v_src2 = v_load(src2 + x); 01563 01564 v_uint32x4 t0, t1; 01565 v_expand(v_src2, t0, t1); 01566 01567 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 01568 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 01569 01570 f0 = v_scale / f0; 01571 f1 = v_scale / f1; 01572 01573 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 01574 v_uint16x8 res = v_pack_u(i0, i1); 01575 01576 res = v_select(v_src2 == v_zero, v_zero, res); 01577 v_store(dst + x, res); 01578 } 01579 01580 return x; 01581 } 01582 }; 01583 01584 template <> 01585 struct Recip_SIMD<short> 01586 { 01587 bool haveSIMD; 01588 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01589 01590 int operator() (const short * src2, short * dst, int width, double scale) const 01591 { 01592 int x = 0; 01593 01594 if (!haveSIMD) 01595 return x; 01596 01597 v_float32x4 v_scale = v_setall_f32((float)scale); 01598 v_int16x8 v_zero = v_setzero_s16(); 01599 01600 for ( ; x <= width - 8; x += 8) 01601 { 01602 v_int16x8 v_src2 = v_load(src2 + x); 01603 01604 v_int32x4 t0, t1; 01605 v_expand(v_src2, t0, t1); 01606 01607 v_float32x4 f0 = v_cvt_f32(t0); 01608 v_float32x4 f1 = v_cvt_f32(t1); 01609 01610 f0 = v_scale / f0; 01611 f1 = v_scale / f1; 01612 01613 v_int32x4 i0 = v_round(f0), i1 = 
v_round(f1); 01614 v_int16x8 res = v_pack(i0, i1); 01615 01616 res = v_select(v_src2 == v_zero, v_zero, res); 01617 v_store(dst + x, res); 01618 } 01619 01620 return x; 01621 } 01622 }; 01623 01624 template <> 01625 struct Recip_SIMD<int> 01626 { 01627 bool haveSIMD; 01628 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01629 01630 int operator() (const int * src2, int * dst, int width, double scale) const 01631 { 01632 int x = 0; 01633 01634 if (!haveSIMD) 01635 return x; 01636 01637 v_float32x4 v_scale = v_setall_f32((float)scale); 01638 v_int32x4 v_zero = v_setzero_s32(); 01639 01640 for ( ; x <= width - 8; x += 8) 01641 { 01642 v_int32x4 t0 = v_load(src2 + x); 01643 v_int32x4 t1 = v_load(src2 + x + 4); 01644 01645 v_float32x4 f0 = v_cvt_f32(t0); 01646 v_float32x4 f1 = v_cvt_f32(t1); 01647 01648 f0 = v_scale / f0; 01649 f1 = v_scale / f1; 01650 01651 v_int32x4 res0 = v_round(f0), res1 = v_round(f1); 01652 01653 res0 = v_select(t0 == v_zero, v_zero, res0); 01654 res1 = v_select(t1 == v_zero, v_zero, res1); 01655 v_store(dst + x, res0); 01656 v_store(dst + x + 4, res1); 01657 } 01658 01659 return x; 01660 } 01661 }; 01662 01663 01664 template <> 01665 struct Recip_SIMD<float> 01666 { 01667 bool haveSIMD; 01668 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01669 01670 int operator() (const float * src2, float * dst, int width, double scale) const 01671 { 01672 int x = 0; 01673 01674 if (!haveSIMD) 01675 return x; 01676 01677 v_float32x4 v_scale = v_setall_f32((float)scale); 01678 v_float32x4 v_zero = v_setzero_f32(); 01679 01680 for ( ; x <= width - 8; x += 8) 01681 { 01682 v_float32x4 f0 = v_load(src2 + x); 01683 v_float32x4 f1 = v_load(src2 + x + 4); 01684 01685 v_float32x4 res0 = v_scale / f0; 01686 v_float32x4 res1 = v_scale / f1; 01687 01688 res0 = v_select(f0 == v_zero, v_zero, res0); 01689 res1 = v_select(f1 == v_zero, v_zero, res1); 01690 01691 v_store(dst + x, res0); 01692 v_store(dst + x + 4, res1); 01693 } 01694 01695 return x; 01696 } 01697 }; 01698 01699 #if CV_SIMD128_64F 01700 01701 template <> 01702 struct Div_SIMD<double> 01703 { 01704 bool haveSIMD; 01705 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01706 01707 int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const 01708 { 01709 int x = 0; 01710 01711 if (!haveSIMD) 01712 return x; 01713 01714 v_float64x2 v_scale = v_setall_f64(scale); 01715 v_float64x2 v_zero = v_setzero_f64(); 01716 01717 for ( ; x <= width - 4; x += 4) 01718 { 01719 v_float64x2 f0 = v_load(src1 + x); 01720 v_float64x2 f1 = v_load(src1 + x + 2); 01721 v_float64x2 f2 = v_load(src2 + x); 01722 v_float64x2 f3 = v_load(src2 + x + 2); 01723 01724 v_float64x2 res0 = f0 * v_scale / f2; 01725 v_float64x2 res1 = f1 * v_scale / f3; 01726 01727 res0 = v_select(f0 == v_zero, v_zero, res0); 01728 res1 = v_select(f1 == v_zero, v_zero, res1); 01729 01730 v_store(dst + x, res0); 01731 v_store(dst + x + 2, res1); 01732 } 01733 01734 return x; 01735 } 01736 }; 01737 01738 template <> 01739 struct Recip_SIMD<double> 01740 { 01741 bool haveSIMD; 01742 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 01743 01744 int operator() (const double * src2, double * dst, int width, double scale) const 01745 { 01746 int x = 0; 01747 01748 if (!haveSIMD) 01749 return x; 01750 01751 v_float64x2 v_scale = v_setall_f64(scale); 01752 
        v_float64x2 v_zero = v_setzero_f64();

        for ( ; x <= width - 4; x += 4)
        {
            v_float64x2 f0 = v_load(src2 + x);
            v_float64x2 f1 = v_load(src2 + x + 2);

            v_float64x2 res0 = v_scale / f0;
            v_float64x2 res1 = v_scale / f1;

            res0 = v_select(f0 == v_zero, v_zero, res0);
            res1 = v_select(f1 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 2, res1);
        }

        return x;
    }
};

#endif

#endif


// AddWeighted_SIMD<T, WT> vectorizes dst[i] = saturate<T>(src1[i]*alpha + src2[i]*beta + gamma).
// The generic template processes nothing and returns 0, leaving the whole row to the scalar path;
// the specializations below handle the SIMD-sized prefix and return how many elements they wrote.
template <typename T, typename WT>
struct AddWeighted_SIMD
{
    int operator() (const T *, const T *, T *, int, WT, WT, WT) const
    {
        return 0;
    }
};

#if CV_SSE2

template <>
struct AddWeighted_SIMD<schar, float>
{
    AddWeighted_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));

            // sign-extend the 8-bit inputs to 16 bits: unpack into the high byte,
            // then arithmetic-shift right by 8
            __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
            __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));

            __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
                                              _mm_cvtps_epi32(v_dstf1));

            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
        }

        return x;
    }

    bool haveSSE2;
};

template <>
struct AddWeighted_SIMD<short, float>
{
    AddWeighted_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
                                                                   _mm_cvtps_epi32(v_dstf1)));
        }

        return x;
    }

    bool haveSSE2;
};

#if CV_SSE4_1

template <>
struct AddWeighted_SIMD<ushort, float>
{
    AddWeighted_SIMD()
    {
        haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE4_1)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
                                                                    _mm_cvtps_epi32(v_dstf1)));
        }

        return x;
    }

    bool haveSSE4_1;
};

#endif

#elif CV_NEON

template <>
struct AddWeighted_SIMD<schar, float>
{
    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            int8x8_t in1 = vld1_s8(src1 + x);
            int16x8_t in1_16 = vmovl_s8(in1);
            float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
            float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));

            int8x8_t in2 = vld1_s8(src2 + x);
            int16x8_t in2_16 = vmovl_s8(in2);
            float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
            float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));

            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
            out_f_l = vaddq_f32(out_f_l, g);
            out_f_h = vaddq_f32(out_f_h, g);

            int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
            int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));

            int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
            int8x8_t out = vqmovn_s16(out_16);

            vst1_s8(dst + x, out);
        }

        return x;
    }
};

template <>
struct AddWeighted_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
            uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
            v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
            uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
        }

        return x;
    }
};

template <>
struct AddWeighted_SIMD<short, float>
{
    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
            int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
            v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
            int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
        }

        return x;
    }
};

#endif

}

#endif // __OPENCV_ARITHM_SIMD_HPP__
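Every functor in this header follows the same contract: operator() vectorizes as many elements of the row as fit into whole SIMD registers, then returns the index at which the caller's scalar loop must take over (0 when no SIMD support was detected). The sketch below is illustrative only, not code from this file: the helper name recip_row is a hypothetical example, and it assumes it is compiled inside the cv namespace where Recip_SIMD and saturate_cast live.

    // Hypothetical caller: SIMD prefix via Recip_SIMD, scalar tail for the remainder.
    static void recip_row(const ushort* src2, ushort* dst, int width, double scale)
    {
        Recip_SIMD<ushort> vop;                // probes SSE2/NEON support once at construction
        int x = vop(src2, dst, width, scale);  // number of elements already written

        for ( ; x < width; x++)                // scalar tail, same zero-divisor rule
            dst[x] = src2[x] != 0 ? saturate_cast<ushort>(scale / src2[x]) : 0;
    }

The AddWeighted_SIMD specializations are consumed the same way, with the scalar tail computing saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma).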