Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Dependents: RZ_A2M_Mbed_samples
intrin_cpp.hpp
00001 /*M/////////////////////////////////////////////////////////////////////////////////////// 00002 // 00003 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 00004 // 00005 // By downloading, copying, installing or using the software you agree to this license. 00006 // If you do not agree to this license, do not download, install, 00007 // copy or use the software. 00008 // 00009 // 00010 // License Agreement 00011 // For Open Source Computer Vision Library 00012 // 00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 00016 // Copyright (C) 2015, Itseez Inc., all rights reserved. 00017 // Third party copyrights are property of their respective owners. 00018 // 00019 // Redistribution and use in source and binary forms, with or without modification, 00020 // are permitted provided that the following conditions are met: 00021 // 00022 // * Redistribution's of source code must retain the above copyright notice, 00023 // this list of conditions and the following disclaimer. 00024 // 00025 // * Redistribution's in binary form must reproduce the above copyright notice, 00026 // this list of conditions and the following disclaimer in the documentation 00027 // and/or other materials provided with the distribution. 00028 // 00029 // * The name of the copyright holders may not be used to endorse or promote products 00030 // derived from this software without specific prior written permission. 00031 // 00032 // This software is provided by the copyright holders and contributors "as is" and 00033 // any express or implied warranties, including, but not limited to, the implied 00034 // warranties of merchantability and fitness for a particular purpose are disclaimed. 
00035 // In no event shall the Intel Corporation or contributors be liable for any direct, 00036 // indirect, incidental, special, exemplary, or consequential damages 00037 // (including, but not limited to, procurement of substitute goods or services; 00038 // loss of use, data, or profits; or business interruption) however caused 00039 // and on any theory of liability, whether in contract, strict liability, 00040 // or tort (including negligence or otherwise) arising in any way out of 00041 // the use of this software, even if advised of the possibility of such damage. 00042 // 00043 //M*/ 00044 00045 #ifndef OPENCV_HAL_INTRIN_CPP_HPP 00046 #define OPENCV_HAL_INTRIN_CPP_HPP 00047 00048 #include <limits> 00049 #include <cstring> 00050 #include <algorithm> 00051 #include "opencv2/core/saturate.hpp" 00052 00053 namespace cv 00054 { 00055 00056 /** @addtogroup core_hal_intrin 00057 00058 "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on 00059 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86 00060 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers 00061 containing packed values of different types. In case when there is no SIMD extension available 00062 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as 00063 expected although it could be slower. 00064 00065 ### Types 00066 00067 There are several types representing 128-bit register as a vector of packed values, each type is 00068 implemented as a structure based on a one SIMD register. 
00069 00070 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char 00071 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short 00072 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int 00073 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64 00074 - cv::v_float32x4: four 32-bit floating point values (signed) - float 00075 - cv::v_float64x2: two 64-bit floating point values (signed) - double 00076 00077 @note 00078 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to 00079 check the CV_SIMD128_64F preprocessor definition: 00080 @code 00081 #if CV_SIMD128_64F 00082 //... 00083 #endif 00084 @endcode 00085 00086 ### Load and store operations 00087 00088 These operations allow to set contents of the register explicitly or by loading it from some memory 00089 block and to save contents of the register to memory block. 00090 00091 - Constructors: 00092 @ref v_reg::v_reg(const _Tp *ptr) "from memory", 00093 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ... 00094 - Other create methods: 00095 @ref v_setall_s8, @ref v_setall_u8, ..., 00096 @ref v_setzero_u8, @ref v_setzero_s8, ... 00097 - Memory operations: 00098 @ref v_load, @ref v_load_aligned, @ref v_load_halves, 00099 @ref v_store, @ref v_store_aligned, 00100 @ref v_store_high, @ref v_store_low 00101 00102 ### Value reordering 00103 00104 These operations allow to reorder or recombine elements in one or multiple vectors. 
00105 00106 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave 00107 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand 00108 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, 00109 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store 00110 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high 00111 - Extract: @ref v_extract 00112 00113 00114 ### Arithmetic, bitwise and comparison operations 00115 00116 Element-wise binary and unary operations. 00117 00118 - Arithmetics: 00119 @ref operator +(const v_reg &a, const v_reg &b) "+", 00120 @ref operator -(const v_reg &a, const v_reg &b) "-", 00121 @ref operator *(const v_reg &a, const v_reg &b) "*", 00122 @ref operator /(const v_reg &a, const v_reg &b) "/", 00123 @ref v_mul_expand 00124 00125 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap 00126 00127 - Bitwise shifts: 00128 @ref operator <<(const v_reg &a, int s) "<<", 00129 @ref operator >>(const v_reg &a, int s) ">>", 00130 @ref v_shl, @ref v_shr 00131 00132 - Bitwise logic: 00133 @ref operator&(const v_reg &a, const v_reg &b) "&", 00134 @ref operator |(const v_reg &a, const v_reg &b) "|", 00135 @ref operator ^(const v_reg &a, const v_reg &b) "^", 00136 @ref operator ~(const v_reg &a) "~" 00137 00138 - Comparison: 00139 @ref operator >(const v_reg &a, const v_reg &b) ">", 00140 @ref operator >=(const v_reg &a, const v_reg &b) ">=", 00141 @ref operator <(const v_reg &a, const v_reg &b) "<", 00142 @ref operator <=(const v_reg &a, const v_reg &b) "<=", 00143 @ref operator==(const v_reg &a, const v_reg &b) "==", 00144 @ref operator !=(const v_reg &a, const v_reg &b) "!=" 00145 00146 - min/max: @ref v_min, @ref v_max 00147 00148 ### Reduce and mask 00149 00150 Most of these operations return only one value. 
00151 00152 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum 00153 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select 00154 00155 ### Other math 00156 00157 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude 00158 - Absolute values: @ref v_abs, @ref v_absdiff 00159 00160 ### Conversions 00161 00162 Different type conversions and casts: 00163 00164 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc, 00165 - To float: @ref v_cvt_f32, @ref v_cvt_f64 00166 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ... 00167 00168 ### Matrix operations 00169 00170 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4 00171 00172 ### Usability 00173 00174 Most operations are implemented only for some subset of the available types, following matrices 00175 shows the applicability of different operations to the types. 00176 00177 Regular integers: 00178 00179 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 | 00180 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:| 00181 |load, store | x | x | x | x | x | x | 00182 |interleave | x | x | x | x | x | x | 00183 |expand | x | x | x | x | x | x | 00184 |expand_q | x | x | | | | | 00185 |add, sub | x | x | x | x | x | x | 00186 |add_wrap, sub_wrap | x | x | x | x | | | 00187 |mul | | | x | x | x | x | 00188 |mul_expand | | | x | x | x | | 00189 |compare | x | x | x | x | x | x | 00190 |shift | | | x | x | x | x | 00191 |dotprod | | | | x | | | 00192 |logical | x | x | x | x | x | x | 00193 |min, max | x | x | x | x | x | x | 00194 |absdiff | x | x | x | x | x | x | 00195 |reduce | | | | | x | x | 00196 |mask | x | x | x | x | x | x | 00197 |pack | x | x | x | x | x | x | 00198 |pack_u | x | | x | | | | 00199 |unpack | x | x | x | x | x | x | 00200 |extract | x | x | x | x | x | x | 00201 |cvt_flt32 | | | | | | x | 00202 |cvt_flt64 | 
| | | | | x | 00203 |transpose4x4 | | | | | x | x | 00204 00205 Big integers: 00206 00207 | Operations\\Types | uint 64x2 | int 64x2 | 00208 |-------------------|:-:|:-:| 00209 |load, store | x | x | 00210 |add, sub | x | x | 00211 |shift | x | x | 00212 |logical | x | x | 00213 |extract | x | x | 00214 00215 Floating point: 00216 00217 | Operations\\Types | float 32x4 | float 64x2 | 00218 |-------------------|:-:|:-:| 00219 |load, store | x | x | 00220 |interleave | x | | 00221 |add, sub | x | x | 00222 |mul | x | x | 00223 |div | x | x | 00224 |compare | x | x | 00225 |min, max | x | x | 00226 |absdiff | x | x | 00227 |reduce | x | | 00228 |mask | x | x | 00229 |unpack | x | x | 00230 |cvt_flt32 | | x | 00231 |cvt_flt64 | x | | 00232 |sqrt, abs | x | x | 00233 |float math | x | x | 00234 |transpose4x4 | x | | 00235 00236 00237 @{ */ 00238 00239 template<typename _Tp, int n> struct v_reg 00240 { 00241 //! @cond IGNORED 00242 typedef _Tp lane_type; 00243 typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec; 00244 typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec; 00245 enum { nlanes = n }; 00246 // !@endcond 00247 00248 /** @brief Constructor 00249 00250 Initializes register with data from memory 00251 @param ptr pointer to memory block with data for register */ 00252 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } 00253 00254 /** @brief Constructor 00255 00256 Initializes register with two 64-bit values */ 00257 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } 00258 00259 /** @brief Constructor 00260 00261 Initializes register with four 32-bit values */ 00262 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } 00263 00264 /** @brief Constructor 00265 00266 Initializes register with eight 16-bit values */ 00267 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, 00268 _Tp s4, _Tp s5, _Tp s6, _Tp s7) 00269 { 00270 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; 00271 s[4] = s4; s[5] = s5; s[6] = 
s6; s[7] = s7; 00272 } 00273 00274 /** @brief Constructor 00275 00276 Initializes register with sixteen 8-bit values */ 00277 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, 00278 _Tp s4, _Tp s5, _Tp s6, _Tp s7, 00279 _Tp s8, _Tp s9, _Tp s10, _Tp s11, 00280 _Tp s12, _Tp s13, _Tp s14, _Tp s15) 00281 { 00282 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; 00283 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; 00284 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; 00285 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; 00286 } 00287 00288 /** @brief Default constructor 00289 00290 Does not initialize anything*/ 00291 v_reg() {} 00292 00293 /** @brief Copy constructor */ 00294 v_reg(const v_reg<_Tp, n> & r) 00295 { 00296 for( int i = 0; i < n; i++ ) 00297 s[i] = r.s[i]; 00298 } 00299 /** @brief Access first value 00300 00301 Returns value of the first lane according to register type, for example: 00302 @code{.cpp} 00303 v_int32x4 r(1, 2, 3, 4); 00304 int v = r.get0(); // returns 1 00305 v_uint64x2 r(1, 2); 00306 uint64_t v = r.get0(); // returns 1 00307 @endcode 00308 */ 00309 _Tp get0() const { return s[0]; } 00310 00311 //! @cond IGNORED 00312 _Tp get(const int i) const { return s[i]; } 00313 v_reg<_Tp, n> high() const 00314 { 00315 v_reg<_Tp, n> c; 00316 int i; 00317 for( i = 0; i < n/2; i++ ) 00318 { 00319 c.s[i] = s[i+(n/2)]; 00320 c.s[i+(n/2)] = 0; 00321 } 00322 return c; 00323 } 00324 00325 static v_reg<_Tp, n> zero() 00326 { 00327 v_reg<_Tp, n> c; 00328 for( int i = 0; i < n; i++ ) 00329 c.s[i] = (_Tp)0; 00330 return c; 00331 } 00332 00333 static v_reg<_Tp, n> all(_Tp s) 00334 { 00335 v_reg<_Tp, n> c; 00336 for( int i = 0; i < n; i++ ) 00337 c.s[i] = s; 00338 return c; 00339 } 00340 00341 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const 00342 { 00343 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); 00344 v_reg<_Tp2, n2> c; 00345 std::memcpy(&c.s[0], &s[0], bytes); 00346 return c; 00347 } 00348 00349 _Tp s[n]; 00350 //! 
@endcond 00351 }; 00352 00353 /** @brief Sixteen 8-bit unsigned integer values */ 00354 typedef v_reg<uchar, 16> v_uint8x16; 00355 /** @brief Sixteen 8-bit signed integer values */ 00356 typedef v_reg<schar, 16> v_int8x16; 00357 /** @brief Eight 16-bit unsigned integer values */ 00358 typedef v_reg<ushort, 8> v_uint16x8; 00359 /** @brief Eight 16-bit signed integer values */ 00360 typedef v_reg<short, 8> v_int16x8; 00361 /** @brief Four 32-bit unsigned integer values */ 00362 typedef v_reg<unsigned, 4> v_uint32x4; 00363 /** @brief Four 32-bit signed integer values */ 00364 typedef v_reg<int, 4> v_int32x4; 00365 /** @brief Four 32-bit floating point values (single precision) */ 00366 typedef v_reg<float, 4> v_float32x4; 00367 /** @brief Two 64-bit floating point values (double precision) */ 00368 typedef v_reg<double, 2> v_float64x2; 00369 /** @brief Two 64-bit unsigned integer values */ 00370 typedef v_reg<uint64, 2> v_uint64x2; 00371 /** @brief Two 64-bit signed integer values */ 00372 typedef v_reg<int64, 2> v_int64x2; 00373 00374 //! @brief Helper macro 00375 //! @ingroup core_hal_intrin_impl 00376 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \ 00377 template<typename _Tp, int n> inline v_reg<_Tp, n> \ 00378 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00379 { \ 00380 v_reg<_Tp, n> c; \ 00381 for( int i = 0; i < n; i++ ) \ 00382 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ 00383 return c; \ 00384 } \ 00385 template<typename _Tp, int n> inline v_reg<_Tp, n>& \ 00386 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00387 { \ 00388 for( int i = 0; i < n; i++ ) \ 00389 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ 00390 return a; \ 00391 } 00392 00393 /** @brief Add values 00394 00395 For all types. */ 00396 OPENCV_HAL_IMPL_BIN_OP(+) 00397 00398 /** @brief Subtract values 00399 00400 For all types. 
*/ 00401 OPENCV_HAL_IMPL_BIN_OP(-) 00402 00403 /** @brief Multiply values 00404 00405 For 16- and 32-bit integer types and floating types. */ 00406 OPENCV_HAL_IMPL_BIN_OP(*) 00407 00408 /** @brief Divide values 00409 00410 For floating types only. */ 00411 OPENCV_HAL_IMPL_BIN_OP(/) 00412 00413 //! @brief Helper macro 00414 //! @ingroup core_hal_intrin_impl 00415 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ 00416 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \ 00417 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00418 { \ 00419 v_reg<_Tp, n> c; \ 00420 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 00421 for( int i = 0; i < n; i++ ) \ 00422 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ 00423 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ 00424 return c; \ 00425 } \ 00426 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \ 00427 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00428 { \ 00429 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 00430 for( int i = 0; i < n; i++ ) \ 00431 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ 00432 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ 00433 return a; \ 00434 } 00435 00436 /** @brief Bitwise AND 00437 00438 Only for integer types. */ 00439 OPENCV_HAL_IMPL_BIT_OP(&) 00440 00441 /** @brief Bitwise OR 00442 00443 Only for integer types. 
*/ 00444 OPENCV_HAL_IMPL_BIT_OP(|) 00445 00446 /** @brief Bitwise XOR 00447 00448 Only for integer types.*/ 00449 OPENCV_HAL_IMPL_BIT_OP(^) 00450 00451 /** @brief Bitwise NOT 00452 00453 Only for integer types.*/ 00454 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) 00455 { 00456 v_reg<_Tp, n> c; 00457 for( int i = 0; i < n; i++ ) 00458 { 00459 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); 00460 } 00461 return c; 00462 } 00463 00464 //! @brief Helper macro 00465 //! @ingroup core_hal_intrin_impl 00466 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ 00467 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ 00468 { \ 00469 v_reg<_Tp2, n> c; \ 00470 for( int i = 0; i < n; i++ ) \ 00471 c.s[i] = cfunc(a.s[i]); \ 00472 return c; \ 00473 } 00474 00475 /** @brief Square root of elements 00476 00477 Only for floating point types.*/ 00478 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) 00479 00480 //! @cond IGNORED 00481 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) 00482 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) 00483 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) 00484 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) 00485 //! 
@endcond 00486 00487 /** @brief Absolute value of elements 00488 00489 Only for floating point types.*/ 00490 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, 00491 typename V_TypeTraits<_Tp>::abs_type) 00492 00493 /** @brief Round elements 00494 00495 Only for floating point types.*/ 00496 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) 00497 00498 /** @brief Floor elements 00499 00500 Only for floating point types.*/ 00501 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) 00502 00503 /** @brief Ceil elements 00504 00505 Only for floating point types.*/ 00506 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) 00507 00508 /** @brief Truncate elements 00509 00510 Only for floating point types.*/ 00511 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) 00512 00513 //! @brief Helper macro 00514 //! @ingroup core_hal_intrin_impl 00515 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ 00516 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00517 { \ 00518 v_reg<_Tp, n> c; \ 00519 for( int i = 0; i < n; i++ ) \ 00520 c.s[i] = cfunc(a.s[i], b.s[i]); \ 00521 return c; \ 00522 } 00523 00524 //! @brief Helper macro 00525 //! @ingroup core_hal_intrin_impl 00526 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ 00527 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \ 00528 { \ 00529 _Tp c = a.s[0]; \ 00530 for( int i = 1; i < n; i++ ) \ 00531 c = cfunc(c, a.s[i]); \ 00532 return c; \ 00533 } 00534 00535 /** @brief Choose min values for each pair 00536 00537 Scheme: 00538 @code 00539 {A1 A2 ...} 00540 {B1 B2 ...} 00541 -------------- 00542 {min(A1,B1) min(A2,B2) ...} 00543 @endcode 00544 For all types except 64-bit integer. 
*/ 00545 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) 00546 00547 /** @brief Choose max values for each pair 00548 00549 Scheme: 00550 @code 00551 {A1 A2 ...} 00552 {B1 B2 ...} 00553 -------------- 00554 {max(A1,B1) max(A2,B2) ...} 00555 @endcode 00556 For all types except 64-bit integer. */ 00557 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) 00558 00559 /** @brief Find one min value 00560 00561 Scheme: 00562 @code 00563 {A1 A2 A3 ...} => min(A1,A2,A3,...) 00564 @endcode 00565 For 32-bit integer and 32-bit floating point types. */ 00566 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) 00567 00568 /** @brief Find one max value 00569 00570 Scheme: 00571 @code 00572 {A1 A2 A3 ...} => max(A1,A2,A3,...) 00573 @endcode 00574 For 32-bit integer and 32-bit floating point types. */ 00575 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) 00576 00577 //! @cond IGNORED 00578 template<typename _Tp, int n> 00579 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 00580 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) 00581 { 00582 for( int i = 0; i < n; i++ ) 00583 { 00584 minval.s[i] = std::min(a.s[i], b.s[i]); 00585 maxval.s[i] = std::max(a.s[i], b.s[i]); 00586 } 00587 } 00588 //! @endcond 00589 00590 //! @brief Helper macro 00591 //! @ingroup core_hal_intrin_impl 00592 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ 00593 template<typename _Tp, int n> \ 00594 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00595 { \ 00596 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 00597 v_reg<_Tp, n> c; \ 00598 for( int i = 0; i < n; i++ ) \ 00599 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ 00600 return c; \ 00601 } 00602 00603 /** @brief Less-than comparison 00604 00605 For all types except 64-bit integer values. */ 00606 OPENCV_HAL_IMPL_CMP_OP(<) 00607 00608 /** @brief Greater-than comparison 00609 00610 For all types except 64-bit integer values. 
*/ 00611 OPENCV_HAL_IMPL_CMP_OP(>) 00612 00613 /** @brief Less-than or equal comparison 00614 00615 For all types except 64-bit integer values. */ 00616 OPENCV_HAL_IMPL_CMP_OP(<=) 00617 00618 /** @brief Greater-than or equal comparison 00619 00620 For all types except 64-bit integer values. */ 00621 OPENCV_HAL_IMPL_CMP_OP(>=) 00622 00623 /** @brief Equal comparison 00624 00625 For all types except 64-bit integer values. */ 00626 OPENCV_HAL_IMPL_CMP_OP(==) 00627 00628 /** @brief Not equal comparison 00629 00630 For all types except 64-bit integer values. */ 00631 OPENCV_HAL_IMPL_CMP_OP(!=) 00632 00633 //! @brief Helper macro 00634 //! @ingroup core_hal_intrin_impl 00635 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ 00636 template<typename _Tp, int n> \ 00637 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00638 { \ 00639 typedef _Tp2 rtype; \ 00640 v_reg<rtype, n> c; \ 00641 for( int i = 0; i < n; i++ ) \ 00642 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ 00643 return c; \ 00644 } 00645 00646 /** @brief Add values without saturation 00647 00648 For 8- and 16-bit integer values. */ 00649 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) 00650 00651 /** @brief Subtract values without saturation 00652 00653 For 8- and 16-bit integer values. */ 00654 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) 00655 00656 //! @cond IGNORED 00657 template<typename T> inline T _absdiff(T a, T b) 00658 { 00659 return a > b ? a - b : b - a; 00660 } 00661 //! @endcond 00662 00663 /** @brief Absolute difference 00664 00665 Returns \f$ |a - b| \f$ converted to corresponding unsigned type. 00666 Example: 00667 @code{.cpp} 00668 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1} 00669 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3} 00670 @endcode 00671 For 8-, 16-, 32-bit integer source types. 
*/ 00672 template<typename _Tp, int n> 00673 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b) 00674 { 00675 typedef typename V_TypeTraits<_Tp>::abs_type rtype; 00676 v_reg<rtype, n> c; 00677 const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0; 00678 for( int i = 0; i < n; i++ ) 00679 { 00680 rtype ua = a.s[i] ^ mask; 00681 rtype ub = b.s[i] ^ mask; 00682 c.s[i] = _absdiff(ua, ub); 00683 } 00684 return c; 00685 } 00686 00687 /** @overload 00688 00689 For 32-bit floating point values */ 00690 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) 00691 { 00692 v_float32x4 c; 00693 for( int i = 0; i < c.nlanes; i++ ) 00694 c.s[i] = _absdiff(a.s[i], b.s[i]); 00695 return c; 00696 } 00697 00698 /** @overload 00699 00700 For 64-bit floating point values */ 00701 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) 00702 { 00703 v_float64x2 c; 00704 for( int i = 0; i < c.nlanes; i++ ) 00705 c.s[i] = _absdiff(a.s[i], b.s[i]); 00706 return c; 00707 } 00708 00709 /** @brief Inversed square root 00710 00711 Returns \f$ 1/sqrt(a) \f$ 00712 For floating point types only. */ 00713 template<typename _Tp, int n> 00714 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) 00715 { 00716 v_reg<_Tp, n> c; 00717 for( int i = 0; i < n; i++ ) 00718 c.s[i] = 1.f/std::sqrt(a.s[i]); 00719 return c; 00720 } 00721 00722 /** @brief Magnitude 00723 00724 Returns \f$ sqrt(a^2 + b^2) \f$ 00725 For floating point types only. */ 00726 template<typename _Tp, int n> 00727 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00728 { 00729 v_reg<_Tp, n> c; 00730 for( int i = 0; i < n; i++ ) 00731 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); 00732 return c; 00733 } 00734 00735 /** @brief Square of the magnitude 00736 00737 Returns \f$ a^2 + b^2 \f$ 00738 For floating point types only. 
*/ 00739 template<typename _Tp, int n> 00740 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00741 { 00742 v_reg<_Tp, n> c; 00743 for( int i = 0; i < n; i++ ) 00744 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; 00745 return c; 00746 } 00747 00748 /** @brief Multiply and add 00749 00750 Returns \f$ a*b + c \f$ 00751 For floating point types only. */ 00752 template<typename _Tp, int n> 00753 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 00754 const v_reg<_Tp, n>& c) 00755 { 00756 v_reg<_Tp, n> d; 00757 for( int i = 0; i < n; i++ ) 00758 d.s[i] = a.s[i]*b.s[i] + c.s[i]; 00759 return d; 00760 } 00761 00762 /** @brief Dot product of elements 00763 00764 Multiply values in two registers and sum adjacent result pairs. 00765 Scheme: 00766 @code 00767 {A1 A2 ...} // 16-bit 00768 x {B1 B2 ...} // 16-bit 00769 ------------- 00770 {A1B1+A2B2 ...} // 32-bit 00771 @endcode 00772 Implemented only for 16-bit signed source type (v_int16x8). 00773 */ 00774 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> 00775 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00776 { 00777 typedef typename V_TypeTraits<_Tp>::w_type w_type; 00778 v_reg<w_type, n/2> c; 00779 for( int i = 0; i < (n/2); i++ ) 00780 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; 00781 return c; 00782 } 00783 00784 /** @brief Multiply and expand 00785 00786 Multiply values two registers and store results in two registers with wider pack type. 00787 Scheme: 00788 @code 00789 {A B C D} // 32-bit 00790 x {E F G H} // 32-bit 00791 --------------- 00792 {AE BF} // 64-bit 00793 {CG DH} // 64-bit 00794 @endcode 00795 Example: 00796 @code{.cpp} 00797 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2} 00798 v_uint64x2 c, d; // results 00799 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8} 00800 @endcode 00801 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4). 
00802 */ 00803 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 00804 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c, 00805 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d) 00806 { 00807 typedef typename V_TypeTraits<_Tp>::w_type w_type; 00808 for( int i = 0; i < (n/2); i++ ) 00809 { 00810 c.s[i] = (w_type)a.s[i]*b.s[i]; 00811 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; 00812 } 00813 } 00814 00815 //! @cond IGNORED 00816 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a, 00817 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c) 00818 { 00819 typedef typename V_TypeTraits<_Tp>::w_type w_type; 00820 for( int i = 0; i < (n/2); i++ ) 00821 { 00822 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; 00823 } 00824 } 00825 //! @endcond 00826 00827 //! @brief Helper macro 00828 //! @ingroup core_hal_intrin_impl 00829 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ 00830 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ 00831 { \ 00832 v_reg<_Tp, n> c; \ 00833 for( int i = 0; i < n; i++ ) \ 00834 c.s[i] = (_Tp)(a.s[i] shift_op imm); \ 00835 return c; \ 00836 } 00837 00838 /** @brief Bitwise shift left 00839 00840 For 16-, 32- and 64-bit integer values. */ 00841 OPENCV_HAL_IMPL_SHIFT_OP(<<) 00842 00843 /** @brief Bitwise shift right 00844 00845 For 16-, 32- and 64-bit integer values. 
*/ 00846 OPENCV_HAL_IMPL_SHIFT_OP(>>) 00847 00848 /** @brief Sum packed values 00849 00850 Scheme: 00851 @code 00852 {A1 A2 A3 ...} => sum{A1,A2,A3,...} 00853 @endcode 00854 For 32-bit integer and 32-bit floating point types.*/ 00855 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) 00856 { 00857 typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; 00858 for( int i = 1; i < n; i++ ) 00859 c += a.s[i]; 00860 return c; 00861 } 00862 00863 /** @brief Get negative values mask 00864 00865 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. 00866 Example: 00867 @code{.cpp} 00868 v_int32x4 r; // set to {-1, -1, 1, 1} 00869 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011 00870 @endcode 00871 For all types except 64-bit. */ 00872 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a) 00873 { 00874 int mask = 0; 00875 for( int i = 0; i < n; i++ ) 00876 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; 00877 return mask; 00878 } 00879 00880 /** @brief Check if all packed values are less than zero 00881 00882 Unsigned values will be casted to signed: `uchar 254 => char -2`. 00883 For all types except 64-bit. */ 00884 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a) 00885 { 00886 for( int i = 0; i < n; i++ ) 00887 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) 00888 return false; 00889 return true; 00890 } 00891 00892 /** @brief Check if any of packed values is less than zero 00893 00894 Unsigned values will be casted to signed: `uchar 254 => char -2`. 00895 For all types except 64-bit. 
*/ 00896 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a) 00897 { 00898 for( int i = 0; i < n; i++ ) 00899 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) 00900 return true; 00901 return false; 00902 } 00903 00904 /** @brief Bitwise select 00905 00906 Return value will be built by combining values a and b using the following scheme: 00907 If the i-th bit in _mask_ is 1 00908 select i-th bit from _a_ 00909 else 00910 select i-th bit from _b_ */ 00911 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, 00912 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00913 { 00914 typedef V_TypeTraits<_Tp> Traits; 00915 typedef typename Traits::int_type int_type; 00916 v_reg<_Tp, n> c; 00917 for( int i = 0; i < n; i++ ) 00918 { 00919 int_type m = Traits::reinterpret_int(mask.s[i]); 00920 c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m) 00921 | (Traits::reinterpret_int(b.s[i]) & ~m)); 00922 } 00923 return c; 00924 } 00925 00926 /** @brief Expand values to the wider pack type 00927 00928 Copy contents of register to two registers with 2x wider pack type. 00929 Scheme: 00930 @code 00931 int32x4 int64x2 int64x2 00932 {A B C D} ==> {A B} , {C D} 00933 @endcode */ 00934 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a, 00935 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0, 00936 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1) 00937 { 00938 for( int i = 0; i < (n/2); i++ ) 00939 { 00940 b0.s[i] = a.s[i]; 00941 b1.s[i] = a.s[i+(n/2)]; 00942 } 00943 } 00944 00945 //! 
@cond IGNORED 00946 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n> 00947 v_reinterpret_as_int(const v_reg<_Tp, n>& a) 00948 { 00949 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c; 00950 for( int i = 0; i < n; i++ ) 00951 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); 00952 return c; 00953 } 00954 00955 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n> 00956 v_reinterpret_as_uint(const v_reg<_Tp, n>& a) 00957 { 00958 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c; 00959 for( int i = 0; i < n; i++ ) 00960 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); 00961 return c; 00962 } 00963 //! @endcond 00964 00965 /** @brief Interleave two vectors 00966 00967 Scheme: 00968 @code 00969 {A1 A2 A3 A4} 00970 {B1 B2 B3 B4} 00971 --------------- 00972 {A1 B1 A2 B2} and {A3 B3 A4 B4} 00973 @endcode 00974 For all types except 64-bit. 00975 */ 00976 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, 00977 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) 00978 { 00979 int i; 00980 for( i = 0; i < n/2; i++ ) 00981 { 00982 b0.s[i*2] = a0.s[i]; 00983 b0.s[i*2+1] = a1.s[i]; 00984 } 00985 for( ; i < n; i++ ) 00986 { 00987 b1.s[i*2-n] = a0.s[i]; 00988 b1.s[i*2-n+1] = a1.s[i]; 00989 } 00990 } 00991 00992 /** @brief Load register contents from memory 00993 00994 @param ptr pointer to memory block with data 00995 @return register object 00996 00997 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc. 
00998 */ 00999 template<typename _Tp> 01000 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) 01001 { 01002 return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); 01003 } 01004 01005 /** @brief Load register contents from memory (aligned) 01006 01007 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary) 01008 */ 01009 template<typename _Tp> 01010 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr) 01011 { 01012 return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); 01013 } 01014 01015 /** @brief Load register contents from two memory blocks 01016 01017 @param loptr memory block containing data for first half (0..n/2) 01018 @param hiptr memory block containing data for second half (n/2..n) 01019 01020 @code{.cpp} 01021 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 }; 01022 v_int32x4 r = v_load_halves(lo, hi); 01023 @endcode 01024 */ 01025 template<typename _Tp> 01026 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr) 01027 { 01028 v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; 01029 for( int i = 0; i < c.nlanes/2; i++ ) 01030 { 01031 c.s[i] = loptr[i]; 01032 c.s[i+c.nlanes/2] = hiptr[i]; 01033 } 01034 return c; 01035 } 01036 01037 /** @brief Load register contents from memory with double expand 01038 01039 Same as cv::v_load, but result pack type will be 2x wider than memory type. 01040 01041 @code{.cpp} 01042 short buf[4] = {1, 2, 3, 4}; // type is int16 01043 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32 01044 @endcode 01045 For 8-, 16-, 32-bit integer source types. 
*/ 01046 template<typename _Tp> 01047 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2> 01048 v_load_expand(const _Tp* ptr) 01049 { 01050 typedef typename V_TypeTraits<_Tp>::w_type w_type; 01051 v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c; 01052 for( int i = 0; i < c.nlanes; i++ ) 01053 { 01054 c.s[i] = ptr[i]; 01055 } 01056 return c; 01057 } 01058 01059 /** @brief Load register contents from memory with quad expand 01060 01061 Same as cv::v_load_expand, but result type is 4 times wider than source. 01062 @code{.cpp} 01063 char buf[4] = {1, 2, 3, 4}; // type is int8 01064 v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32 01065 @endcode 01066 For 8-bit integer source types. */ 01067 template<typename _Tp> 01068 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4> 01069 v_load_expand_q(const _Tp* ptr) 01070 { 01071 typedef typename V_TypeTraits<_Tp>::q_type q_type; 01072 v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c; 01073 for( int i = 0; i < c.nlanes; i++ ) 01074 { 01075 c.s[i] = ptr[i]; 01076 } 01077 return c; 01078 } 01079 01080 /** @brief Load and deinterleave (2 channels) 01081 01082 Load data from memory deinterleave and store to 2 registers. 01083 Scheme: 01084 @code 01085 {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...} 01086 @endcode 01087 For all types except 64-bit. */ 01088 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 01089 v_reg<_Tp, n>& b) 01090 { 01091 int i, i2; 01092 for( i = i2 = 0; i < n; i++, i2 += 2 ) 01093 { 01094 a.s[i] = ptr[i2]; 01095 b.s[i] = ptr[i2+1]; 01096 } 01097 } 01098 01099 /** @brief Load and deinterleave (3 channels) 01100 01101 Load data from memory deinterleave and store to 3 registers. 01102 Scheme: 01103 @code 01104 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} 01105 @endcode 01106 For all types except 64-bit. 
*/ 01107 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 01108 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) 01109 { 01110 int i, i3; 01111 for( i = i3 = 0; i < n; i++, i3 += 3 ) 01112 { 01113 a.s[i] = ptr[i3]; 01114 b.s[i] = ptr[i3+1]; 01115 c.s[i] = ptr[i3+2]; 01116 } 01117 } 01118 01119 /** @brief Load and deinterleave (4 channels) 01120 01121 Load data from memory deinterleave and store to 4 registers. 01122 Scheme: 01123 @code 01124 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} 01125 @endcode 01126 For all types except 64-bit. */ 01127 template<typename _Tp, int n> 01128 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 01129 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, 01130 v_reg<_Tp, n>& d) 01131 { 01132 int i, i4; 01133 for( i = i4 = 0; i < n; i++, i4 += 4 ) 01134 { 01135 a.s[i] = ptr[i4]; 01136 b.s[i] = ptr[i4+1]; 01137 c.s[i] = ptr[i4+2]; 01138 d.s[i] = ptr[i4+3]; 01139 } 01140 } 01141 01142 /** @brief Interleave and store (2 channels) 01143 01144 Interleave and store data from 2 registers to memory. 01145 Scheme: 01146 @code 01147 {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...} 01148 @endcode 01149 For all types except 64-bit. */ 01150 template<typename _Tp, int n> 01151 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 01152 const v_reg<_Tp, n>& b) 01153 { 01154 int i, i2; 01155 for( i = i2 = 0; i < n; i++, i2 += 2 ) 01156 { 01157 ptr[i2] = a.s[i]; 01158 ptr[i2+1] = b.s[i]; 01159 } 01160 } 01161 01162 /** @brief Interleave and store (3 channels) 01163 01164 Interleave and store data from 3 registers to memory. 01165 Scheme: 01166 @code 01167 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...} 01168 @endcode 01169 For all types except 64-bit. 
*/ 01170 template<typename _Tp, int n> 01171 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 01172 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) 01173 { 01174 int i, i3; 01175 for( i = i3 = 0; i < n; i++, i3 += 3 ) 01176 { 01177 ptr[i3] = a.s[i]; 01178 ptr[i3+1] = b.s[i]; 01179 ptr[i3+2] = c.s[i]; 01180 } 01181 } 01182 01183 /** @brief Interleave and store (4 channels) 01184 01185 Interleave and store data from 4 registers to memory. 01186 Scheme: 01187 @code 01188 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...} 01189 @endcode 01190 For all types except 64-bit. */ 01191 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 01192 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, 01193 const v_reg<_Tp, n>& d) 01194 { 01195 int i, i4; 01196 for( i = i4 = 0; i < n; i++, i4 += 4 ) 01197 { 01198 ptr[i4] = a.s[i]; 01199 ptr[i4+1] = b.s[i]; 01200 ptr[i4+2] = c.s[i]; 01201 ptr[i4+3] = d.s[i]; 01202 } 01203 } 01204 01205 /** @brief Store data to memory 01206 01207 Store register contents to memory. 01208 Scheme: 01209 @code 01210 REG {A B C D} ==> MEM {A B C D} 01211 @endcode 01212 Pointer can be unaligned. */ 01213 template<typename _Tp, int n> 01214 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) 01215 { 01216 for( int i = 0; i < n; i++ ) 01217 ptr[i] = a.s[i]; 01218 } 01219 01220 /** @brief Store data to memory (lower half) 01221 01222 Store lower half of register contents to memory. 01223 Scheme: 01224 @code 01225 REG {A B C D} ==> MEM {A B} 01226 @endcode */ 01227 template<typename _Tp, int n> 01228 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) 01229 { 01230 for( int i = 0; i < (n/2); i++ ) 01231 ptr[i] = a.s[i]; 01232 } 01233 01234 /** @brief Store data to memory (higher half) 01235 01236 Store higher half of register contents to memory. 
01237 Scheme: 01238 @code 01239 REG {A B C D} ==> MEM {C D} 01240 @endcode */ 01241 template<typename _Tp, int n> 01242 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) 01243 { 01244 for( int i = 0; i < (n/2); i++ ) 01245 ptr[i] = a.s[i+(n/2)]; 01246 } 01247 01248 /** @brief Store data to memory (aligned) 01249 01250 Store register contents to memory. 01251 Scheme: 01252 @code 01253 REG {A B C D} ==> MEM {A B C D} 01254 @endcode 01255 Pointer __should__ be aligned by 16-byte boundary. */ 01256 template<typename _Tp, int n> 01257 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) 01258 { 01259 for( int i = 0; i < n; i++ ) 01260 ptr[i] = a.s[i]; 01261 } 01262 01263 /** @brief Combine vector from first elements of two vectors 01264 01265 Scheme: 01266 @code 01267 {A1 A2 A3 A4} 01268 {B1 B2 B3 B4} 01269 --------------- 01270 {A1 A2 B1 B2} 01271 @endcode 01272 For all types except 64-bit. */ 01273 template<typename _Tp, int n> 01274 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 01275 { 01276 v_reg<_Tp, n> c; 01277 for( int i = 0; i < (n/2); i++ ) 01278 { 01279 c.s[i] = a.s[i]; 01280 c.s[i+(n/2)] = b.s[i]; 01281 } 01282 return c; 01283 } 01284 01285 /** @brief Combine vector from last elements of two vectors 01286 01287 Scheme: 01288 @code 01289 {A1 A2 A3 A4} 01290 {B1 B2 B3 B4} 01291 --------------- 01292 {A3 A4 B3 B4} 01293 @endcode 01294 For all types except 64-bit. 
*/ 01295 template<typename _Tp, int n> 01296 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 01297 { 01298 v_reg<_Tp, n> c; 01299 for( int i = 0; i < (n/2); i++ ) 01300 { 01301 c.s[i] = a.s[i+(n/2)]; 01302 c.s[i+(n/2)] = b.s[i+(n/2)]; 01303 } 01304 return c; 01305 } 01306 01307 /** @brief Combine two vectors from lower and higher parts of two other vectors 01308 01309 @code{.cpp} 01310 low = cv::v_combine_low(a, b); 01311 high = cv::v_combine_high(a, b); 01312 @endcode */ 01313 template<typename _Tp, int n> 01314 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 01315 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) 01316 { 01317 for( int i = 0; i < (n/2); i++ ) 01318 { 01319 low.s[i] = a.s[i]; 01320 low.s[i+(n/2)] = b.s[i]; 01321 high.s[i] = a.s[i+(n/2)]; 01322 high.s[i+(n/2)] = b.s[i+(n/2)]; 01323 } 01324 } 01325 01326 /** @brief Vector extract 01327 01328 Scheme: 01329 @code 01330 {A1 A2 A3 A4} 01331 {B1 B2 B3 B4} 01332 ======================== 01333 shift = 1 {A2 A3 A4 B1} 01334 shift = 2 {A3 A4 B1 B2} 01335 shift = 3 {A4 B1 B2 B3} 01336 @endcode 01337 Restriction: 0 <= shift < nlanes 01338 01339 Usage: 01340 @code 01341 v_int32x4 a, b, c; 01342 c = v_extract<2>(a, b); 01343 @endcode 01344 For integer types only. */ 01345 template<int s, typename _Tp, int n> 01346 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 01347 { 01348 v_reg<_Tp, n> r; 01349 const int shift = n - s; 01350 int i = 0; 01351 for (; i < shift; ++i) 01352 r.s[i] = a.s[i+s]; 01353 for (; i < n; ++i) 01354 r.s[i] = b.s[i-shift]; 01355 return r; 01356 } 01357 01358 /** @brief Round 01359 01360 Rounds each value. 
Input type is float vector ==> output type is int vector.*/ 01361 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a) 01362 { 01363 v_reg<int, n> c; 01364 for( int i = 0; i < n; i++ ) 01365 c.s[i] = cvRound(a.s[i]); 01366 return c; 01367 } 01368 01369 /** @brief Floor 01370 01371 Floor each value. Input type is float vector ==> output type is int vector.*/ 01372 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a) 01373 { 01374 v_reg<int, n> c; 01375 for( int i = 0; i < n; i++ ) 01376 c.s[i] = cvFloor(a.s[i]); 01377 return c; 01378 } 01379 01380 /** @brief Ceil 01381 01382 Ceil each value. Input type is float vector ==> output type is int vector.*/ 01383 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a) 01384 { 01385 v_reg<int, n> c; 01386 for( int i = 0; i < n; i++ ) 01387 c.s[i] = cvCeil(a.s[i]); 01388 return c; 01389 } 01390 01391 /** @brief Trunc 01392 01393 Truncate each value. Input type is float vector ==> output type is int vector.*/ 01394 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a) 01395 { 01396 v_reg<int, n> c; 01397 for( int i = 0; i < n; i++ ) 01398 c.s[i] = (int)(a.s[i]); 01399 return c; 01400 } 01401 01402 /** @overload */ 01403 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a) 01404 { 01405 v_reg<int, n*2> c; 01406 for( int i = 0; i < n; i++ ) 01407 { 01408 c.s[i] = cvRound(a.s[i]); 01409 c.s[i+n] = 0; 01410 } 01411 return c; 01412 } 01413 01414 /** @overload */ 01415 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a) 01416 { 01417 v_reg<int, n> c; 01418 for( int i = 0; i < n; i++ ) 01419 { 01420 c.s[i] = cvFloor(a.s[i]); 01421 c.s[i+n] = 0; 01422 } 01423 return c; 01424 } 01425 01426 /** @overload */ 01427 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a) 01428 { 01429 v_reg<int, n> c; 01430 for( int i = 0; i < n; i++ ) 01431 { 01432 c.s[i] = cvCeil(a.s[i]); 01433 c.s[i+n] = 0; 01434 } 01435 return 
c; 01436 } 01437 01438 /** @overload */ 01439 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a) 01440 { 01441 v_reg<int, n> c; 01442 for( int i = 0; i < n; i++ ) 01443 { 01444 c.s[i] = cvCeil(a.s[i]); 01445 c.s[i+n] = 0; 01446 } 01447 return c; 01448 } 01449 01450 /** @brief Convert to float 01451 01452 Supported input type is cv::v_int32x4. */ 01453 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a) 01454 { 01455 v_reg<float, n> c; 01456 for( int i = 0; i < n; i++ ) 01457 c.s[i] = (float)a.s[i]; 01458 return c; 01459 } 01460 01461 /** @brief Convert to double 01462 01463 Supported input type is cv::v_int32x4. */ 01464 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a) 01465 { 01466 v_reg<double, n> c; 01467 for( int i = 0; i < n; i++ ) 01468 c.s[i] = (double)a.s[i]; 01469 return c; 01470 } 01471 01472 /** @brief Convert to double 01473 01474 Supported input type is cv::v_float32x4. */ 01475 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a) 01476 { 01477 v_reg<double, n> c; 01478 for( int i = 0; i < n; i++ ) 01479 c.s[i] = (double)a.s[i]; 01480 return c; 01481 } 01482 01483 /** @brief Transpose 4x4 matrix 01484 01485 Scheme: 01486 @code 01487 a0 {A1 A2 A3 A4} 01488 a1 {B1 B2 B3 B4} 01489 a2 {C1 C2 C3 C4} 01490 a3 {D1 D2 D3 D4} 01491 =============== 01492 b0 {A1 B1 C1 D1} 01493 b1 {A2 B2 C2 D2} 01494 b2 {A3 B3 C3 D3} 01495 b3 {A4 B4 C4 D4} 01496 @endcode 01497 */ 01498 template<typename _Tp> 01499 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, 01500 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, 01501 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, 01502 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) 01503 { 01504 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); 01505 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); 01506 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); 01507 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); 01508 } 

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_setzero_<suffix>()`, which returns `_Tpvec::zero()`.
//! The `_Tp` argument is not used in the expansion.
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }

//! @name Init with zero
//! @{
//! @brief Create new vector with zero elements
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_setall_<suffix>(_Tp val)`, which returns `_Tpvec::all(val)`.
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }

//! @name Init with value
//! @{
//! @brief Create new vector with elements set to a specific value
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
//! @}

//! @brief Helper macro
//!
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_reinterpret_as_<suffix>(a)` for any `v_reg<_Tp0, n0>` argument. The result is
//! whatever `a.reinterpret_as<_Tp, _Tpvec::nlanes>()` returns, converted to `_Tpvec`.
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
template<typename _Tp0, int n0> inline _Tpvec \
    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }

//! @name Reinterpret
//! @{
//! @brief Convert vector to different type without modifying underlying data.
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_shl<n>(a)` for `_Tpvec`, which returns `a << n` with the shift amount given as a
//! template argument. The `_Tp` argument is not used in the expansion.
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return a << n; }

//! @name Left shift
//! @{
//! @brief Shift left
OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_shr<n>(a)` for `_Tpvec`, which returns `a >> n` with the shift amount given as a
//! template argument. The `_Tp` argument is not used in the expansion.
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return a >> n; }

//! @name Right shift
//! @{
//!
//! @brief Shift right
OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_rshr<n>(a)` for `_Tpvec`. For every lane it adds `1 << (n - 1)` to the value,
//! shifts the sum right by `n` bits and casts the result back to `_Tp`.
//! NOTE(review): the expression uses `n - 1` as a shift count, so n is presumably expected to
//! be at least 1 - confirm against callers.
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ \
    _Tpvec c; \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
    return c; \
}

//! @name Rounding shift
//! @{
//! @brief Rounding shift right
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_<pack_suffix>(a, b)`, which takes two `_Tpvec` vectors and returns a `_Tpnvec`.
//! The lanes of `a` fill result lanes [0, _Tpvec::nlanes) and the lanes of `b` fill the
//! following _Tpvec::nlanes lanes; every value is converted with `saturate_cast<_Tpn>`.
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpnvec c; \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    { \
        c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
        c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
    } \
    return c; \
}

//! @name Pack
//! @{
//! @brief Pack values from two vectors to one
//!
//! The returned vector type has twice as many elements as the input vector types. The variant
//! with the _u_ suffix also converts to the corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//!
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_rshr_<pack_suffix><n>(a, b)`, which takes two `_Tpvec` vectors and returns a
//! `_Tpnvec`. Each input value has `1 << (n - 1)` added, is shifted right by `n` bits and is
//! then converted with `saturate_cast<_Tpn>`. The lanes of `a` fill result lanes
//! [0, _Tpvec::nlanes) and the lanes of `b` fill the following _Tpvec::nlanes lanes.
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpnvec c; \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    { \
        c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
        c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
    } \
    return c; \
}

//! @name Pack with rounding shift
//! @{
//! @brief Pack values from two vectors to one with rounding shift
//!
//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//!
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_<pack_suffix>_store(ptr, a)`, which writes _Tpvec::nlanes elements to `ptr`,
//! converting every lane of `a` with `saturate_cast<_Tpn>`. The `_Tp` and `_Tpnvec` arguments
//! are not used in the expansion.
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
        ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
}

//! @name Pack and store
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be stored into memory with saturating conversion to narrower type.
//! Variant with _u_ suffix converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//!
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
//! @}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
//!
//! Defines `v_rshr_<pack_suffix>_store<n>(ptr, a)`, which writes _Tpvec::nlanes elements to
//! `ptr`. Each lane of `a` has `1 << (n - 1)` added, is shifted right by `n` bits and is then
//! converted with `saturate_cast<_Tpn>`. The `_Tpnvec` argument is not used in the expansion.
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
        ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
}

//! @name Pack and store with rounding shift
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
//! memory. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//!
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
//! @}

/** @brief Matrix multiplication

Computes the linear combination v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, lane by lane.
Scheme (m0 = {A0 A1 A2 A3}, m1 = {B0 ...}, m2 = {C0 ...}, m3 = {D0 ...}, v = {V0 V1 V2 V3}):
@code
{A0 A1 A2 A3}   |V0|
{B0 B1 B2 B3}   |V1|
{C0 C1 C2 C3}   |V2|
{D0 D1 D2 D3} x |V3|
====================
{R0 R1 R2 R3}, where:
R0 = A0V0 + B0V1 + C0V2 + D0V3,
R1 = A1V0 + B1V1 + C1V2 + D1V3
...
@endcode
*/
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    // Result lane j is the sum over i of v.s[i] * m<i>.s[j].
    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
}

//! @}

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
//!
//! This implementation always returns false.
static inline bool hasSIMD128()
{
    return false;
}

//! @}


// Closes a scope opened earlier in the file (outside this excerpt).
}

// Closes a preprocessor conditional opened earlier in the file (outside this excerpt).
#endif
Generated on Tue Jul 12 2022 18:20:17 by
