Renesas / opencv-lib

Dependents:   RZ_A2M_Mbed_samples

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers intrin_cpp.hpp Source File

intrin_cpp.hpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                          License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
00016 // Copyright (C) 2015, Itseez Inc., all rights reserved.
00017 // Third party copyrights are property of their respective owners.
00018 //
00019 // Redistribution and use in source and binary forms, with or without modification,
00020 // are permitted provided that the following conditions are met:
00021 //
00022 //   * Redistribution's of source code must retain the above copyright notice,
00023 //     this list of conditions and the following disclaimer.
00024 //
00025 //   * Redistribution's in binary form must reproduce the above copyright notice,
00026 //     this list of conditions and the following disclaimer in the documentation
00027 //     and/or other materials provided with the distribution.
00028 //
00029 //   * The name of the copyright holders may not be used to endorse or promote products
00030 //     derived from this software without specific prior written permission.
00031 //
00032 // This software is provided by the copyright holders and contributors "as is" and
00033 // any express or implied warranties, including, but not limited to, the implied
00034 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00035 // In no event shall the Intel Corporation or contributors be liable for any direct,
00036 // indirect, incidental, special, exemplary, or consequential damages
00037 // (including, but not limited to, procurement of substitute goods or services;
00038 // loss of use, data, or profits; or business interruption) however caused
00039 // and on any theory of liability, whether in contract, strict liability,
00040 // or tort (including negligence or otherwise) arising in any way out of
00041 // the use of this software, even if advised of the possibility of such damage.
00042 //
00043 //M*/
00044 
00045 #ifndef OPENCV_HAL_INTRIN_CPP_HPP
00046 #define OPENCV_HAL_INTRIN_CPP_HPP
00047 
00048 #include <limits>
00049 #include <cstring>
00050 #include <algorithm>
00051 #include "opencv2/core/saturate.hpp"
00052 
00053 namespace cv
00054 {
00055 
00056 /** @addtogroup core_hal_intrin
00057 
00058 "Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
00059 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
00060 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
00061 containing packed values of different types. In case when there is no SIMD extension available
00062 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
00063 expected although it could be slower.
00064 
00065 ### Types
00066 
00067 There are several types representing 128-bit register as a vector of packed values, each type is
00068 implemented as a structure based on a one SIMD register.
00069 
00070 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
00071 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
00072 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
00073 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
00074 - cv::v_float32x4: four 32-bit floating point values (signed) - float
00075 - cv::v_float64x2: two 64-bit floating point values (signed) - double
00076 
00077 @note
00078 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
00079 check the CV_SIMD128_64F preprocessor definition:
00080 @code
00081 #if CV_SIMD128_64F
00082 //...
00083 #endif
00084 @endcode
00085 
00086 ### Load and store operations
00087 
00088 These operations allow to set contents of the register explicitly or by loading it from some memory
00089 block and to save contents of the register to memory block.
00090 
00091 - Constructors:
00092 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
00093 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
00094 - Other create methods:
00095 @ref v_setall_s8, @ref v_setall_u8, ...,
00096 @ref v_setzero_u8, @ref v_setzero_s8, ...
00097 - Memory operations:
00098 @ref v_load, @ref v_load_aligned, @ref v_load_halves,
00099 @ref v_store, @ref v_store_aligned,
00100 @ref v_store_high, @ref v_store_low
00101 
00102 ### Value reordering
00103 
00104 These operations allow to reorder or recombine elements in one or multiple vectors.
00105 
00106 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
00107 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
00108 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
00109 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
00110 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
00111 - Extract: @ref v_extract
00112 
00113 
00114 ### Arithmetic, bitwise and comparison operations
00115 
00116 Element-wise binary and unary operations.
00117 
00118 - Arithmetics:
00119 @ref operator +(const v_reg &a, const v_reg &b) "+",
00120 @ref operator -(const v_reg &a, const v_reg &b) "-",
00121 @ref operator *(const v_reg &a, const v_reg &b) "*",
00122 @ref operator /(const v_reg &a, const v_reg &b) "/",
00123 @ref v_mul_expand
00124 
00125 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
00126 
00127 - Bitwise shifts:
00128 @ref operator <<(const v_reg &a, int s) "<<",
00129 @ref operator >>(const v_reg &a, int s) ">>",
00130 @ref v_shl, @ref v_shr
00131 
00132 - Bitwise logic:
00133 @ref operator&(const v_reg &a, const v_reg &b) "&",
00134 @ref operator |(const v_reg &a, const v_reg &b) "|",
00135 @ref operator ^(const v_reg &a, const v_reg &b) "^",
00136 @ref operator ~(const v_reg &a) "~"
00137 
00138 - Comparison:
00139 @ref operator >(const v_reg &a, const v_reg &b) ">",
00140 @ref operator >=(const v_reg &a, const v_reg &b) ">=",
00141 @ref operator <(const v_reg &a, const v_reg &b) "<",
00142 @ref operator <=(const v_reg &a, const v_reg &b) "<=",
00143 @ref operator==(const v_reg &a, const v_reg &b) "==",
00144 @ref operator !=(const v_reg &a, const v_reg &b) "!="
00145 
00146 - min/max: @ref v_min, @ref v_max
00147 
00148 ### Reduce and mask
00149 
00150 Most of these operations return only one value.
00151 
00152 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
00153 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
00154 
00155 ### Other math
00156 
00157 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
00158 - Absolute values: @ref v_abs, @ref v_absdiff
00159 
00160 ### Conversions
00161 
00162 Different type conversions and casts:
00163 
00164 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
00165 - To float: @ref v_cvt_f32, @ref v_cvt_f64
00166 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
00167 
00168 ### Matrix operations
00169 
00170 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
00171 
00172 ### Usability
00173 
00174 Most operations are implemented only for some subset of the available types; the following tables
00175 show the applicability of different operations to the types.
00176 
00177 Regular integers:
00178 
00179 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
00180 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
00181 |load, store        | x | x | x | x | x | x |
00182 |interleave         | x | x | x | x | x | x |
00183 |expand             | x | x | x | x | x | x |
00184 |expand_q           | x | x |   |   |   |   |
00185 |add, sub           | x | x | x | x | x | x |
00186 |add_wrap, sub_wrap | x | x | x | x |   |   |
00187 |mul                |   |   | x | x | x | x |
00188 |mul_expand         |   |   | x | x | x |   |
00189 |compare            | x | x | x | x | x | x |
00190 |shift              |   |   | x | x | x | x |
00191 |dotprod            |   |   |   | x |   |   |
00192 |logical            | x | x | x | x | x | x |
00193 |min, max           | x | x | x | x | x | x |
00194 |absdiff            | x | x | x | x | x | x |
00195 |reduce             |   |   |   |   | x | x |
00196 |mask               | x | x | x | x | x | x |
00197 |pack               | x | x | x | x | x | x |
00198 |pack_u             | x |   | x |   |   |   |
00199 |unpack             | x | x | x | x | x | x |
00200 |extract            | x | x | x | x | x | x |
00201 |cvt_flt32          |   |   |   |   |   | x |
00202 |cvt_flt64          |   |   |   |   |   | x |
00203 |transpose4x4       |   |   |   |   | x | x |
00204 
00205 Big integers:
00206 
00207 | Operations\\Types | uint 64x2 | int 64x2 |
00208 |-------------------|:-:|:-:|
00209 |load, store        | x | x |
00210 |add, sub           | x | x |
00211 |shift              | x | x |
00212 |logical            | x | x |
00213 |extract            | x | x |
00214 
00215 Floating point:
00216 
00217 | Operations\\Types | float 32x4 | float 64x2 |
00218 |-------------------|:-:|:-:|
00219 |load, store        | x | x |
00220 |interleave         | x |   |
00221 |add, sub           | x | x |
00222 |mul                | x | x |
00223 |div                | x | x |
00224 |compare            | x | x |
00225 |min, max           | x | x |
00226 |absdiff            | x | x |
00227 |reduce             | x |   |
00228 |mask               | x | x |
00229 |unpack             | x | x |
00230 |cvt_flt32          |   | x |
00231 |cvt_flt64          | x |   |
00232 |sqrt, abs          | x | x |
00233 |float math         | x | x |
00234 |transpose4x4       | x |   |
00235 
00236 
00237  @{ */
00238 
00239 template<typename _Tp, int n> struct v_reg
00240 {
00241 //! @cond IGNORED
00242     typedef _Tp lane_type;
00243     typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
00244     typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
00245     enum { nlanes = n };
00246 // !@endcond
00247 
00248     /** @brief Constructor
00249 
00250     Initializes register with data from memory
00251     @param ptr pointer to memory block with data for register */
00252     explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
00253 
00254     /** @brief Constructor
00255 
00256     Initializes register with two 64-bit values */
00257     v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
00258 
00259     /** @brief Constructor
00260 
00261     Initializes register with four 32-bit values */
00262     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
00263 
00264     /** @brief Constructor
00265 
00266     Initializes register with eight 16-bit values */
00267     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
00268            _Tp s4, _Tp s5, _Tp s6, _Tp s7)
00269     {
00270         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
00271         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
00272     }
00273 
00274     /** @brief Constructor
00275 
00276     Initializes register with sixteen 8-bit values */
00277     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
00278            _Tp s4, _Tp s5, _Tp s6, _Tp s7,
00279            _Tp s8, _Tp s9, _Tp s10, _Tp s11,
00280            _Tp s12, _Tp s13, _Tp s14, _Tp s15)
00281     {
00282         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
00283         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
00284         s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
00285         s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
00286     }
00287 
00288     /** @brief Default constructor
00289 
00290     Does not initialize anything*/
00291     v_reg() {}
00292 
00293     /** @brief Copy constructor */
00294     v_reg(const v_reg<_Tp, n> & r)
00295     {
00296         for( int i = 0; i < n; i++ )
00297             s[i] = r.s[i];
00298     }
00299     /** @brief Access first value
00300 
00301     Returns value of the first lane according to register type, for example:
00302     @code{.cpp}
00303     v_int32x4 r(1, 2, 3, 4);
00304     int v = r.get0(); // returns 1
00305     v_uint64x2 r(1, 2);
00306     uint64_t v = r.get0(); // returns 1
00307     @endcode
00308     */
00309     _Tp get0() const { return s[0]; }
00310 
00311 //! @cond IGNORED
00312     _Tp get(const int i) const { return s[i]; }
00313     v_reg<_Tp, n> high() const
00314     {
00315         v_reg<_Tp, n> c;
00316         int i;
00317         for( i = 0; i < n/2; i++ )
00318         {
00319             c.s[i] = s[i+(n/2)];
00320             c.s[i+(n/2)] = 0;
00321         }
00322         return c;
00323     }
00324 
00325     static v_reg<_Tp, n> zero()
00326     {
00327         v_reg<_Tp, n> c;
00328         for( int i = 0; i < n; i++ )
00329             c.s[i] = (_Tp)0;
00330         return c;
00331     }
00332 
00333     static v_reg<_Tp, n> all(_Tp s)
00334     {
00335         v_reg<_Tp, n> c;
00336         for( int i = 0; i < n; i++ )
00337             c.s[i] = s;
00338         return c;
00339     }
00340 
00341     template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
00342     {
00343         size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
00344         v_reg<_Tp2, n2> c;
00345         std::memcpy(&c.s[0], &s[0], bytes);
00346         return c;
00347     }
00348 
00349     _Tp s[n];
00350 //! @endcond
00351 };
00352 
00353 /** @brief Sixteen 8-bit unsigned integer values */
00354 typedef v_reg<uchar, 16> v_uint8x16;
00355 /** @brief Sixteen 8-bit signed integer values */
00356 typedef v_reg<schar, 16> v_int8x16;
00357 /** @brief Eight 16-bit unsigned integer values */
00358 typedef v_reg<ushort, 8> v_uint16x8;
00359 /** @brief Eight 16-bit signed integer values */
00360 typedef v_reg<short, 8> v_int16x8;
00361 /** @brief Four 32-bit unsigned integer values */
00362 typedef v_reg<unsigned, 4> v_uint32x4;
00363 /** @brief Four 32-bit signed integer values */
00364 typedef v_reg<int, 4> v_int32x4;
00365 /** @brief Four 32-bit floating point values (single precision) */
00366 typedef v_reg<float, 4> v_float32x4;
00367 /** @brief Two 64-bit floating point values (double precision) */
00368 typedef v_reg<double, 2> v_float64x2;
00369 /** @brief Two 64-bit unsigned integer values */
00370 typedef v_reg<uint64, 2> v_uint64x2;
00371 /** @brief Two 64-bit signed integer values */
00372 typedef v_reg<int64, 2> v_int64x2;
00373 
00374 //! @brief Helper macro
00375 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
/* out-of-place form: builds and returns a new register */ \
template<typename _Tp, int n> inline v_reg<_Tp, n> \
    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = saturate_cast<_Tp>(a.s[lane] bin_op b.s[lane]); \
    } \
    return res; \
} \
/* compound-assignment form: updates a in place and returns it */ \
template<typename _Tp, int n> inline v_reg<_Tp, n>& \
    operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        a.s[lane] = saturate_cast<_Tp>(a.s[lane] bin_op b.s[lane]); \
    } \
    return a; \
}
00392 
00393 /** @brief Add values
00394 
00395 For all types. */
00396 OPENCV_HAL_IMPL_BIN_OP(+)
00397 
00398 /** @brief Subtract values
00399 
00400 For all types. */
00401 OPENCV_HAL_IMPL_BIN_OP(-)
00402 
00403 /** @brief Multiply values
00404 
00405 For 16- and 32-bit integer types and floating types. */
00406 OPENCV_HAL_IMPL_BIN_OP(*)
00407 
00408 /** @brief Divide values
00409 
00410 For floating types only. */
00411 OPENCV_HAL_IMPL_BIN_OP(/)
00412 
00413 //! @brief Helper macro
00414 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
/* out-of-place form: operates on the integer reinterpretation of each lane */ \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef V_TypeTraits<_Tp> Traits; \
    typedef typename Traits::int_type itype; \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = Traits::reinterpret_from_int((itype)(Traits::reinterpret_int(a.s[lane]) bit_op \
                                                           Traits::reinterpret_int(b.s[lane]))); \
    } \
    return res; \
} \
/* compound-assignment form: updates a in place and returns it */ \
template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
    bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef V_TypeTraits<_Tp> Traits; \
    typedef typename Traits::int_type itype; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        a.s[lane] = Traits::reinterpret_from_int((itype)(Traits::reinterpret_int(a.s[lane]) bit_op \
                                                         Traits::reinterpret_int(b.s[lane]))); \
    } \
    return a; \
}
00435 
00436 /** @brief Bitwise AND
00437 
00438 Only for integer types. */
00439 OPENCV_HAL_IMPL_BIT_OP(&)
00440 
00441 /** @brief Bitwise OR
00442 
00443 Only for integer types. */
00444 OPENCV_HAL_IMPL_BIT_OP(|)
00445 
00446 /** @brief Bitwise XOR
00447 
00448 Only for integer types.*/
00449 OPENCV_HAL_IMPL_BIT_OP(^)
00450 
00451 /** @brief Bitwise NOT
00452 
00453 Only for integer types.*/
00454 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
00455 {
00456     v_reg<_Tp, n> c;
00457     for( int i = 0; i < n; i++ )
00458     {
00459         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
00460     }
00461     return c;
00462 }
00463 
00464 //! @brief Helper macro
00465 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
/* defines func(a): applies cfunc to every lane, result lanes have type _Tp2 */ \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp2, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = cfunc(a.s[lane]); \
    } \
    return res; \
}
00474 
00475 /** @brief Square root of elements
00476 
00477 Only for floating point types.*/
00478 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
00479 
00480 //! @cond IGNORED
00481 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
00482 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
00483 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
00484 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
00485 //! @endcond
00486 
00487 /** @brief Absolute value of elements
00488 
00489 Only for floating point types.*/
00490 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
00491                           typename V_TypeTraits<_Tp>::abs_type)
00492 
00493 /** @brief Round elements
00494 
00495 Only for floating point types.*/
00496 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
00497 
00498 /** @brief Floor elements
00499 
00500 Only for floating point types.*/
00501 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
00502 
00503 /** @brief Ceil elements
00504 
00505 Only for floating point types.*/
00506 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
00507 
00508 /** @brief Truncate elements
00509 
00510 Only for floating point types.*/
00511 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
00512 
00513 //! @brief Helper macro
00514 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
/* defines func(a, b): lane-wise cfunc(a[i], b[i]) */ \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = cfunc(a.s[lane], b.s[lane]); \
    } \
    return res; \
}
00523 
00524 //! @brief Helper macro
00525 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
/* defines func(a): folds cfunc over the lanes, starting from lane 0 */ \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
    _Tp best = a.s[0]; \
    for( int lane = 1; lane < n; ++lane ) \
    { \
        best = cfunc(best, a.s[lane]); \
    } \
    return best; \
}
00534 
00535 /** @brief Choose min values for each pair
00536 
00537 Scheme:
00538 @code
00539 {A1 A2 ...}
00540 {B1 B2 ...}
00541 --------------
00542 {min(A1,B1) min(A2,B2) ...}
00543 @endcode
00544 For all types except 64-bit integer. */
00545 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
00546 
00547 /** @brief Choose max values for each pair
00548 
00549 Scheme:
00550 @code
00551 {A1 A2 ...}
00552 {B1 B2 ...}
00553 --------------
00554 {max(A1,B1) max(A2,B2) ...}
00555 @endcode
00556 For all types except 64-bit integer. */
00557 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
00558 
00559 /** @brief Find one min value
00560 
00561 Scheme:
00562 @code
00563 {A1 A2 A3 ...} => min(A1,A2,A3,...)
00564 @endcode
00565 For 32-bit integer and 32-bit floating point types. */
00566 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
00567 
00568 /** @brief Find one max value
00569 
00570 Scheme:
00571 @code
00572 {A1 A2 A3 ...} => max(A1,A2,A3,...)
00573 @endcode
00574 For 32-bit integer and 32-bit floating point types. */
00575 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
00576 
00577 //! @cond IGNORED
00578 template<typename _Tp, int n>
00579 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
00580                       v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
00581 {
00582     for( int i = 0; i < n; i++ )
00583     {
00584         minval.s[i] = std::min(a.s[i], b.s[i]);
00585         maxval.s[i] = std::max(a.s[i], b.s[i]);
00586     }
00587 }
00588 //! @endcond
00589 
00590 //! @brief Helper macro
00591 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef V_TypeTraits<_Tp> Traits; \
    typedef typename Traits::int_type itype; \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        /* comparison result true becomes -1, false becomes 0, before conversion */ \
        res.s[lane] = Traits::reinterpret_from_int((itype)-(int)(a.s[lane] cmp_op b.s[lane])); \
    } \
    return res; \
}
00602 
00603 /** @brief Less-than comparison
00604 
00605 For all types except 64-bit integer values. */
00606 OPENCV_HAL_IMPL_CMP_OP(<)
00607 
00608 /** @brief Greater-than comparison
00609 
00610 For all types except 64-bit integer values. */
00611 OPENCV_HAL_IMPL_CMP_OP(>)
00612 
00613 /** @brief Less-than or equal comparison
00614 
00615 For all types except 64-bit integer values. */
00616 OPENCV_HAL_IMPL_CMP_OP(<=)
00617 
00618 /** @brief Greater-than or equal comparison
00619 
00620 For all types except 64-bit integer values. */
00621 OPENCV_HAL_IMPL_CMP_OP(>=)
00622 
00623 /** @brief Equal comparison
00624 
00625 For all types except 64-bit integer values. */
00626 OPENCV_HAL_IMPL_CMP_OP(==)
00627 
00628 /** @brief Not equal comparison
00629 
00630 For all types except 64-bit integer values. */
00631 OPENCV_HAL_IMPL_CMP_OP(!=)
00632 
00633 //! @brief Helper macro
00634 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
/* defines func(a, b): lane-wise cast_op(a[i] bin_op b[i]), result lanes of type _Tp2 */ \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp2, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = cast_op(a.s[lane] bin_op b.s[lane]); \
    } \
    return res; \
}
00645 
00646 /** @brief Add values without saturation
00647 
00648 For 8- and 16-bit integer values. */
00649 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
00650 
00651 /** @brief Subtract values without saturation
00652 
00653 For 8- and 16-bit integer values. */
00654 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
00655 
00656 //! @cond IGNORED
// Difference between the larger and the smaller of a and b.
// Always subtracts the smaller from the larger, so it is usable with unsigned T.
template<typename T> inline T _absdiff(T a, T b)
{
    if( a > b )
        return a - b;
    return b - a;
}
00661 //! @endcond
00662 
00663 /** @brief Absolute difference
00664 
00665 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
00666 Example:
00667 @code{.cpp}
00668 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
00669 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
00670 @endcode
00671 For 8-, 16-, 32-bit integer source types. */
00672 template<typename _Tp, int n>
00673 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
00674 {
00675     typedef typename V_TypeTraits<_Tp>::abs_type rtype;
00676     v_reg<rtype, n> c;
00677     const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;
00678     for( int i = 0; i < n; i++ )
00679     {
00680         rtype ua = a.s[i] ^ mask;
00681         rtype ub = b.s[i] ^ mask;
00682         c.s[i] = _absdiff(ua, ub);
00683     }
00684     return c;
00685 }
00686 
00687 /** @overload
00688 
00689 For 32-bit floating point values */
00690 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
00691 {
00692     v_float32x4 c;
00693     for( int i = 0; i < c.nlanes; i++ )
00694         c.s[i] = _absdiff(a.s[i], b.s[i]);
00695     return c;
00696 }
00697 
00698 /** @overload
00699 
00700 For 64-bit floating point values */
00701 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
00702 {
00703     v_float64x2 c;
00704     for( int i = 0; i < c.nlanes; i++ )
00705         c.s[i] = _absdiff(a.s[i], b.s[i]);
00706     return c;
00707 }
00708 
00709 /** @brief Inversed square root
00710 
00711 Returns \f$ 1/sqrt(a) \f$
00712 For floating point types only. */
00713 template<typename _Tp, int n>
00714 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
00715 {
00716     v_reg<_Tp, n> c;
00717     for( int i = 0; i < n; i++ )
00718         c.s[i] = 1.f/std::sqrt(a.s[i]);
00719     return c;
00720 }
00721 
00722 /** @brief Magnitude
00723 
00724 Returns \f$ sqrt(a^2 + b^2) \f$
00725 For floating point types only. */
00726 template<typename _Tp, int n>
00727 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00728 {
00729     v_reg<_Tp, n> c;
00730     for( int i = 0; i < n; i++ )
00731         c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
00732     return c;
00733 }
00734 
00735 /** @brief Square of the magnitude
00736 
00737 Returns \f$ a^2 + b^2 \f$
00738 For floating point types only. */
00739 template<typename _Tp, int n>
00740 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00741 {
00742     v_reg<_Tp, n> c;
00743     for( int i = 0; i < n; i++ )
00744         c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
00745     return c;
00746 }
00747 
00748 /** @brief Multiply and add
00749 
00750 Returns \f$ a*b + c \f$
00751 For floating point types only. */
00752 template<typename _Tp, int n>
00753 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
00754                               const v_reg<_Tp, n>& c)
00755 {
00756     v_reg<_Tp, n> d;
00757     for( int i = 0; i < n; i++ )
00758         d.s[i] = a.s[i]*b.s[i] + c.s[i];
00759     return d;
00760 }
00761 
00762 /** @brief Dot product of elements
00763 
00764 Multiply values in two registers and sum adjacent result pairs.
00765 Scheme:
00766 @code
00767   {A1 A2 ...} // 16-bit
00768 x {B1 B2 ...} // 16-bit
00769 -------------
00770 {A1B1+A2B2 ...} // 32-bit
00771 @endcode
00772 Implemented only for 16-bit signed source type (v_int16x8).
00773 */
00774 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
00775     v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00776 {
00777     typedef typename V_TypeTraits<_Tp>::w_type w_type;
00778     v_reg<w_type, n/2> c;
00779     for( int i = 0; i < (n/2); i++ )
00780         c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
00781     return c;
00782 }
00783 
00784 /** @brief Multiply and expand
00785 
00786 Multiply values of two registers and store results in two registers with wider pack type.
00787 Scheme:
00788 @code
00789   {A B C D} // 32-bit
00790 x {E F G H} // 32-bit
00791 ---------------
00792 {AE BF}         // 64-bit
00793         {CG DH} // 64-bit
00794 @endcode
00795 Example:
00796 @code{.cpp}
00797 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
00798 v_uint64x2 c, d; // results
00799 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
00800 @endcode
00801 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
00802 */
00803 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
00804                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
00805                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
00806 {
00807     typedef typename V_TypeTraits<_Tp>::w_type w_type;
00808     for( int i = 0; i < (n/2); i++ )
00809     {
00810         c.s[i] = (w_type)a.s[i]*b.s[i];
00811         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
00812     }
00813 }
00814 
00815 //! @cond IGNORED
00816 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
00817                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
00818 {
00819     typedef typename V_TypeTraits<_Tp>::w_type w_type;
00820     for( int i = 0; i < (n/2); i++ )
00821     {
00822         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
00823     }
00824 }
00825 //! @endcond
00826 
00827 //! @brief Helper macro
00828 //! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
/* defines a shift_op imm: every lane is shifted by the same amount imm */ \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
    v_reg<_Tp, n> shifted; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        shifted.s[lane] = (_Tp)(a.s[lane] shift_op imm); \
    } \
    return shifted; \
}
00837 
00838 /** @brief Bitwise shift left
00839 
00840 For 16-, 32- and 64-bit integer values. */
00841 OPENCV_HAL_IMPL_SHIFT_OP(<<)
00842 
00843 /** @brief Bitwise shift right
00844 
00845 For 16-, 32- and 64-bit integer values. */
00846 OPENCV_HAL_IMPL_SHIFT_OP(>>)
00847 
00848 /** @brief Sum packed values
00849 
00850 Scheme:
00851 @code
00852 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
00853 @endcode
00854 For 32-bit integer and 32-bit floating point types.*/
00855 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
00856 {
00857     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
00858     for( int i = 1; i < n; i++ )
00859         c += a.s[i];
00860     return c;
00861 }
00862 
00863 /** @brief Get negative values mask
00864 
00865 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
00866 Example:
00867 @code{.cpp}
00868 v_int32x4 r; // set to {-1, -1, 1, 1}
00869 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
00870 @endcode
00871 For all types except 64-bit. */
00872 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
00873 {
00874     int mask = 0;
00875     for( int i = 0; i < n; i++ )
00876         mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
00877     return mask;
00878 }
00879 
00880 /** @brief Check if all packed values are less than zero
00881 
Unsigned values will be cast to signed: `uchar 254 => char -2`.
00883 For all types except 64-bit. */
00884 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
00885 {
00886     for( int i = 0; i < n; i++ )
00887         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
00888             return false;
00889     return true;
00890 }
00891 
00892 /** @brief Check if any of packed values is less than zero
00893 
Unsigned values will be cast to signed: `uchar 254 => char -2`.
00895 For all types except 64-bit. */
00896 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
00897 {
00898     for( int i = 0; i < n; i++ )
00899         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
00900             return true;
00901     return false;
00902 }
00903 
00904 /** @brief Bitwise select
00905 
00906 Return value will be built by combining values a and b using the following scheme:
00907 If the i-th bit in _mask_ is 1
00908     select i-th bit from _a_
00909 else
00910     select i-th bit from _b_ */
00911 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
00912                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00913 {
00914     typedef V_TypeTraits<_Tp> Traits;
00915     typedef typename Traits::int_type int_type;
00916     v_reg<_Tp, n> c;
00917     for( int i = 0; i < n; i++ )
00918     {
00919         int_type m = Traits::reinterpret_int(mask.s[i]);
00920         c.s[i] =  Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
00921                                              | (Traits::reinterpret_int(b.s[i]) & ~m));
00922     }
00923     return c;
00924 }
00925 
00926 /** @brief Expand values to the wider pack type
00927 
00928 Copy contents of register to two registers with 2x wider pack type.
00929 Scheme:
00930 @code
00931  int32x4     int64x2 int64x2
00932 {A B C D} ==> {A B} , {C D}
00933 @endcode */
00934 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
00935                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
00936                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
00937 {
00938     for( int i = 0; i < (n/2); i++ )
00939     {
00940         b0.s[i] = a.s[i];
00941         b1.s[i] = a.s[i+(n/2)];
00942     }
00943 }
00944 
00945 //! @cond IGNORED
00946 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
00947     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
00948 {
00949     v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
00950     for( int i = 0; i < n; i++ )
00951         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
00952     return c;
00953 }
00954 
00955 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
00956     v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
00957 {
00958     v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
00959     for( int i = 0; i < n; i++ )
00960         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
00961     return c;
00962 }
00963 //! @endcond
00964 
00965 /** @brief Interleave two vectors
00966 
00967 Scheme:
00968 @code
00969   {A1 A2 A3 A4}
00970   {B1 B2 B3 B4}
00971 ---------------
00972   {A1 B1 A2 B2} and {A3 B3 A4 B4}
00973 @endcode
00974 For all types except 64-bit.
00975 */
00976 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
00977                                                v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
00978 {
00979     int i;
00980     for( i = 0; i < n/2; i++ )
00981     {
00982         b0.s[i*2] = a0.s[i];
00983         b0.s[i*2+1] = a1.s[i];
00984     }
00985     for( ; i < n; i++ )
00986     {
00987         b1.s[i*2-n] = a0.s[i];
00988         b1.s[i*2-n+1] = a1.s[i];
00989     }
00990 }
00991 
00992 /** @brief Load register contents from memory
00993 
00994 @param ptr pointer to memory block with data
00995 @return register object
00996 
00997 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
00998  */
00999 template<typename _Tp>
01000 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
01001 {
01002     return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
01003 }
01004 
01005 /** @brief Load register contents from memory (aligned)
01006 
01007 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
01008  */
01009 template<typename _Tp>
01010 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
01011 {
01012     return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
01013 }
01014 
01015 /** @brief Load register contents from two memory blocks
01016 
01017 @param loptr memory block containing data for first half (0..n/2)
01018 @param hiptr memory block containing data for second half (n/2..n)
01019 
01020 @code{.cpp}
01021 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
01022 v_int32x4 r = v_load_halves(lo, hi);
01023 @endcode
01024  */
01025 template<typename _Tp>
01026 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
01027 {
01028     v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
01029     for( int i = 0; i < c.nlanes/2; i++ )
01030     {
01031         c.s[i] = loptr[i];
01032         c.s[i+c.nlanes/2] = hiptr[i];
01033     }
01034     return c;
01035 }
01036 
01037 /** @brief Load register contents from memory with double expand
01038 
01039 Same as cv::v_load, but result pack type will be 2x wider than memory type.
01040 
01041 @code{.cpp}
01042 short buf[4] = {1, 2, 3, 4}; // type is int16
01043 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
01044 @endcode
01045 For 8-, 16-, 32-bit integer source types. */
01046 template<typename _Tp>
01047 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
01048 v_load_expand(const _Tp* ptr)
01049 {
01050     typedef typename V_TypeTraits<_Tp>::w_type w_type;
01051     v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
01052     for( int i = 0; i < c.nlanes; i++ )
01053     {
01054         c.s[i] = ptr[i];
01055     }
01056     return c;
01057 }
01058 
01059 /** @brief Load register contents from memory with quad expand
01060 
01061 Same as cv::v_load_expand, but result type is 4 times wider than source.
01062 @code{.cpp}
01063 char buf[4] = {1, 2, 3, 4}; // type is int8
v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
01065 @endcode
01066 For 8-bit integer source types. */
01067 template<typename _Tp>
01068 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
01069 v_load_expand_q(const _Tp* ptr)
01070 {
01071     typedef typename V_TypeTraits<_Tp>::q_type q_type;
01072     v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
01073     for( int i = 0; i < c.nlanes; i++ )
01074     {
01075         c.s[i] = ptr[i];
01076     }
01077     return c;
01078 }
01079 
01080 /** @brief Load and deinterleave (2 channels)
01081 
Load data from memory, deinterleave it and store it to 2 registers.
01083 Scheme:
01084 @code
01085 {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
01086 @endcode
01087 For all types except 64-bit. */
01088 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
01089                                                             v_reg<_Tp, n>& b)
01090 {
01091     int i, i2;
01092     for( i = i2 = 0; i < n; i++, i2 += 2 )
01093     {
01094         a.s[i] = ptr[i2];
01095         b.s[i] = ptr[i2+1];
01096     }
01097 }
01098 
01099 /** @brief Load and deinterleave (3 channels)
01100 
Load data from memory, deinterleave it and store it to 3 registers.
01102 Scheme:
01103 @code
01104 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
01105 @endcode
01106 For all types except 64-bit. */
01107 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
01108                                                             v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
01109 {
01110     int i, i3;
01111     for( i = i3 = 0; i < n; i++, i3 += 3 )
01112     {
01113         a.s[i] = ptr[i3];
01114         b.s[i] = ptr[i3+1];
01115         c.s[i] = ptr[i3+2];
01116     }
01117 }
01118 
01119 /** @brief Load and deinterleave (4 channels)
01120 
Load data from memory, deinterleave it and store it to 4 registers.
01122 Scheme:
01123 @code
01124 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
01125 @endcode
01126 For all types except 64-bit. */
01127 template<typename _Tp, int n>
01128 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
01129                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
01130                                 v_reg<_Tp, n>& d)
01131 {
01132     int i, i4;
01133     for( i = i4 = 0; i < n; i++, i4 += 4 )
01134     {
01135         a.s[i] = ptr[i4];
01136         b.s[i] = ptr[i4+1];
01137         c.s[i] = ptr[i4+2];
01138         d.s[i] = ptr[i4+3];
01139     }
01140 }
01141 
01142 /** @brief Interleave and store (2 channels)
01143 
01144 Interleave and store data from 2 registers to memory.
01145 Scheme:
01146 @code
01147 {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
01148 @endcode
01149 For all types except 64-bit. */
01150 template<typename _Tp, int n>
01151 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
01152                                const v_reg<_Tp, n>& b)
01153 {
01154     int i, i2;
01155     for( i = i2 = 0; i < n; i++, i2 += 2 )
01156     {
01157         ptr[i2] = a.s[i];
01158         ptr[i2+1] = b.s[i];
01159     }
01160 }
01161 
01162 /** @brief Interleave and store (3 channels)
01163 
01164 Interleave and store data from 3 registers to memory.
01165 Scheme:
01166 @code
01167 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
01168 @endcode
01169 For all types except 64-bit. */
01170 template<typename _Tp, int n>
01171 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
01172                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
01173 {
01174     int i, i3;
01175     for( i = i3 = 0; i < n; i++, i3 += 3 )
01176     {
01177         ptr[i3] = a.s[i];
01178         ptr[i3+1] = b.s[i];
01179         ptr[i3+2] = c.s[i];
01180     }
01181 }
01182 
01183 /** @brief Interleave and store (4 channels)
01184 
01185 Interleave and store data from 4 registers to memory.
01186 Scheme:
01187 @code
01188 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
01189 @endcode
01190 For all types except 64-bit. */
01191 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
01192                                                             const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
01193                                                             const v_reg<_Tp, n>& d)
01194 {
01195     int i, i4;
01196     for( i = i4 = 0; i < n; i++, i4 += 4 )
01197     {
01198         ptr[i4] = a.s[i];
01199         ptr[i4+1] = b.s[i];
01200         ptr[i4+2] = c.s[i];
01201         ptr[i4+3] = d.s[i];
01202     }
01203 }
01204 
01205 /** @brief Store data to memory
01206 
01207 Store register contents to memory.
01208 Scheme:
01209 @code
01210   REG {A B C D} ==> MEM {A B C D}
01211 @endcode
01212 Pointer can be unaligned. */
01213 template<typename _Tp, int n>
01214 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
01215 {
01216     for( int i = 0; i < n; i++ )
01217         ptr[i] = a.s[i];
01218 }
01219 
01220 /** @brief Store data to memory (lower half)
01221 
01222 Store lower half of register contents to memory.
01223 Scheme:
01224 @code
01225   REG {A B C D} ==> MEM {A B}
01226 @endcode */
01227 template<typename _Tp, int n>
01228 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
01229 {
01230     for( int i = 0; i < (n/2); i++ )
01231         ptr[i] = a.s[i];
01232 }
01233 
01234 /** @brief Store data to memory (higher half)
01235 
01236 Store higher half of register contents to memory.
01237 Scheme:
01238 @code
01239   REG {A B C D} ==> MEM {C D}
01240 @endcode */
01241 template<typename _Tp, int n>
01242 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
01243 {
01244     for( int i = 0; i < (n/2); i++ )
01245         ptr[i] = a.s[i+(n/2)];
01246 }
01247 
01248 /** @brief Store data to memory (aligned)
01249 
01250 Store register contents to memory.
01251 Scheme:
01252 @code
01253   REG {A B C D} ==> MEM {A B C D}
01254 @endcode
01255 Pointer __should__ be aligned by 16-byte boundary. */
01256 template<typename _Tp, int n>
01257 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
01258 {
01259     for( int i = 0; i < n; i++ )
01260         ptr[i] = a.s[i];
01261 }
01262 
01263 /** @brief Combine vector from first elements of two vectors
01264 
01265 Scheme:
01266 @code
01267   {A1 A2 A3 A4}
01268   {B1 B2 B3 B4}
01269 ---------------
01270   {A1 A2 B1 B2}
01271 @endcode
01272 For all types except 64-bit. */
01273 template<typename _Tp, int n>
01274 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
01275 {
01276     v_reg<_Tp, n> c;
01277     for( int i = 0; i < (n/2); i++ )
01278     {
01279         c.s[i] = a.s[i];
01280         c.s[i+(n/2)] = b.s[i];
01281     }
01282     return c;
01283 }
01284 
01285 /** @brief Combine vector from last elements of two vectors
01286 
01287 Scheme:
01288 @code
01289   {A1 A2 A3 A4}
01290   {B1 B2 B3 B4}
01291 ---------------
01292   {A3 A4 B3 B4}
01293 @endcode
01294 For all types except 64-bit. */
01295 template<typename _Tp, int n>
01296 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
01297 {
01298     v_reg<_Tp, n> c;
01299     for( int i = 0; i < (n/2); i++ )
01300     {
01301         c.s[i] = a.s[i+(n/2)];
01302         c.s[i+(n/2)] = b.s[i+(n/2)];
01303     }
01304     return c;
01305 }
01306 
01307 /** @brief Combine two vectors from lower and higher parts of two other vectors
01308 
01309 @code{.cpp}
01310 low = cv::v_combine_low(a, b);
01311 high = cv::v_combine_high(a, b);
01312 @endcode */
01313 template<typename _Tp, int n>
01314 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
01315                         v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
01316 {
01317     for( int i = 0; i < (n/2); i++ )
01318     {
01319         low.s[i] = a.s[i];
01320         low.s[i+(n/2)] = b.s[i];
01321         high.s[i] = a.s[i+(n/2)];
01322         high.s[i+(n/2)] = b.s[i+(n/2)];
01323     }
01324 }
01325 
01326 /** @brief Vector extract
01327 
01328 Scheme:
01329 @code
01330   {A1 A2 A3 A4}
01331   {B1 B2 B3 B4}
01332 ========================
01333 shift = 1  {A2 A3 A4 B1}
01334 shift = 2  {A3 A4 B1 B2}
01335 shift = 3  {A4 B1 B2 B3}
01336 @endcode
01337 Restriction: 0 <= shift < nlanes
01338 
01339 Usage:
01340 @code
01341 v_int32x4 a, b, c;
01342 c = v_extract<2>(a, b);
01343 @endcode
01344 For integer types only. */
01345 template<int s, typename _Tp, int n>
01346 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
01347 {
01348     v_reg<_Tp, n> r;
01349     const int shift = n - s;
01350     int i = 0;
01351     for (; i < shift; ++i)
01352         r.s[i] = a.s[i+s];
01353     for (; i < n; ++i)
01354         r.s[i] = b.s[i-shift];
01355     return r;
01356 }
01357 
01358 /** @brief Round
01359 
01360 Rounds each value. Input type is float vector ==> output type is int vector.*/
01361 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
01362 {
01363     v_reg<int, n> c;
01364     for( int i = 0; i < n; i++ )
01365         c.s[i] = cvRound(a.s[i]);
01366     return c;
01367 }
01368 
01369 /** @brief Floor
01370 
01371 Floor each value. Input type is float vector ==> output type is int vector.*/
01372 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
01373 {
01374     v_reg<int, n> c;
01375     for( int i = 0; i < n; i++ )
01376         c.s[i] = cvFloor(a.s[i]);
01377     return c;
01378 }
01379 
01380 /** @brief Ceil
01381 
01382 Ceil each value. Input type is float vector ==> output type is int vector.*/
01383 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
01384 {
01385     v_reg<int, n> c;
01386     for( int i = 0; i < n; i++ )
01387         c.s[i] = cvCeil(a.s[i]);
01388     return c;
01389 }
01390 
01391 /** @brief Trunc
01392 
01393 Truncate each value. Input type is float vector ==> output type is int vector.*/
01394 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
01395 {
01396     v_reg<int, n> c;
01397     for( int i = 0; i < n; i++ )
01398         c.s[i] = (int)(a.s[i]);
01399     return c;
01400 }
01401 
01402 /** @overload */
01403 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
01404 {
01405     v_reg<int, n*2> c;
01406     for( int i = 0; i < n; i++ )
01407     {
01408         c.s[i] = cvRound(a.s[i]);
01409         c.s[i+n] = 0;
01410     }
01411     return c;
01412 }
01413 
01414 /** @overload */
01415 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
01416 {
01417     v_reg<int, n> c;
01418     for( int i = 0; i < n; i++ )
01419     {
01420         c.s[i] = cvFloor(a.s[i]);
01421         c.s[i+n] = 0;
01422     }
01423     return c;
01424 }
01425 
01426 /** @overload */
01427 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
01428 {
01429     v_reg<int, n> c;
01430     for( int i = 0; i < n; i++ )
01431     {
01432         c.s[i] = cvCeil(a.s[i]);
01433         c.s[i+n] = 0;
01434     }
01435     return c;
01436 }
01437 
01438 /** @overload */
01439 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
01440 {
01441     v_reg<int, n> c;
01442     for( int i = 0; i < n; i++ )
01443     {
01444         c.s[i] = cvCeil(a.s[i]);
01445         c.s[i+n] = 0;
01446     }
01447     return c;
01448 }
01449 
01450 /** @brief Convert to float
01451 
01452 Supported input type is cv::v_int32x4. */
01453 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
01454 {
01455     v_reg<float, n> c;
01456     for( int i = 0; i < n; i++ )
01457         c.s[i] = (float)a.s[i];
01458     return c;
01459 }
01460 
01461 /** @brief Convert to double
01462 
01463 Supported input type is cv::v_int32x4. */
01464 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
01465 {
01466     v_reg<double, n> c;
01467     for( int i = 0; i < n; i++ )
01468         c.s[i] = (double)a.s[i];
01469     return c;
01470 }
01471 
01472 /** @brief Convert to double
01473 
01474 Supported input type is cv::v_float32x4. */
01475 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
01476 {
01477     v_reg<double, n> c;
01478     for( int i = 0; i < n; i++ )
01479         c.s[i] = (double)a.s[i];
01480     return c;
01481 }
01482 
01483 /** @brief Transpose 4x4 matrix
01484 
01485 Scheme:
01486 @code
01487 a0  {A1 A2 A3 A4}
01488 a1  {B1 B2 B3 B4}
01489 a2  {C1 C2 C3 C4}
01490 a3  {D1 D2 D3 D4}
01491 ===============
01492 b0  {A1 B1 C1 D1}
01493 b1  {A2 B2 C2 D2}
01494 b2  {A3 B3 C3 D3}
01495 b3  {A4 B4 C4 D4}
01496 @endcode
01497 */
01498 template<typename _Tp>
01499 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
01500                             const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
01501                             v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
01502                             v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
01503 {
01504     b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
01505     b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
01506     b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
01507     b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
01508 }
01509 
01510 //! @brief Helper macro
01511 //! @ingroup core_hal_intrin_impl
01512 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
01513 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
01514 
01515 //! @name Init with zero
01516 //! @{
01517 //! @brief Create new vector with zero elements
01518 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
01519 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
01520 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
01521 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
01522 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
01523 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
01524 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
01525 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
01526 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
01527 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
01528 //! @}
01529 
01530 //! @brief Helper macro
01531 //! @ingroup core_hal_intrin_impl
01532 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
01533 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
01534 
01535 //! @name Init with value
01536 //! @{
01537 //! @brief Create new vector with elements set to a specific value
01538 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
01539 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
01540 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
01541 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
01542 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
01543 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
01544 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
01545 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
01546 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
01547 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
01548 //! @}
01549 
01550 //! @brief Helper macro
01551 //! @ingroup core_hal_intrin_impl
01552 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
01553 template<typename _Tp0, int n0> inline _Tpvec \
01554     v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
01555 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
01556 
01557 //! @name Reinterpret
01558 //! @{
01559 //! @brief Convert vector to different type without modifying underlying data.
01560 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
01561 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
01562 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
01563 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
01564 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
01565 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
01566 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
01567 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
01568 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
01569 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
01570 //! @}
01571 
01572 //! @brief Helper macro
01573 //! @ingroup core_hal_intrin_impl
01574 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
01575 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
01576 { return a << n; }
01577 
01578 //! @name Left shift
01579 //! @{
01580 //! @brief Shift left
01581 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
01582 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
01583 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
01584 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
01585 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
01586 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
01587 //! @}
01588 
01589 //! @brief Helper macro
01590 //! @ingroup core_hal_intrin_impl
01591 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
01592 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
01593 { return a >> n; }
01594 
01595 //! @name Right shift
01596 //! @{
01597 //! @brief Shift right
01598 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
01599 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
01600 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
01601 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
01602 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
01603 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
01604 //! @}
01605 
01606 //! @brief Helper macro
01607 //! @ingroup core_hal_intrin_impl
01608 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
01609 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
01610 { \
01611     _Tpvec c; \
01612     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01613         c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01614     return c; \
01615 }
01616 
01617 //! @name Rounding shift
01618 //! @{
01619 //! @brief Rounding shift right
01620 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
01621 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
01622 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
01623 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
01624 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
01625 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
01626 //! @}
01627 
01628 //! @brief Helper macro
01629 //! @ingroup core_hal_intrin_impl
01630 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
01631 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
01632 { \
01633     _Tpnvec c; \
01634     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01635     { \
01636         c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
01637         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
01638     } \
01639     return c; \
01640 }
01641 
01642 //! @name Pack
01643 //! @{
01644 //! @brief Pack values from two vectors to one
01645 //!
01646 //! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
01647 //! converts to corresponding unsigned type.
01648 //!
01649 //! - pack: for 16-, 32- and 64-bit integer input types
01650 //! - pack_u: for 16- and 32-bit signed integer input types
01651 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack)
01652 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack)
01653 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack)
01654 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack)
01655 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack)
01656 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack)
01657 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u)
01658 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
01659 //! @}
01660 
01661 //! @brief Helper macro
01662 //! @ingroup core_hal_intrin_impl
01663 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
01664 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
01665 { \
01666     _Tpnvec c; \
01667     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01668     { \
01669         c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01670         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01671     } \
01672     return c; \
01673 }
01674 
01675 //! @name Pack with rounding shift
01676 //! @{
01677 //! @brief Pack values from two vectors to one with rounding shift
01678 //!
01679 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
01680 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
01681 //!
01682 //! - pack: for 16-, 32- and 64-bit integer input types
01683 //! - pack_u: for 16- and 32-bit signed integer input types
01684 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
01685 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack)
01686 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
01687 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack)
01688 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
01689 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack)
01690 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
01691 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
01692 //! @}
01693 
01694 //! @brief Helper macro
01695 //! @ingroup core_hal_intrin_impl
01696 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
01697 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
01698 { \
01699     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01700         ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
01701 }
01702 
01703 //! @name Pack and store
01704 //! @{
01705 //! @brief Store values from the input vector into memory with pack
01706 //!
01707 //! Values will be stored into memory with saturating conversion to narrower type.
01708 //! Variant with _u_ suffix converts to corresponding unsigned type.
01709 //!
01710 //! - pack: for 16-, 32- and 64-bit integer input types
01711 //! - pack_u: for 16- and 32-bit signed integer input types
01712 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
01713 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
01714 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
01715 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
01716 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
01717 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
01718 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
01719 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
01720 //! @}
01721 
01722 //! @brief Helper macro
01723 //! @ingroup core_hal_intrin_impl
01724 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
01725 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
01726 { \
01727     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01728         ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01729 }
01730 
01731 //! @name Pack and store with rounding shift
01732 //! @{
01733 //! @brief Store values from the input vector into memory with pack
01734 //!
01735 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
01736 //! memory. Variant with _u_ suffix converts to unsigned type.
01737 //!
01738 //! - pack: for 16-, 32- and 64-bit integer input types
01739 //! - pack_u: for 16- and 32-bit signed integer input types
01740 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
01741 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
01742 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
01743 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
01744 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
01745 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
01746 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
01747 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
01748 //! @}
01749 
01750 /** @brief Matrix multiplication
01751 
01752 Scheme:
01753 @code
01754 {A0 A1 A2 A3}   |V0|
01755 {B0 B1 B2 B3}   |V1|
01756 {C0 C1 C2 C3}   |V2|
01757 {D0 D1 D2 D3} x |V3|
01758 ====================
01759 {R0 R1 R2 R3}, where:
01760 R0 = A0V0 + A1V1 + A2V2 + A3V3,
01761 R1 = B0V0 + B1V1 + B2V2 + B3V3
01762 ...
01763 @endcode
01764 */
01765 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
01766                             const v_float32x4& m1, const v_float32x4& m2,
01767                             const v_float32x4& m3)
01768 {
01769     return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
01770                        v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
01771                        v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
01772                        v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
01773 }
01774 
01775 //! @}
01776 
01777 //! @name Check SIMD support
01778 //! @{
01779 //! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    // This implementation always reports that 128-bit SIMD is unavailable.
    const bool simd128_available = false;
    return simd128_available;
}
01784 
01785 //! @}
01786 
01787 
01788 }
01789 
01790 #endif