intrin_sse.hpp
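This header is OpenCV's SSE2 implementation of the universal intrinsics layer (CV_SIMD128): fixed-width 128-bit vector types such as v_uint8x16 and v_float32x4, plus overloaded operators and helper functions that wrap the raw _mm_* intrinsics. A minimal usage sketch follows (illustrative only; the function name add_rows, the buffer names, and the assumption that len is a multiple of 16 are not part of this file):

    // Saturating per-byte addition of two rows, 16 pixels per iteration.
    #include "opencv2/core/hal/intrin.hpp"  // umbrella header that pulls in intrin_sse.hpp
                                            // on SSE2 builds (OpenCV 3.x layout assumed)

    void add_rows(const unsigned char* src1, const unsigned char* src2,
                  unsigned char* dst, int len)
    {
        for (int i = 0; i < len; i += 16)            // assumes len % 16 == 0
        {
            cv::v_uint8x16 a = cv::v_load(src1 + i);
            cv::v_uint8x16 b = cv::v_load(src2 + i);
            cv::v_store(dst + i, a + b);             // operator+ maps to _mm_adds_epu8
        }
    }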
00001 /*M/////////////////////////////////////////////////////////////////////////////////////// 00002 // 00003 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 00004 // 00005 // By downloading, copying, installing or using the software you agree to this license. 00006 // If you do not agree to this license, do not download, install, 00007 // copy or use the software. 00008 // 00009 // 00010 // License Agreement 00011 // For Open Source Computer Vision Library 00012 // 00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 00016 // Copyright (C) 2015, Itseez Inc., all rights reserved. 00017 // Third party copyrights are property of their respective owners. 00018 // 00019 // Redistribution and use in source and binary forms, with or without modification, 00020 // are permitted provided that the following conditions are met: 00021 // 00022 // * Redistribution's of source code must retain the above copyright notice, 00023 // this list of conditions and the following disclaimer. 00024 // 00025 // * Redistribution's in binary form must reproduce the above copyright notice, 00026 // this list of conditions and the following disclaimer in the documentation 00027 // and/or other materials provided with the distribution. 00028 // 00029 // * The name of the copyright holders may not be used to endorse or promote products 00030 // derived from this software without specific prior written permission. 00031 // 00032 // This software is provided by the copyright holders and contributors "as is" and 00033 // any express or implied warranties, including, but not limited to, the implied 00034 // warranties of merchantability and fitness for a particular purpose are disclaimed. 00035 // In no event shall the Intel Corporation or contributors be liable for any direct, 00036 // indirect, incidental, special, exemplary, or consequential damages 00037 // (including, but not limited to, procurement of substitute goods or services; 00038 // loss of use, data, or profits; or business interruption) however caused 00039 // and on any theory of liability, whether in contract, strict liability, 00040 // or tort (including negligence or otherwise) arising in any way out of 00041 // the use of this software, even if advised of the possibility of such damage. 00042 // 00043 //M*/ 00044 00045 #ifndef OPENCV_HAL_SSE_HPP 00046 #define OPENCV_HAL_SSE_HPP 00047 00048 #include <algorithm> 00049 #include "opencv2/core/utility.hpp" 00050 00051 #define CV_SIMD128 1 00052 #define CV_SIMD128_64F 1 00053 00054 namespace cv 00055 { 00056 00057 //! 
@cond IGNORED 00058 00059 struct v_uint8x16 00060 { 00061 typedef uchar lane_type; 00062 enum { nlanes = 16 }; 00063 00064 v_uint8x16() {} 00065 explicit v_uint8x16(__m128i v) : val(v) {} 00066 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, 00067 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) 00068 { 00069 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, 00070 (char)v4, (char)v5, (char)v6, (char)v7, 00071 (char)v8, (char)v9, (char)v10, (char)v11, 00072 (char)v12, (char)v13, (char)v14, (char)v15); 00073 } 00074 uchar get0() const 00075 { 00076 return (uchar)_mm_cvtsi128_si32(val); 00077 } 00078 00079 __m128i val; 00080 }; 00081 00082 struct v_int8x16 00083 { 00084 typedef schar lane_type; 00085 enum { nlanes = 16 }; 00086 00087 v_int8x16() {} 00088 explicit v_int8x16(__m128i v) : val(v) {} 00089 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, 00090 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) 00091 { 00092 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, 00093 (char)v4, (char)v5, (char)v6, (char)v7, 00094 (char)v8, (char)v9, (char)v10, (char)v11, 00095 (char)v12, (char)v13, (char)v14, (char)v15); 00096 } 00097 schar get0() const 00098 { 00099 return (schar)_mm_cvtsi128_si32(val); 00100 } 00101 00102 __m128i val; 00103 }; 00104 00105 struct v_uint16x8 00106 { 00107 typedef ushort lane_type; 00108 enum { nlanes = 8 }; 00109 00110 v_uint16x8() {} 00111 explicit v_uint16x8(__m128i v) : val(v) {} 00112 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) 00113 { 00114 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, 00115 (short)v4, (short)v5, (short)v6, (short)v7); 00116 } 00117 ushort get0() const 00118 { 00119 return (ushort)_mm_cvtsi128_si32(val); 00120 } 00121 00122 __m128i val; 00123 }; 00124 00125 struct v_int16x8 00126 { 00127 typedef short lane_type; 00128 enum { nlanes = 8 }; 00129 00130 v_int16x8() {} 00131 explicit v_int16x8(__m128i v) : val(v) {} 00132 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) 00133 { 00134 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, 00135 (short)v4, (short)v5, (short)v6, (short)v7); 00136 } 00137 short get0() const 00138 { 00139 return (short)_mm_cvtsi128_si32(val); 00140 } 00141 __m128i val; 00142 }; 00143 00144 struct v_uint32x4 00145 { 00146 typedef unsigned lane_type; 00147 enum { nlanes = 4 }; 00148 00149 v_uint32x4() {} 00150 explicit v_uint32x4(__m128i v) : val(v) {} 00151 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) 00152 { 00153 val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); 00154 } 00155 unsigned get0() const 00156 { 00157 return (unsigned)_mm_cvtsi128_si32(val); 00158 } 00159 __m128i val; 00160 }; 00161 00162 struct v_int32x4 00163 { 00164 typedef int lane_type; 00165 enum { nlanes = 4 }; 00166 00167 v_int32x4() {} 00168 explicit v_int32x4(__m128i v) : val(v) {} 00169 v_int32x4(int v0, int v1, int v2, int v3) 00170 { 00171 val = _mm_setr_epi32(v0, v1, v2, v3); 00172 } 00173 int get0() const 00174 { 00175 return _mm_cvtsi128_si32(val); 00176 } 00177 __m128i val; 00178 }; 00179 00180 struct v_float32x4 00181 { 00182 typedef float lane_type; 00183 enum { nlanes = 4 }; 00184 00185 v_float32x4() {} 00186 explicit v_float32x4(__m128 v) : val(v) {} 00187 v_float32x4(float v0, float v1, float v2, float v3) 00188 { 00189 
val = _mm_setr_ps(v0, v1, v2, v3); 00190 } 00191 float get0() const 00192 { 00193 return _mm_cvtss_f32(val); 00194 } 00195 __m128 val; 00196 }; 00197 00198 struct v_uint64x2 00199 { 00200 typedef uint64 lane_type; 00201 enum { nlanes = 2 }; 00202 00203 v_uint64x2() {} 00204 explicit v_uint64x2(__m128i v) : val(v) {} 00205 v_uint64x2(uint64 v0, uint64 v1) 00206 { 00207 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); 00208 } 00209 uint64 get0() const 00210 { 00211 int a = _mm_cvtsi128_si32(val); 00212 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); 00213 return (unsigned)a | ((uint64)(unsigned)b << 32); 00214 } 00215 __m128i val; 00216 }; 00217 00218 struct v_int64x2 00219 { 00220 typedef int64 lane_type; 00221 enum { nlanes = 2 }; 00222 00223 v_int64x2() {} 00224 explicit v_int64x2(__m128i v) : val(v) {} 00225 v_int64x2(int64 v0, int64 v1) 00226 { 00227 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); 00228 } 00229 int64 get0() const 00230 { 00231 int a = _mm_cvtsi128_si32(val); 00232 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); 00233 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); 00234 } 00235 __m128i val; 00236 }; 00237 00238 struct v_float64x2 00239 { 00240 typedef double lane_type; 00241 enum { nlanes = 2 }; 00242 00243 v_float64x2() {} 00244 explicit v_float64x2(__m128d v) : val(v) {} 00245 v_float64x2(double v0, double v1) 00246 { 00247 val = _mm_setr_pd(v0, v1); 00248 } 00249 double get0() const 00250 { 00251 return _mm_cvtsd_f64(val); 00252 } 00253 __m128d val; 00254 }; 00255 00256 #if defined(HAVE_FP16) 00257 struct v_float16x4 00258 { 00259 typedef short lane_type; 00260 enum { nlanes = 4 }; 00261 00262 v_float16x4() {} 00263 explicit v_float16x4(__m128i v) : val(v) {} 00264 v_float16x4(short v0, short v1, short v2, short v3) 00265 { 00266 val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0); 00267 } 00268 short get0() const 00269 { 00270 return (short)_mm_cvtsi128_si32(val); 00271 } 00272 __m128i val; 00273 }; 00274 #endif 00275 00276 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ 00277 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ 00278 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ 00279 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ 00280 { return _Tpvec(cast(a.val)); } 00281 00282 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP) 00283 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP) 00284 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP) 00285 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP) 00286 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP) 00287 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP) 00288 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps) 00289 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd) 00290 00291 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); } 00292 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); } 00293 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } 00294 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); } 00295 00296 
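// Note (illustrative, not part of the original header): v_setzero_<suffix> and
// v_setall_<suffix> build constant vectors, while v_reinterpret_as_<suffix>
// reuses the same 128-bit register under a different lane type without
// converting any values. A small sketch:
//
//     v_uint32x4  ones  = v_setall_u32(1);             // {1, 1, 1, 1}
//     v_uint8x16  bytes = v_reinterpret_as_u8(ones);   // same bits, 16 uchar lanes
//     v_float32x4 zf    = v_setzero_f32();             // {0.f, 0.f, 0.f, 0.f}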
template<typename _Tpvec> inline 00297 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } 00298 template<typename _Tpvec> inline 00299 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); } 00300 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a) 00301 { return v_float32x4(_mm_castsi128_ps(a.val)); } 00302 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a) 00303 { return v_float32x4(_mm_castsi128_ps(a.val)); } 00304 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a) 00305 { return v_float64x2(_mm_castsi128_pd(a.val)); } 00306 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a) 00307 { return v_float64x2(_mm_castsi128_pd(a.val)); } 00308 00309 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \ 00310 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \ 00311 { return _Tpvec(_mm_castps_si128(a.val)); } \ 00312 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \ 00313 { return _Tpvec(_mm_castpd_si128(a.val)); } 00314 00315 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8) 00316 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8) 00317 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16) 00318 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16) 00319 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32) 00320 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32) 00321 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64) 00322 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64) 00323 00324 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; } 00325 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; } 00326 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); } 00327 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } 00328 00329 //////////////// PACK /////////////// 00330 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) 00331 { 00332 __m128i delta = _mm_set1_epi16(255); 00333 return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)), 00334 _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta)))); 00335 } 00336 00337 inline void v_pack_store(uchar* ptr, const v_uint16x8& a) 00338 { 00339 __m128i delta = _mm_set1_epi16(255); 00340 __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)); 00341 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); 00342 } 00343 00344 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) 00345 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } 00346 00347 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) 00348 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } 00349 00350 template<int n> inline 00351 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b) 00352 { 00353 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
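// Per lane this effectively computes saturate_cast<uchar>((x + (1 << (n-1))) >> n):
// a rounding right shift by n followed by a pack with unsigned saturation.
// Worked example (illustrative): with n = 4 and x = 200, (200 + 8) >> 4 = 13.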
00354 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00355 return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n), 00356 _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n))); 00357 } 00358 00359 template<int n> inline 00360 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) 00361 { 00362 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00363 __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n); 00364 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); 00365 } 00366 00367 template<int n> inline 00368 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) 00369 { 00370 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00371 return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), 00372 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); 00373 } 00374 00375 template<int n> inline 00376 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) 00377 { 00378 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00379 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); 00380 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); 00381 } 00382 00383 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) 00384 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } 00385 00386 inline void v_pack_store(schar* ptr, v_int16x8& a) 00387 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } 00388 00389 template<int n> inline 00390 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) 00391 { 00392 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 00393 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00394 return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), 00395 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); 00396 } 00397 template<int n> inline 00398 void v_rshr_pack_store(schar* ptr, const v_int16x8& a) 00399 { 00400 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 00401 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00402 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); 00403 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1)); 00404 } 00405 00406 00407 // bit-wise "mask ? 
a : b" 00408 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) 00409 { 00410 return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); 00411 } 00412 00413 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) 00414 { 00415 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); 00416 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); 00417 __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); 00418 __m128i r = _mm_packs_epi32(a1, b1); 00419 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); 00420 } 00421 00422 inline void v_pack_store(ushort* ptr, const v_uint32x4& a) 00423 { 00424 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); 00425 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); 00426 __m128i r = _mm_packs_epi32(a1, a1); 00427 _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768))); 00428 } 00429 00430 template<int n> inline 00431 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) 00432 { 00433 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00434 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00435 __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); 00436 return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); 00437 } 00438 00439 template<int n> inline 00440 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) 00441 { 00442 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00443 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00444 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); 00445 _mm_storel_epi64((__m128i*)ptr, a2); 00446 } 00447 00448 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) 00449 { 00450 __m128i delta32 = _mm_set1_epi32(32768); 00451 __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); 00452 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); 00453 } 00454 00455 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) 00456 { 00457 __m128i delta32 = _mm_set1_epi32(32768); 00458 __m128i a1 = _mm_sub_epi32(a.val, delta32); 00459 __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); 00460 _mm_storel_epi64((__m128i*)ptr, r); 00461 } 00462 00463 template<int n> inline 00464 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) 00465 { 00466 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00467 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00468 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); 00469 __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); 00470 __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768)); 00471 return v_uint16x8(_mm_unpacklo_epi64(a2, b2)); 00472 } 00473 00474 template<int n> inline 00475 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) 00476 { 00477 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00478 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00479 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), 
_mm_set1_epi16(-32768)); 00480 _mm_storel_epi64((__m128i*)ptr, a2); 00481 } 00482 00483 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) 00484 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); } 00485 00486 inline void v_pack_store(short* ptr, const v_int32x4& a) 00487 { 00488 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); 00489 } 00490 00491 template<int n> inline 00492 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) 00493 { 00494 __m128i delta = _mm_set1_epi32(1 << (n-1)); 00495 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), 00496 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); 00497 } 00498 00499 template<int n> inline 00500 void v_rshr_pack_store(short* ptr, const v_int32x4& a) 00501 { 00502 __m128i delta = _mm_set1_epi32(1 << (n-1)); 00503 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); 00504 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); 00505 } 00506 00507 00508 // [a0 0 | b0 0] [a1 0 | b1 0] 00509 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) 00510 { 00511 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 00512 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 00513 return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); 00514 } 00515 00516 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) 00517 { 00518 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); 00519 _mm_storel_epi64((__m128i*)ptr, a1); 00520 } 00521 00522 // [a0 0 | b0 0] [a1 0 | b1 0] 00523 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) 00524 { 00525 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 00526 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 00527 return v_int32x4(_mm_unpacklo_epi32(v0, v1)); 00528 } 00529 00530 inline void v_pack_store(int* ptr, const v_int64x2& a) 00531 { 00532 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); 00533 _mm_storel_epi64((__m128i*)ptr, a1); 00534 } 00535 00536 template<int n> inline 00537 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) 00538 { 00539 uint64 delta = (uint64)1 << (n-1); 00540 v_uint64x2 delta2(delta, delta); 00541 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); 00542 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); 00543 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 00544 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 00545 return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); 00546 } 00547 00548 template<int n> inline 00549 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) 00550 { 00551 uint64 delta = (uint64)1 << (n-1); 00552 v_uint64x2 delta2(delta, delta); 00553 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); 00554 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); 00555 _mm_storel_epi64((__m128i*)ptr, a2); 00556 } 00557 00558 inline __m128i v_sign_epi64(__m128i a) 00559 { 00560 return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1 00561 } 00562 00563 inline __m128i v_srai_epi64(__m128i a, int imm) 00564 { 00565 __m128i smask = v_sign_epi64(a); 00566 return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask); 00567 } 00568 00569 template<int n> inline 00570 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) 00571 { 00572 int64 delta = (int64)1 << (n-1); 00573 v_int64x2 delta2(delta, delta); 00574 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); 00575 __m128i b1 = 
v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n); 00576 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 00577 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 00578 return v_int32x4(_mm_unpacklo_epi32(v0, v1)); 00579 } 00580 00581 template<int n> inline 00582 void v_rshr_pack_store(int* ptr, const v_int64x2& a) 00583 { 00584 int64 delta = (int64)1 << (n-1); 00585 v_int64x2 delta2(delta, delta); 00586 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); 00587 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); 00588 _mm_storel_epi64((__m128i*)ptr, a2); 00589 } 00590 00591 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, 00592 const v_float32x4& m1, const v_float32x4& m2, 00593 const v_float32x4& m3) 00594 { 00595 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); 00596 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); 00597 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); 00598 __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); 00599 00600 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3))); 00601 } 00602 00603 00604 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ 00605 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ 00606 { \ 00607 return _Tpvec(intrin(a.val, b.val)); \ 00608 } \ 00609 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ 00610 { \ 00611 a.val = intrin(a.val, b.val); \ 00612 return a; \ 00613 } 00614 00615 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) 00616 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) 00617 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) 00618 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) 00619 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) 00620 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) 00621 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) 00622 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) 00623 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) 00624 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) 00625 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) 00626 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) 00627 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) 00628 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) 00629 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) 00630 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) 00631 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) 00632 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) 00633 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) 00634 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) 00635 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) 00636 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) 00637 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) 00638 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) 00639 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) 00640 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) 00641 00642 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) 00643 { 00644 __m128i c0 = _mm_mul_epu32(a.val, b.val); 00645 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); 00646 __m128i d0 = _mm_unpacklo_epi32(c0, c1); 00647 __m128i d1 = _mm_unpackhi_epi32(c0, c1); 00648 
return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); 00649 } 00650 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) 00651 { 00652 __m128i c0 = _mm_mul_epu32(a.val, b.val); 00653 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); 00654 __m128i d0 = _mm_unpacklo_epi32(c0, c1); 00655 __m128i d1 = _mm_unpackhi_epi32(c0, c1); 00656 return v_int32x4(_mm_unpacklo_epi64(d0, d1)); 00657 } 00658 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) 00659 { 00660 a = a * b; 00661 return a; 00662 } 00663 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) 00664 { 00665 a = a * b; 00666 return a; 00667 } 00668 00669 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, 00670 v_int32x4& c, v_int32x4& d) 00671 { 00672 __m128i v0 = _mm_mullo_epi16(a.val, b.val); 00673 __m128i v1 = _mm_mulhi_epi16(a.val, b.val); 00674 c.val = _mm_unpacklo_epi16(v0, v1); 00675 d.val = _mm_unpackhi_epi16(v0, v1); 00676 } 00677 00678 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, 00679 v_uint32x4& c, v_uint32x4& d) 00680 { 00681 __m128i v0 = _mm_mullo_epi16(a.val, b.val); 00682 __m128i v1 = _mm_mulhi_epu16(a.val, b.val); 00683 c.val = _mm_unpacklo_epi16(v0, v1); 00684 d.val = _mm_unpackhi_epi16(v0, v1); 00685 } 00686 00687 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, 00688 v_uint64x2& c, v_uint64x2& d) 00689 { 00690 __m128i c0 = _mm_mul_epu32(a.val, b.val); 00691 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); 00692 c.val = _mm_unpacklo_epi64(c0, c1); 00693 d.val = _mm_unpackhi_epi64(c0, c1); 00694 } 00695 00696 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) 00697 { 00698 return v_int32x4(_mm_madd_epi16(a.val, b.val)); 00699 } 00700 00701 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ 00702 OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ 00703 OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ 00704 OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ 00705 inline _Tpvec operator ~ (const _Tpvec& a) \ 00706 { \ 00707 return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ 00708 } 00709 00710 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) 00711 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1)) 00712 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1)) 00713 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1)) 00714 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1)) 00715 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) 00716 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1)) 00717 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1)) 00718 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) 00719 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) 00720 00721 inline v_float32x4 v_sqrt(const v_float32x4& x) 00722 { return v_float32x4(_mm_sqrt_ps(x.val)); } 00723 00724 inline v_float32x4 v_invsqrt(const v_float32x4& x) 00725 { 00726 static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); 00727 __m128 t = x.val; 00728 __m128 h = _mm_mul_ps(t, _0_5); 00729 t = _mm_rsqrt_ps(t); 00730 t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); 00731 return v_float32x4(t); 00732 } 00733 00734 inline v_float64x2 v_sqrt(const v_float64x2& x) 00735 { return v_float64x2(_mm_sqrt_pd(x.val)); } 00736 00737 
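// Note (illustrative): the float v_invsqrt above starts from the ~12-bit
// _mm_rsqrt_ps estimate and refines it with one Newton-Raphson step,
// y' = y * (1.5 - 0.5 * x * y * y); the double version below simply computes
// 1.0 / sqrt(x), since SSE2 provides no rsqrt instruction for doubles.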
inline v_float64x2 v_invsqrt(const v_float64x2& x) 00738 { 00739 static const __m128d v_1 = _mm_set1_pd(1.); 00740 return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); 00741 } 00742 00743 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \ 00744 inline _Tpuvec v_abs(const _Tpsvec& x) \ 00745 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); } 00746 00747 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8) 00748 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16) 00749 inline v_uint32x4 v_abs(const v_int32x4& x) 00750 { 00751 __m128i s = _mm_srli_epi32(x.val, 31); 00752 __m128i f = _mm_srai_epi32(x.val, 31); 00753 return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s)); 00754 } 00755 inline v_float32x4 v_abs(const v_float32x4& x) 00756 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } 00757 inline v_float64x2 v_abs(const v_float64x2& x) 00758 { 00759 return v_float64x2(_mm_and_pd(x.val, 00760 _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); 00761 } 00762 00763 // TODO: exp, log, sin, cos 00764 00765 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \ 00766 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ 00767 { \ 00768 return _Tpvec(intrin(a.val, b.val)); \ 00769 } 00770 00771 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) 00772 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8) 00773 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16) 00774 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16) 00775 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps) 00776 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps) 00777 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd) 00778 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) 00779 00780 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) 00781 { 00782 __m128i delta = _mm_set1_epi8((char)-128); 00783 return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), 00784 _mm_xor_si128(b.val, delta)))); 00785 } 00786 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) 00787 { 00788 __m128i delta = _mm_set1_epi8((char)-128); 00789 return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), 00790 _mm_xor_si128(b.val, delta)))); 00791 } 00792 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) 00793 { 00794 return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); 00795 } 00796 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) 00797 { 00798 return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); 00799 } 00800 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) 00801 { 00802 __m128i delta = _mm_set1_epi32((int)0x80000000); 00803 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); 00804 return v_uint32x4(v_select_si128(mask, b.val, a.val)); 00805 } 00806 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b) 00807 { 00808 __m128i delta = _mm_set1_epi32((int)0x80000000); 00809 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); 00810 return v_uint32x4(v_select_si128(mask, a.val, b.val)); 00811 } 00812 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) 00813 { 00814 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); 00815 } 00816 
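// Note (illustrative): SSE2 lacks direct min/max instructions for several of
// these element types, so the versions above emulate them with bias-and-compare
// or saturating-arithmetic tricks; for v_int32x4 a signed compare feeds the
// bitwise blend v_select_si128(mask, a, b) == (a & mask) | (b & ~mask).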
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) 00817 { 00818 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); 00819 } 00820 00821 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ 00822 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ 00823 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ 00824 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ 00825 { \ 00826 __m128i not_mask = _mm_set1_epi32(-1); \ 00827 return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ 00828 } \ 00829 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ 00830 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ 00831 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ 00832 { \ 00833 __m128i not_mask = _mm_set1_epi32(-1); \ 00834 return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ 00835 } \ 00836 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ 00837 { \ 00838 __m128i smask = _mm_set1_##suffix(sbit); \ 00839 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ 00840 } \ 00841 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ 00842 { \ 00843 __m128i smask = _mm_set1_##suffix(sbit); \ 00844 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ 00845 } \ 00846 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ 00847 { \ 00848 __m128i smask = _mm_set1_##suffix(sbit); \ 00849 __m128i not_mask = _mm_set1_epi32(-1); \ 00850 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ 00851 return _Tpuvec(_mm_xor_si128(res, not_mask)); \ 00852 } \ 00853 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ 00854 { \ 00855 __m128i smask = _mm_set1_##suffix(sbit); \ 00856 __m128i not_mask = _mm_set1_epi32(-1); \ 00857 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ 00858 return _Tpuvec(_mm_xor_si128(res, not_mask)); \ 00859 } \ 00860 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ 00861 { \ 00862 return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ 00863 } \ 00864 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ 00865 { \ 00866 return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ 00867 } \ 00868 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ 00869 { \ 00870 __m128i not_mask = _mm_set1_epi32(-1); \ 00871 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ 00872 } \ 00873 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ 00874 { \ 00875 __m128i not_mask = _mm_set1_epi32(-1); \ 00876 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ 00877 } 00878 00879 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) 00880 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) 00881 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) 00882 00883 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ 00884 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ 00885 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ 00886 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ 00887 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ 00888 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ 00889 { return 
_Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ 00890 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ 00891 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ 00892 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ 00893 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ 00894 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ 00895 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } 00896 00897 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) 00898 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) 00899 00900 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) 00901 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) 00902 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16) 00903 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16) 00904 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) 00905 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) 00906 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) 00907 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) 00908 00909 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ 00910 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ 00911 { \ 00912 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ 00913 } \ 00914 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ 00915 { \ 00916 __m128i smask = _mm_set1_epi32(smask32); \ 00917 __m128i a1 = _mm_xor_si128(a.val, smask); \ 00918 __m128i b1 = _mm_xor_si128(b.val, smask); \ 00919 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ 00920 } 00921 00922 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) 00923 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) 00924 00925 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) 00926 { 00927 return v_max(a, b) - v_min(a, b); 00928 } 00929 00930 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) 00931 { 00932 __m128i d = _mm_sub_epi32(a.val, b.val); 00933 __m128i m = _mm_cmpgt_epi32(b.val, a.val); 00934 return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); 00935 } 00936 00937 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ 00938 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ 00939 { \ 00940 _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \ 00941 return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \ 00942 } \ 00943 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ 00944 { \ 00945 _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ 00946 return _Tpvec(_mm_sqrt_##suffix(res)); \ 00947 } \ 00948 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ 00949 { \ 00950 _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ 00951 return _Tpvec(res); \ 00952 } \ 00953 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ 00954 { \ 00955 return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ 00956 } 00957 00958 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) 00959 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) 00960 00961 #define 
OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ 00962 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ 00963 { \ 00964 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ 00965 } \ 00966 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ 00967 { \ 00968 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ 00969 } \ 00970 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ 00971 { \ 00972 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ 00973 } \ 00974 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ 00975 { \ 00976 return _Tpsvec(srai(a.val, imm)); \ 00977 } \ 00978 template<int imm> \ 00979 inline _Tpuvec v_shl(const _Tpuvec& a) \ 00980 { \ 00981 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ 00982 } \ 00983 template<int imm> \ 00984 inline _Tpsvec v_shl(const _Tpsvec& a) \ 00985 { \ 00986 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ 00987 } \ 00988 template<int imm> \ 00989 inline _Tpuvec v_shr(const _Tpuvec& a) \ 00990 { \ 00991 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ 00992 } \ 00993 template<int imm> \ 00994 inline _Tpsvec v_shr(const _Tpsvec& a) \ 00995 { \ 00996 return _Tpsvec(srai(a.val, imm)); \ 00997 } 00998 00999 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16) 01000 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32) 01001 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) 01002 01003 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ 01004 inline _Tpvec v_load(const _Tp* ptr) \ 01005 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ 01006 inline _Tpvec v_load_aligned(const _Tp* ptr) \ 01007 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ 01008 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ 01009 { \ 01010 return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ 01011 _mm_loadl_epi64((const __m128i*)ptr1))); \ 01012 } \ 01013 inline void v_store(_Tp* ptr, const _Tpvec& a) \ 01014 { _mm_storeu_si128((__m128i*)ptr, a.val); } \ 01015 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ 01016 { _mm_store_si128((__m128i*)ptr, a.val); } \ 01017 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ 01018 { _mm_storel_epi64((__m128i*)ptr, a.val); } \ 01019 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ 01020 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); } 01021 01022 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) 01023 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar) 01024 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort) 01025 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short) 01026 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned) 01027 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int) 01028 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64) 01029 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64) 01030 01031 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \ 01032 inline _Tpvec v_load(const _Tp* ptr) \ 01033 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \ 01034 inline _Tpvec v_load_aligned(const _Tp* ptr) \ 01035 { return _Tpvec(_mm_load_##suffix(ptr)); } \ 01036 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ 01037 { \ 01038 return _Tpvec(_mm_castsi128_##suffix( \ 01039 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ 01040 _mm_loadl_epi64((const __m128i*)ptr1)))); \ 01041 } \ 01042 inline void v_store(_Tp* ptr, const _Tpvec& a) \ 01043 { _mm_storeu_##suffix(ptr, a.val); } \ 01044 
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ 01045 { _mm_store_##suffix(ptr, a.val); } \ 01046 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ 01047 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ 01048 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ 01049 { \ 01050 __m128i a1 = _mm_cast##suffix##_si128(a.val); \ 01051 _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \ 01052 } 01053 01054 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) 01055 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) 01056 01057 #if defined(HAVE_FP16) 01058 inline v_float16x4 v_load_f16(const short* ptr) 01059 { return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); } 01060 inline void v_store_f16(short* ptr, v_float16x4& a) 01061 { _mm_storel_epi64((__m128i*)ptr, a.val); } 01062 #endif 01063 01064 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \ 01065 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \ 01066 { \ 01067 __m128i val = a.val; \ 01068 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \ 01069 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \ 01070 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \ 01071 return (scalartype)_mm_cvtsi128_si32(val); \ 01072 } \ 01073 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \ 01074 { \ 01075 __m128i val = a.val; \ 01076 __m128i smask = _mm_set1_epi16(sbit); \ 01077 val = _mm_xor_si128(val, smask); \ 01078 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \ 01079 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \ 01080 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \ 01081 return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \ 01082 } 01083 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \ 01084 inline scalartype v_reduce_sum(const v_##_Tpvec& a) \ 01085 { \ 01086 __m128i val = a.val; \ 01087 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \ 01088 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \ 01089 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \ 01090 return (scalartype)_mm_cvtsi128_si32(val); \ 01091 } \ 01092 inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \ 01093 { \ 01094 __m128i val = a.val; \ 01095 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \ 01096 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \ 01097 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \ 01098 return (unsigned scalartype)_mm_cvtsi128_si32(val); \ 01099 } 01100 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768) 01101 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768) 01102 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16) 01103 01104 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ 01105 inline scalartype v_reduce_##func(const _Tpvec& a) \ 01106 { \ 01107 scalartype CV_DECL_ALIGNED(16) buf[4]; \ 01108 v_store_aligned(buf, a); \ 01109 scalartype s0 = scalar_func(buf[0], buf[1]); \ 01110 scalartype s1 = scalar_func(buf[2], buf[3]); \ 01111 return scalar_func(s0, s1); \ 01112 } 01113 01114 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) 01115 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) 01116 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) 01117 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) 01118 
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) 01119 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) 01120 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) 01121 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) 01122 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) 01123 01124 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ 01125 inline int v_signmask(const _Tpvec& a) \ 01126 { \ 01127 return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ 01128 } \ 01129 inline bool v_check_all(const _Tpvec& a) \ 01130 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ 01131 inline bool v_check_any(const _Tpvec& a) \ 01132 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } 01133 01134 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) 01135 inline __m128i v_packq_epi32(__m128i a) 01136 { 01137 __m128i b = _mm_packs_epi32(a, a); 01138 return _mm_packs_epi16(b, b); 01139 } 01140 01141 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) 01142 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) 01143 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) 01144 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) 01145 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) 01146 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) 01147 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) 01148 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) 01149 01150 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ 01151 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ 01152 { \ 01153 return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \ 01154 } 01155 01156 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) 01157 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128) 01158 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) 01159 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) 01160 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) 01161 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) 01162 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) 01163 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) 01164 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) 01165 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) 01166 01167 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ 01168 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ 01169 { \ 01170 __m128i z = _mm_setzero_si128(); \ 01171 b0.val = _mm_unpacklo_##suffix(a.val, z); \ 01172 b1.val = _mm_unpackhi_##suffix(a.val, z); \ 01173 } \ 01174 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ 01175 { \ 01176 __m128i z = _mm_setzero_si128(); \ 01177 return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ 01178 } \ 01179 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ 01180 { \ 01181 b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ 01182 b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ 01183 } \ 01184 inline _Tpwsvec v_load_expand(const _Tps* ptr) \ 01185 { \ 
01186 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ 01187 return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ 01188 } 01189 01190 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) 01191 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) 01192 01193 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1) 01194 { 01195 __m128i z = _mm_setzero_si128(); 01196 b0.val = _mm_unpacklo_epi32(a.val, z); 01197 b1.val = _mm_unpackhi_epi32(a.val, z); 01198 } 01199 inline v_uint64x2 v_load_expand(const unsigned* ptr) 01200 { 01201 __m128i z = _mm_setzero_si128(); 01202 return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z)); 01203 } 01204 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1) 01205 { 01206 __m128i s = _mm_srai_epi32(a.val, 31); 01207 b0.val = _mm_unpacklo_epi32(a.val, s); 01208 b1.val = _mm_unpackhi_epi32(a.val, s); 01209 } 01210 inline v_int64x2 v_load_expand(const int* ptr) 01211 { 01212 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); 01213 __m128i s = _mm_srai_epi32(a, 31); 01214 return v_int64x2(_mm_unpacklo_epi32(a, s)); 01215 } 01216 01217 inline v_uint32x4 v_load_expand_q(const uchar* ptr) 01218 { 01219 __m128i z = _mm_setzero_si128(); 01220 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); 01221 return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); 01222 } 01223 01224 inline v_int32x4 v_load_expand_q(const schar* ptr) 01225 { 01226 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); 01227 a = _mm_unpacklo_epi8(a, a); 01228 a = _mm_unpacklo_epi8(a, a); 01229 return v_int32x4(_mm_srai_epi32(a, 24)); 01230 } 01231 01232 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ 01233 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ 01234 { \ 01235 b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \ 01236 b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \ 01237 } \ 01238 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ 01239 { \ 01240 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ 01241 return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \ 01242 } \ 01243 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ 01244 { \ 01245 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ 01246 return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \ 01247 } \ 01248 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ 01249 { \ 01250 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ 01251 c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \ 01252 d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \ 01253 } 01254 01255 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01256 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01257 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01258 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01259 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01260 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01261 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) 01262 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) 01263 01264 template<int s, typename _Tpvec> 01265 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) 01266 { 
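// Illustrative note: v_extract<s>(a, b) returns the window
// { a[s], ..., a[nlanes-1], b[0], ..., b[s-1] } taken from the concatenation
// (a, b). Below, a is shifted down by s lanes, b is shifted up by
// (nlanes - s) lanes, and the two halves are OR'ed together.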
01267 const int w = sizeof(typename _Tpvec::lane_type); 01268 const int n = _Tpvec::nlanes; 01269 __m128i ra, rb; 01270 ra = _mm_srli_si128(a.val, s*w); 01271 rb = _mm_slli_si128(b.val, (n-s)*w); 01272 return _Tpvec(_mm_or_si128(ra, rb)); 01273 } 01274 01275 inline v_int32x4 v_round(const v_float32x4& a) 01276 { return v_int32x4(_mm_cvtps_epi32(a.val)); } 01277 01278 inline v_int32x4 v_floor(const v_float32x4& a) 01279 { 01280 __m128i a1 = _mm_cvtps_epi32(a.val); 01281 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); 01282 return v_int32x4(_mm_add_epi32(a1, mask)); 01283 } 01284 01285 inline v_int32x4 v_ceil(const v_float32x4& a) 01286 { 01287 __m128i a1 = _mm_cvtps_epi32(a.val); 01288 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); 01289 return v_int32x4(_mm_sub_epi32(a1, mask)); 01290 } 01291 01292 inline v_int32x4 v_trunc(const v_float32x4& a) 01293 { return v_int32x4(_mm_cvttps_epi32(a.val)); } 01294 01295 inline v_int32x4 v_round(const v_float64x2& a) 01296 { return v_int32x4(_mm_cvtpd_epi32(a.val)); } 01297 01298 inline v_int32x4 v_floor(const v_float64x2& a) 01299 { 01300 __m128i a1 = _mm_cvtpd_epi32(a.val); 01301 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); 01302 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 01303 return v_int32x4(_mm_add_epi32(a1, mask)); 01304 } 01305 01306 inline v_int32x4 v_ceil(const v_float64x2& a) 01307 { 01308 __m128i a1 = _mm_cvtpd_epi32(a.val); 01309 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); 01310 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 01311 return v_int32x4(_mm_sub_epi32(a1, mask)); 01312 } 01313 01314 inline v_int32x4 v_trunc(const v_float64x2& a) 01315 { return v_int32x4(_mm_cvttpd_epi32(a.val)); } 01316 01317 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ 01318 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ 01319 const _Tpvec& a2, const _Tpvec& a3, \ 01320 _Tpvec& b0, _Tpvec& b1, \ 01321 _Tpvec& b2, _Tpvec& b3) \ 01322 { \ 01323 __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ 01324 __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ 01325 __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ 01326 __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \ 01327 \ 01328 b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ 01329 b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ 01330 b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ 01331 b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ 01332 } 01333 01334 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01335 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01336 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) 01337 01338 // adopted from sse_utils.hpp 01339 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) 01340 { 01341 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); 01342 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); 01343 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32)); 01344 01345 __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); 01346 __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); 01347 __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); 01348 01349 __m128i t20 = _mm_unpacklo_epi8(t10, 

// adopted from sse_utils.hpp
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
}

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}
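
// --- Illustrative usage sketch (editor's addition, not part of the original header) ---
// Splits 16 interleaved RGB pixels (48 bytes) into separate channel planes using the
// 3-channel uchar v_load_deinterleave overload above. The pointer names are hypothetical.
static inline void example_split_rgb(const uchar* rgb, uchar* r, uchar* g, uchar* b)
{
    v_uint8x16 vr, vg, vb;
    v_load_deinterleave(rgb, vr, vg, vb); // vr = r0..r15, vg = g0..g15, vb = b0..b15
    v_store(r, vr);
    v_store(g, vg);
    v_store(b, vb);
}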

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8));  // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}

inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
{
    __m128i t0, t1;
    t0 = _mm_unpacklo_epi16(a.val, b.val);
    t1 = _mm_unpackhi_epi16(a.val, b.val);
    _mm_storeu_si128((__m128i*)(ptr), t0);
    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
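
// --- Illustrative usage sketch (editor's addition, not part of the original header) ---
// The reverse of the 2-channel de-interleave shown above: takes eight x values and
// eight y values and writes them as packed (x, y) pairs using the 2-channel short
// v_store_interleave. Pointer names are hypothetical; 'xy' must hold 16 shorts.
static inline void example_interleave_pairs(const short* xs, const short* ys, short* xy)
{
    v_int16x8 vx = v_load(xs);      // x0..x7
    v_int16x8 vy = v_load(ys);      // y0..y7
    v_store_interleave(xy, vx, vy); // x0 y0 x1 y1 ... x7 y7
}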

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}
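
// --- Illustrative usage sketch (editor's addition, not part of the original header) ---
// The inverse of the 3-channel de-interleave shown earlier: merges separate B, G and R
// planes back into packed BGR bytes (48 bytes written) with the 3-channel uchar
// v_store_interleave above. Pointer names are hypothetical.
static inline void example_merge_bgr(const uchar* b, const uchar* g, const uchar* r, uchar* bgr)
{
    v_uint8x16 vb = v_load(b), vg = v_load(g), vr = v_load(r);
    v_store_interleave(bgr, vb, vg, vr); // b0 g0 r0 b1 g1 r1 ...
}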

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}
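
// --- Illustrative usage sketch (editor's addition, not part of the original header) ---
// Same idea for four channels: merges R, G, B and A planes into packed RGBA bytes
// (64 bytes written) with the 4-channel uchar v_store_interleave defined above.
// Pointer names are hypothetical.
static inline void example_merge_rgba(const uchar* r, const uchar* g, const uchar* b,
                                      const uchar* a, uchar* rgba)
{
    v_uint8x16 vr = v_load(r), vg = v_load(g), vb = v_load(b), va = v_load(a);
    v_store_interleave(rgba, vr, vg, vb, va); // r0 g0 b0 a0 r1 g1 b1 a1 ...
}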

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
    // a0 a1 a2 a3 ...
    // b0 b1 b2 b3 ...
    __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    _mm_storeu_ps(ptr, u0);
    _mm_storeu_ps((ptr + 4), u1);
}

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
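
// --- Illustrative usage sketch (editor's addition, not part of the original header) ---
// The reinterpret-based wrappers generated above give the signed and float types the
// same (de)interleave interface as the unsigned ones; for example, splitting packed
// XYZ float triplets into per-coordinate vectors. 'xyz' is a hypothetical pointer to
// 12 packed floats laid out as x0 y0 z0 x1 y1 z1 ...
static inline void example_split_xyz(const float* xyz,
                                     v_float32x4& x, v_float32x4& y, v_float32x4& z)
{
    v_load_deinterleave(xyz, x, y, z); // x = x0..x3, y = y0..y3, z = z0..z3
}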

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
}

#if defined(HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    return v_float16x4(_mm_cvtps_ph(a.val, 0));
}
#endif

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return checkHardwareSupport(CV_CPU_SSE2);
}

//! @}

//! @endcond

}

#endif
