Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all of your personal repositories from the account settings page.
Fork of gr-peach-opencv-project-sd-card
intrin_sse.hpp
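The listing below is OpenCV's SSE/SSE2 backend for the universal intrinsics: thin wrapper types (v_uint8x16, v_int16x8, v_float32x4, v_float64x2, ...) over __m128i/__m128/__m128d, plus load/store, arithmetic, comparison, pack/expand and interleave helpers. For orientation, here is a minimal sketch of how code elsewhere in OpenCV typically consumes these wrappers. It is an assumption-laden illustration, not part of this file: the include path assumes the usual OpenCV 3.x layout (opencv2/core/hal/intrin.hpp dispatches to this header when SSE2 is enabled), and the scale_by_k() helper and buffer names are invented for the example.

// Minimal usage sketch (assumptions: SSE2 target and standard OpenCV 3.x source
// layout so that cv::v_float32x4 and friends resolve to this header; the
// function and buffer names are illustrative only).
#include <opencv2/core/hal/intrin.hpp>

// Multiply n floats by k, four lanes at a time, with a scalar tail.
static void scale_by_k(const float* src, float* dst, int n, float k)
{
    const cv::v_float32x4 vk = cv::v_setall_f32(k);   // broadcast k into all four lanes
    int i = 0;
    for (; i <= n - 4; i += 4)
    {
        cv::v_float32x4 v = cv::v_load(src + i);      // unaligned 128-bit load
        cv::v_store(dst + i, v * vk);                 // operator* maps to _mm_mul_ps
    }
    for (; i < n; ++i)
        dst[i] = src[i] * k;                          // leftover elements
}

The wrappers keep the lane count and element type in the C++ type (v_float32x4::nlanes == 4), so the same pattern carries over to the integer types and to the pack/expand helpers defined further down in the listing.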
00001 /*M/////////////////////////////////////////////////////////////////////////////////////// 00002 // 00003 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 00004 // 00005 // By downloading, copying, installing or using the software you agree to this license. 00006 // If you do not agree to this license, do not download, install, 00007 // copy or use the software. 00008 // 00009 // 00010 // License Agreement 00011 // For Open Source Computer Vision Library 00012 // 00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 00016 // Copyright (C) 2015, Itseez Inc., all rights reserved. 00017 // Third party copyrights are property of their respective owners. 00018 // 00019 // Redistribution and use in source and binary forms, with or without modification, 00020 // are permitted provided that the following conditions are met: 00021 // 00022 // * Redistribution's of source code must retain the above copyright notice, 00023 // this list of conditions and the following disclaimer. 00024 // 00025 // * Redistribution's in binary form must reproduce the above copyright notice, 00026 // this list of conditions and the following disclaimer in the documentation 00027 // and/or other materials provided with the distribution. 00028 // 00029 // * The name of the copyright holders may not be used to endorse or promote products 00030 // derived from this software without specific prior written permission. 00031 // 00032 // This software is provided by the copyright holders and contributors "as is" and 00033 // any express or implied warranties, including, but not limited to, the implied 00034 // warranties of merchantability and fitness for a particular purpose are disclaimed. 00035 // In no event shall the Intel Corporation or contributors be liable for any direct, 00036 // indirect, incidental, special, exemplary, or consequential damages 00037 // (including, but not limited to, procurement of substitute goods or services; 00038 // loss of use, data, or profits; or business interruption) however caused 00039 // and on any theory of liability, whether in contract, strict liability, 00040 // or tort (including negligence or otherwise) arising in any way out of 00041 // the use of this software, even if advised of the possibility of such damage. 00042 // 00043 //M*/ 00044 00045 #ifndef __OPENCV_HAL_SSE_HPP__ 00046 #define __OPENCV_HAL_SSE_HPP__ 00047 00048 #include <algorithm> 00049 00050 #define CV_SIMD128 1 00051 #define CV_SIMD128_64F 1 00052 00053 namespace cv 00054 { 00055 00056 //! 
@cond IGNORED 00057 00058 struct v_uint8x16 00059 { 00060 typedef uchar lane_type; 00061 enum { nlanes = 16 }; 00062 00063 v_uint8x16() {} 00064 explicit v_uint8x16(__m128i v) : val(v) {} 00065 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, 00066 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) 00067 { 00068 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, 00069 (char)v4, (char)v5, (char)v6, (char)v7, 00070 (char)v8, (char)v9, (char)v10, (char)v11, 00071 (char)v12, (char)v13, (char)v14, (char)v15); 00072 } 00073 uchar get0() const 00074 { 00075 return (uchar)_mm_cvtsi128_si32(val); 00076 } 00077 00078 __m128i val; 00079 }; 00080 00081 struct v_int8x16 00082 { 00083 typedef schar lane_type; 00084 enum { nlanes = 16 }; 00085 00086 v_int8x16() {} 00087 explicit v_int8x16(__m128i v) : val(v) {} 00088 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, 00089 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) 00090 { 00091 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, 00092 (char)v4, (char)v5, (char)v6, (char)v7, 00093 (char)v8, (char)v9, (char)v10, (char)v11, 00094 (char)v12, (char)v13, (char)v14, (char)v15); 00095 } 00096 schar get0() const 00097 { 00098 return (schar)_mm_cvtsi128_si32(val); 00099 } 00100 00101 __m128i val; 00102 }; 00103 00104 struct v_uint16x8 00105 { 00106 typedef ushort lane_type; 00107 enum { nlanes = 8 }; 00108 00109 v_uint16x8() {} 00110 explicit v_uint16x8(__m128i v) : val(v) {} 00111 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) 00112 { 00113 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, 00114 (short)v4, (short)v5, (short)v6, (short)v7); 00115 } 00116 ushort get0() const 00117 { 00118 return (ushort)_mm_cvtsi128_si32(val); 00119 } 00120 00121 __m128i val; 00122 }; 00123 00124 struct v_int16x8 00125 { 00126 typedef short lane_type; 00127 enum { nlanes = 8 }; 00128 00129 v_int16x8() {} 00130 explicit v_int16x8(__m128i v) : val(v) {} 00131 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) 00132 { 00133 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, 00134 (short)v4, (short)v5, (short)v6, (short)v7); 00135 } 00136 short get0() const 00137 { 00138 return (short)_mm_cvtsi128_si32(val); 00139 } 00140 __m128i val; 00141 }; 00142 00143 struct v_uint32x4 00144 { 00145 typedef unsigned lane_type; 00146 enum { nlanes = 4 }; 00147 00148 v_uint32x4() {} 00149 explicit v_uint32x4(__m128i v) : val(v) {} 00150 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) 00151 { 00152 val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); 00153 } 00154 unsigned get0() const 00155 { 00156 return (unsigned)_mm_cvtsi128_si32(val); 00157 } 00158 __m128i val; 00159 }; 00160 00161 struct v_int32x4 00162 { 00163 typedef int lane_type; 00164 enum { nlanes = 4 }; 00165 00166 v_int32x4() {} 00167 explicit v_int32x4(__m128i v) : val(v) {} 00168 v_int32x4(int v0, int v1, int v2, int v3) 00169 { 00170 val = _mm_setr_epi32(v0, v1, v2, v3); 00171 } 00172 int get0() const 00173 { 00174 return _mm_cvtsi128_si32(val); 00175 } 00176 __m128i val; 00177 }; 00178 00179 struct v_float32x4 00180 { 00181 typedef float lane_type; 00182 enum { nlanes = 4 }; 00183 00184 v_float32x4() {} 00185 explicit v_float32x4(__m128 v) : val(v) {} 00186 v_float32x4(float v0, float v1, float v2, float v3) 00187 { 00188 
val = _mm_setr_ps(v0, v1, v2, v3); 00189 } 00190 float get0() const 00191 { 00192 return _mm_cvtss_f32(val); 00193 } 00194 __m128 val; 00195 }; 00196 00197 struct v_uint64x2 00198 { 00199 typedef uint64 lane_type; 00200 enum { nlanes = 2 }; 00201 00202 v_uint64x2() {} 00203 explicit v_uint64x2(__m128i v) : val(v) {} 00204 v_uint64x2(uint64 v0, uint64 v1) 00205 { 00206 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); 00207 } 00208 uint64 get0() const 00209 { 00210 int a = _mm_cvtsi128_si32(val); 00211 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); 00212 return (unsigned)a | ((uint64)(unsigned)b << 32); 00213 } 00214 __m128i val; 00215 }; 00216 00217 struct v_int64x2 00218 { 00219 typedef int64 lane_type; 00220 enum { nlanes = 2 }; 00221 00222 v_int64x2() {} 00223 explicit v_int64x2(__m128i v) : val(v) {} 00224 v_int64x2(int64 v0, int64 v1) 00225 { 00226 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); 00227 } 00228 int64 get0() const 00229 { 00230 int a = _mm_cvtsi128_si32(val); 00231 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); 00232 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); 00233 } 00234 __m128i val; 00235 }; 00236 00237 struct v_float64x2 00238 { 00239 typedef double lane_type; 00240 enum { nlanes = 2 }; 00241 00242 v_float64x2() {} 00243 explicit v_float64x2(__m128d v) : val(v) {} 00244 v_float64x2(double v0, double v1) 00245 { 00246 val = _mm_setr_pd(v0, v1); 00247 } 00248 double get0() const 00249 { 00250 return _mm_cvtsd_f64(val); 00251 } 00252 __m128d val; 00253 }; 00254 00255 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ 00256 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ 00257 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ 00258 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ 00259 { return _Tpvec(cast(a.val)); } 00260 00261 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP) 00262 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP) 00263 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP) 00264 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP) 00265 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP) 00266 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP) 00267 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps) 00268 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd) 00269 00270 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); } 00271 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); } 00272 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } 00273 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); } 00274 00275 template<typename _Tpvec> inline 00276 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } 00277 template<typename _Tpvec> inline 00278 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); } 00279 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a) 00280 { return v_float32x4(_mm_castsi128_ps(a.val)); } 00281 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a) 00282 { return 
v_float32x4(_mm_castsi128_ps(a.val)); } 00283 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a) 00284 { return v_float64x2(_mm_castsi128_pd(a.val)); } 00285 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a) 00286 { return v_float64x2(_mm_castsi128_pd(a.val)); } 00287 00288 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \ 00289 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \ 00290 { return _Tpvec(_mm_castps_si128(a.val)); } \ 00291 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \ 00292 { return _Tpvec(_mm_castpd_si128(a.val)); } 00293 00294 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8) 00295 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8) 00296 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16) 00297 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16) 00298 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32) 00299 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32) 00300 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64) 00301 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64) 00302 00303 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; } 00304 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; } 00305 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); } 00306 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } 00307 00308 //////////////// PACK /////////////// 00309 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) 00310 { 00311 __m128i delta = _mm_set1_epi16(255); 00312 return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)), 00313 _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta)))); 00314 } 00315 00316 inline void v_pack_store(uchar* ptr, const v_uint16x8& a) 00317 { 00318 __m128i delta = _mm_set1_epi16(255); 00319 __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)); 00320 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); 00321 } 00322 00323 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) 00324 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } 00325 00326 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) 00327 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } 00328 00329 template<int n> inline 00330 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b) 00331 { 00332 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
00333 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00334 return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n), 00335 _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n))); 00336 } 00337 00338 template<int n> inline 00339 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) 00340 { 00341 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00342 __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n); 00343 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); 00344 } 00345 00346 template<int n> inline 00347 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) 00348 { 00349 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00350 return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), 00351 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); 00352 } 00353 00354 template<int n> inline 00355 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) 00356 { 00357 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00358 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); 00359 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); 00360 } 00361 00362 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) 00363 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } 00364 00365 inline void v_pack_store(schar* ptr, v_int16x8& a) 00366 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } 00367 00368 template<int n> inline 00369 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) 00370 { 00371 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 00372 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00373 return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), 00374 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); 00375 } 00376 template<int n> inline 00377 void v_rshr_pack_store(schar* ptr, const v_int16x8& a) 00378 { 00379 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 00380 __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); 00381 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); 00382 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1)); 00383 } 00384 00385 00386 // bit-wise "mask ? 
a : b" 00387 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) 00388 { 00389 return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); 00390 } 00391 00392 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) 00393 { 00394 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); 00395 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); 00396 __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); 00397 __m128i r = _mm_packs_epi32(a1, b1); 00398 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); 00399 } 00400 00401 inline void v_pack_store(ushort* ptr, const v_uint32x4& a) 00402 { 00403 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); 00404 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); 00405 __m128i r = _mm_packs_epi32(a1, a1); 00406 _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768))); 00407 } 00408 00409 template<int n> inline 00410 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) 00411 { 00412 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00413 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00414 __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); 00415 return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); 00416 } 00417 00418 template<int n> inline 00419 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) 00420 { 00421 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00422 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00423 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); 00424 _mm_storel_epi64((__m128i*)ptr, a2); 00425 } 00426 00427 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) 00428 { 00429 __m128i delta32 = _mm_set1_epi32(32768); 00430 __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); 00431 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); 00432 } 00433 00434 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) 00435 { 00436 __m128i delta32 = _mm_set1_epi32(32768); 00437 __m128i a1 = _mm_sub_epi32(a.val, delta32); 00438 __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); 00439 _mm_storel_epi64((__m128i*)ptr, r); 00440 } 00441 00442 template<int n> inline 00443 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) 00444 { 00445 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00446 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00447 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); 00448 __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); 00449 __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768)); 00450 return v_uint16x8(_mm_unpacklo_epi64(a2, b2)); 00451 } 00452 00453 template<int n> inline 00454 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) 00455 { 00456 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); 00457 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); 00458 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), 
_mm_set1_epi16(-32768)); 00459 _mm_storel_epi64((__m128i*)ptr, a2); 00460 } 00461 00462 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) 00463 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); } 00464 00465 inline void v_pack_store(short* ptr, const v_int32x4& a) 00466 { 00467 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); 00468 } 00469 00470 template<int n> inline 00471 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) 00472 { 00473 __m128i delta = _mm_set1_epi32(1 << (n-1)); 00474 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), 00475 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); 00476 } 00477 00478 template<int n> inline 00479 void v_rshr_pack_store(short* ptr, const v_int32x4& a) 00480 { 00481 __m128i delta = _mm_set1_epi32(1 << (n-1)); 00482 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); 00483 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); 00484 } 00485 00486 00487 // [a0 0 | b0 0] [a1 0 | b1 0] 00488 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) 00489 { 00490 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 00491 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 00492 return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); 00493 } 00494 00495 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) 00496 { 00497 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); 00498 _mm_storel_epi64((__m128i*)ptr, a1); 00499 } 00500 00501 // [a0 0 | b0 0] [a1 0 | b1 0] 00502 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) 00503 { 00504 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 00505 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 00506 return v_int32x4(_mm_unpacklo_epi32(v0, v1)); 00507 } 00508 00509 inline void v_pack_store(int* ptr, const v_int64x2& a) 00510 { 00511 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); 00512 _mm_storel_epi64((__m128i*)ptr, a1); 00513 } 00514 00515 template<int n> inline 00516 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) 00517 { 00518 uint64 delta = (uint64)1 << (n-1); 00519 v_uint64x2 delta2(delta, delta); 00520 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); 00521 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); 00522 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 00523 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 00524 return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); 00525 } 00526 00527 template<int n> inline 00528 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) 00529 { 00530 uint64 delta = (uint64)1 << (n-1); 00531 v_uint64x2 delta2(delta, delta); 00532 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); 00533 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); 00534 _mm_storel_epi64((__m128i*)ptr, a2); 00535 } 00536 00537 inline __m128i v_sign_epi64(__m128i a) 00538 { 00539 return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1 00540 } 00541 00542 inline __m128i v_srai_epi64(__m128i a, int imm) 00543 { 00544 __m128i smask = v_sign_epi64(a); 00545 return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask); 00546 } 00547 00548 template<int n> inline 00549 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) 00550 { 00551 int64 delta = (int64)1 << (n-1); 00552 v_int64x2 delta2(delta, delta); 00553 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); 00554 __m128i b1 = 
v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n); 00555 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 00556 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 00557 return v_int32x4(_mm_unpacklo_epi32(v0, v1)); 00558 } 00559 00560 template<int n> inline 00561 void v_rshr_pack_store(int* ptr, const v_int64x2& a) 00562 { 00563 int64 delta = (int64)1 << (n-1); 00564 v_int64x2 delta2(delta, delta); 00565 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); 00566 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); 00567 _mm_storel_epi64((__m128i*)ptr, a2); 00568 } 00569 00570 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, 00571 const v_float32x4& m1, const v_float32x4& m2, 00572 const v_float32x4& m3) 00573 { 00574 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); 00575 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); 00576 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); 00577 __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); 00578 00579 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3))); 00580 } 00581 00582 00583 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ 00584 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ 00585 { \ 00586 return _Tpvec(intrin(a.val, b.val)); \ 00587 } \ 00588 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ 00589 { \ 00590 a.val = intrin(a.val, b.val); \ 00591 return a; \ 00592 } 00593 00594 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) 00595 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) 00596 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) 00597 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) 00598 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) 00599 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) 00600 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) 00601 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) 00602 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) 00603 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) 00604 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) 00605 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) 00606 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) 00607 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) 00608 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) 00609 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) 00610 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) 00611 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) 00612 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) 00613 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) 00614 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) 00615 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) 00616 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) 00617 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) 00618 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) 00619 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) 00620 00621 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) 00622 { 00623 __m128i c0 = _mm_mul_epu32(a.val, b.val); 00624 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); 00625 __m128i d0 = _mm_unpacklo_epi32(c0, c1); 00626 __m128i d1 = _mm_unpackhi_epi32(c0, c1); 00627 
return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); 00628 } 00629 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) 00630 { 00631 __m128i c0 = _mm_mul_epu32(a.val, b.val); 00632 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); 00633 __m128i d0 = _mm_unpacklo_epi32(c0, c1); 00634 __m128i d1 = _mm_unpackhi_epi32(c0, c1); 00635 return v_int32x4(_mm_unpacklo_epi64(d0, d1)); 00636 } 00637 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) 00638 { 00639 a = a * b; 00640 return a; 00641 } 00642 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) 00643 { 00644 a = a * b; 00645 return a; 00646 } 00647 00648 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, 00649 v_int32x4& c, v_int32x4& d) 00650 { 00651 __m128i v0 = _mm_mullo_epi16(a.val, b.val); 00652 __m128i v1 = _mm_mulhi_epi16(a.val, b.val); 00653 c.val = _mm_unpacklo_epi16(v0, v1); 00654 d.val = _mm_unpackhi_epi16(v0, v1); 00655 } 00656 00657 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, 00658 v_uint32x4& c, v_uint32x4& d) 00659 { 00660 __m128i v0 = _mm_mullo_epi16(a.val, b.val); 00661 __m128i v1 = _mm_mulhi_epu16(a.val, b.val); 00662 c.val = _mm_unpacklo_epi16(v0, v1); 00663 d.val = _mm_unpackhi_epi16(v0, v1); 00664 } 00665 00666 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, 00667 v_uint64x2& c, v_uint64x2& d) 00668 { 00669 __m128i c0 = _mm_mul_epu32(a.val, b.val); 00670 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); 00671 c.val = _mm_unpacklo_epi64(c0, c1); 00672 d.val = _mm_unpackhi_epi64(c0, c1); 00673 } 00674 00675 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) 00676 { 00677 return v_int32x4(_mm_madd_epi16(a.val, b.val)); 00678 } 00679 00680 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ 00681 OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ 00682 OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ 00683 OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ 00684 inline _Tpvec operator ~ (const _Tpvec& a) \ 00685 { \ 00686 return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ 00687 } 00688 00689 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) 00690 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1)) 00691 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1)) 00692 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1)) 00693 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1)) 00694 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) 00695 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1)) 00696 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1)) 00697 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) 00698 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) 00699 00700 inline v_float32x4 v_sqrt(const v_float32x4& x) 00701 { return v_float32x4(_mm_sqrt_ps(x.val)); } 00702 00703 inline v_float32x4 v_invsqrt(const v_float32x4& x) 00704 { 00705 static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); 00706 __m128 t = x.val; 00707 __m128 h = _mm_mul_ps(t, _0_5); 00708 t = _mm_rsqrt_ps(t); 00709 t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); 00710 return v_float32x4(t); 00711 } 00712 00713 inline v_float64x2 v_sqrt(const v_float64x2& x) 00714 { return v_float64x2(_mm_sqrt_pd(x.val)); } 00715 00716 
inline v_float64x2 v_invsqrt(const v_float64x2& x) 00717 { 00718 static const __m128d v_1 = _mm_set1_pd(1.); 00719 return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); 00720 } 00721 00722 inline v_float32x4 v_abs(const v_float32x4& x) 00723 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } 00724 inline v_float64x2 v_abs(const v_float64x2& x) 00725 { 00726 return v_float64x2(_mm_and_pd(x.val, 00727 _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); 00728 } 00729 00730 // TODO: exp, log, sin, cos 00731 00732 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \ 00733 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ 00734 { \ 00735 return _Tpvec(intrin(a.val, b.val)); \ 00736 } 00737 00738 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) 00739 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8) 00740 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16) 00741 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16) 00742 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps) 00743 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps) 00744 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd) 00745 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) 00746 00747 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) 00748 { 00749 __m128i delta = _mm_set1_epi8((char)-128); 00750 return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), 00751 _mm_xor_si128(b.val, delta)))); 00752 } 00753 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) 00754 { 00755 __m128i delta = _mm_set1_epi8((char)-128); 00756 return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), 00757 _mm_xor_si128(b.val, delta)))); 00758 } 00759 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) 00760 { 00761 return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); 00762 } 00763 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) 00764 { 00765 return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); 00766 } 00767 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) 00768 { 00769 __m128i delta = _mm_set1_epi32((int)0x80000000); 00770 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); 00771 return v_uint32x4(v_select_si128(mask, b.val, a.val)); 00772 } 00773 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b) 00774 { 00775 __m128i delta = _mm_set1_epi32((int)0x80000000); 00776 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); 00777 return v_uint32x4(v_select_si128(mask, a.val, b.val)); 00778 } 00779 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) 00780 { 00781 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); 00782 } 00783 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) 00784 { 00785 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); 00786 } 00787 00788 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ 00789 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ 00790 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ 00791 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ 00792 { \ 00793 __m128i not_mask = _mm_set1_epi32(-1); \ 00794 return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ 00795 } \ 00796 inline _Tpsvec 
operator == (const _Tpsvec& a, const _Tpsvec& b) \ 00797 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ 00798 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ 00799 { \ 00800 __m128i not_mask = _mm_set1_epi32(-1); \ 00801 return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ 00802 } \ 00803 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ 00804 { \ 00805 __m128i smask = _mm_set1_##suffix(sbit); \ 00806 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ 00807 } \ 00808 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ 00809 { \ 00810 __m128i smask = _mm_set1_##suffix(sbit); \ 00811 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ 00812 } \ 00813 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ 00814 { \ 00815 __m128i smask = _mm_set1_##suffix(sbit); \ 00816 __m128i not_mask = _mm_set1_epi32(-1); \ 00817 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ 00818 return _Tpuvec(_mm_xor_si128(res, not_mask)); \ 00819 } \ 00820 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ 00821 { \ 00822 __m128i smask = _mm_set1_##suffix(sbit); \ 00823 __m128i not_mask = _mm_set1_epi32(-1); \ 00824 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ 00825 return _Tpuvec(_mm_xor_si128(res, not_mask)); \ 00826 } \ 00827 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ 00828 { \ 00829 return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ 00830 } \ 00831 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ 00832 { \ 00833 return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ 00834 } \ 00835 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ 00836 { \ 00837 __m128i not_mask = _mm_set1_epi32(-1); \ 00838 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ 00839 } \ 00840 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ 00841 { \ 00842 __m128i not_mask = _mm_set1_epi32(-1); \ 00843 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ 00844 } 00845 00846 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) 00847 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) 00848 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) 00849 00850 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ 00851 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ 00852 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ 00853 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ 00854 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ 00855 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ 00856 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ 00857 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ 00858 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ 00859 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ 00860 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ 00861 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ 00862 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } 00863 00864 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) 00865 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) 00866 00867 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) 00868 
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) 00869 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16) 00870 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16) 00871 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) 00872 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) 00873 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) 00874 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) 00875 00876 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ 00877 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ 00878 { \ 00879 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ 00880 } \ 00881 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ 00882 { \ 00883 __m128i smask = _mm_set1_epi32(smask32); \ 00884 __m128i a1 = _mm_xor_si128(a.val, smask); \ 00885 __m128i b1 = _mm_xor_si128(b.val, smask); \ 00886 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ 00887 } 00888 00889 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) 00890 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) 00891 00892 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) 00893 { 00894 return v_max(a, b) - v_min(a, b); 00895 } 00896 00897 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) 00898 { 00899 __m128i d = _mm_sub_epi32(a.val, b.val); 00900 __m128i m = _mm_cmpgt_epi32(b.val, a.val); 00901 return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); 00902 } 00903 00904 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ 00905 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ 00906 { \ 00907 _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \ 00908 return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \ 00909 } \ 00910 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ 00911 { \ 00912 _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ 00913 return _Tpvec(_mm_sqrt_##suffix(res)); \ 00914 } \ 00915 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ 00916 { \ 00917 _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ 00918 return _Tpvec(res); \ 00919 } \ 00920 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ 00921 { \ 00922 return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ 00923 } 00924 00925 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) 00926 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) 00927 00928 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ 00929 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ 00930 { \ 00931 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ 00932 } \ 00933 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ 00934 { \ 00935 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ 00936 } \ 00937 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ 00938 { \ 00939 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ 00940 } \ 00941 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ 00942 { \ 00943 return _Tpsvec(srai(a.val, imm)); \ 00944 } \ 00945 template<int imm> \ 00946 inline _Tpuvec 
v_shl(const _Tpuvec& a) \ 00947 { \ 00948 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ 00949 } \ 00950 template<int imm> \ 00951 inline _Tpsvec v_shl(const _Tpsvec& a) \ 00952 { \ 00953 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ 00954 } \ 00955 template<int imm> \ 00956 inline _Tpuvec v_shr(const _Tpuvec& a) \ 00957 { \ 00958 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ 00959 } \ 00960 template<int imm> \ 00961 inline _Tpsvec v_shr(const _Tpsvec& a) \ 00962 { \ 00963 return _Tpsvec(srai(a.val, imm)); \ 00964 } 00965 00966 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16) 00967 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32) 00968 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) 00969 00970 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ 00971 inline _Tpvec v_load(const _Tp* ptr) \ 00972 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ 00973 inline _Tpvec v_load_aligned(const _Tp* ptr) \ 00974 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ 00975 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ 00976 { \ 00977 return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ 00978 _mm_loadl_epi64((const __m128i*)ptr1))); \ 00979 } \ 00980 inline void v_store(_Tp* ptr, const _Tpvec& a) \ 00981 { _mm_storeu_si128((__m128i*)ptr, a.val); } \ 00982 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ 00983 { _mm_store_si128((__m128i*)ptr, a.val); } \ 00984 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ 00985 { _mm_storel_epi64((__m128i*)ptr, a.val); } \ 00986 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ 00987 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); } 00988 00989 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) 00990 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar) 00991 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort) 00992 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short) 00993 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned) 00994 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int) 00995 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64) 00996 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64) 00997 00998 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \ 00999 inline _Tpvec v_load(const _Tp* ptr) \ 01000 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \ 01001 inline _Tpvec v_load_aligned(const _Tp* ptr) \ 01002 { return _Tpvec(_mm_load_##suffix(ptr)); } \ 01003 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ 01004 { \ 01005 return _Tpvec(_mm_castsi128_##suffix( \ 01006 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ 01007 _mm_loadl_epi64((const __m128i*)ptr1)))); \ 01008 } \ 01009 inline void v_store(_Tp* ptr, const _Tpvec& a) \ 01010 { _mm_storeu_##suffix(ptr, a.val); } \ 01011 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ 01012 { _mm_store_##suffix(ptr, a.val); } \ 01013 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ 01014 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ 01015 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ 01016 { \ 01017 __m128i a1 = _mm_cast##suffix##_si128(a.val); \ 01018 _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \ 01019 } 01020 01021 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) 01022 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) 01023 01024 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, 
scalartype, func, scalar_func) \ 01025 inline scalartype v_reduce_##func(const _Tpvec& a) \ 01026 { \ 01027 scalartype CV_DECL_ALIGNED(16) buf[4]; \ 01028 v_store_aligned(buf, a); \ 01029 scalartype s0 = scalar_func(buf[0], buf[1]); \ 01030 scalartype s1 = scalar_func(buf[2], buf[3]); \ 01031 return scalar_func(s0, s1); \ 01032 } 01033 01034 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) 01035 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) 01036 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) 01037 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) 01038 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) 01039 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) 01040 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) 01041 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) 01042 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) 01043 01044 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ 01045 inline int v_signmask(const _Tpvec& a) \ 01046 { \ 01047 return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ 01048 } \ 01049 inline bool v_check_all(const _Tpvec& a) \ 01050 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ 01051 inline bool v_check_any(const _Tpvec& a) \ 01052 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } 01053 01054 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) 01055 inline __m128i v_packq_epi32(__m128i a) 01056 { 01057 __m128i b = _mm_packs_epi32(a, a); 01058 return _mm_packs_epi16(b, b); 01059 } 01060 01061 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) 01062 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) 01063 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) 01064 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) 01065 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) 01066 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) 01067 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) 01068 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) 01069 01070 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ 01071 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ 01072 { \ 01073 return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \ 01074 } 01075 01076 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) 01077 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128) 01078 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) 01079 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) 01080 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) 01081 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) 01082 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) 01083 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) 01084 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) 01085 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) 01086 01087 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ 01088 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ 01089 { \ 01090 __m128i z = 
_mm_setzero_si128(); \ 01091 b0.val = _mm_unpacklo_##suffix(a.val, z); \ 01092 b1.val = _mm_unpackhi_##suffix(a.val, z); \ 01093 } \ 01094 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ 01095 { \ 01096 __m128i z = _mm_setzero_si128(); \ 01097 return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ 01098 } \ 01099 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ 01100 { \ 01101 b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ 01102 b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ 01103 } \ 01104 inline _Tpwsvec v_load_expand(const _Tps* ptr) \ 01105 { \ 01106 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ 01107 return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ 01108 } 01109 01110 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) 01111 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) 01112 01113 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1) 01114 { 01115 __m128i z = _mm_setzero_si128(); 01116 b0.val = _mm_unpacklo_epi32(a.val, z); 01117 b1.val = _mm_unpackhi_epi32(a.val, z); 01118 } 01119 inline v_uint64x2 v_load_expand(const unsigned* ptr) 01120 { 01121 __m128i z = _mm_setzero_si128(); 01122 return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z)); 01123 } 01124 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1) 01125 { 01126 __m128i s = _mm_srai_epi32(a.val, 31); 01127 b0.val = _mm_unpacklo_epi32(a.val, s); 01128 b1.val = _mm_unpackhi_epi32(a.val, s); 01129 } 01130 inline v_int64x2 v_load_expand(const int* ptr) 01131 { 01132 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); 01133 __m128i s = _mm_srai_epi32(a, 31); 01134 return v_int64x2(_mm_unpacklo_epi32(a, s)); 01135 } 01136 01137 inline v_uint32x4 v_load_expand_q(const uchar* ptr) 01138 { 01139 __m128i z = _mm_setzero_si128(); 01140 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); 01141 return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); 01142 } 01143 01144 inline v_int32x4 v_load_expand_q(const schar* ptr) 01145 { 01146 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); 01147 a = _mm_unpacklo_epi8(a, a); 01148 a = _mm_unpacklo_epi8(a, a); 01149 return v_int32x4(_mm_srai_epi32(a, 24)); 01150 } 01151 01152 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ 01153 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ 01154 { \ 01155 b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \ 01156 b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \ 01157 } \ 01158 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ 01159 { \ 01160 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ 01161 return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \ 01162 } \ 01163 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ 01164 { \ 01165 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ 01166 return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \ 01167 } \ 01168 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ 01169 { \ 01170 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ 01171 c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \ 01172 d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \ 01173 } 01174 01175 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01176 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, 
OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01177 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01178 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01179 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01180 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01181 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) 01182 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) 01183 01184 template<int s, typename _Tpvec> 01185 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) 01186 { 01187 const int w = sizeof(typename _Tpvec::lane_type); 01188 const int n = _Tpvec::nlanes; 01189 __m128i ra, rb; 01190 ra = _mm_srli_si128(a.val, s*w); 01191 rb = _mm_slli_si128(b.val, (n-s)*w); 01192 return _Tpvec(_mm_or_si128(ra, rb)); 01193 } 01194 01195 inline v_int32x4 v_round(const v_float32x4& a) 01196 { return v_int32x4(_mm_cvtps_epi32(a.val)); } 01197 01198 inline v_int32x4 v_floor(const v_float32x4& a) 01199 { 01200 __m128i a1 = _mm_cvtps_epi32(a.val); 01201 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); 01202 return v_int32x4(_mm_add_epi32(a1, mask)); 01203 } 01204 01205 inline v_int32x4 v_ceil(const v_float32x4& a) 01206 { 01207 __m128i a1 = _mm_cvtps_epi32(a.val); 01208 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); 01209 return v_int32x4(_mm_sub_epi32(a1, mask)); 01210 } 01211 01212 inline v_int32x4 v_trunc(const v_float32x4& a) 01213 { return v_int32x4(_mm_cvttps_epi32(a.val)); } 01214 01215 inline v_int32x4 v_round(const v_float64x2& a) 01216 { return v_int32x4(_mm_cvtpd_epi32(a.val)); } 01217 01218 inline v_int32x4 v_floor(const v_float64x2& a) 01219 { 01220 __m128i a1 = _mm_cvtpd_epi32(a.val); 01221 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); 01222 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 01223 return v_int32x4(_mm_add_epi32(a1, mask)); 01224 } 01225 01226 inline v_int32x4 v_ceil(const v_float64x2& a) 01227 { 01228 __m128i a1 = _mm_cvtpd_epi32(a.val); 01229 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); 01230 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 01231 return v_int32x4(_mm_sub_epi32(a1, mask)); 01232 } 01233 01234 inline v_int32x4 v_trunc(const v_float64x2& a) 01235 { return v_int32x4(_mm_cvttpd_epi32(a.val)); } 01236 01237 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ 01238 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ 01239 const _Tpvec& a2, const _Tpvec& a3, \ 01240 _Tpvec& b0, _Tpvec& b1, \ 01241 _Tpvec& b2, _Tpvec& b3) \ 01242 { \ 01243 __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ 01244 __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ 01245 __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ 01246 __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \ 01247 \ 01248 b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ 01249 b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ 01250 b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ 01251 b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ 01252 } 01253 01254 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01255 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 01256 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, 
_mm_castps_si128, _mm_castsi128_ps) 01257 01258 // adopted from sse_utils.hpp 01259 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) 01260 { 01261 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); 01262 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); 01263 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32)); 01264 01265 __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); 01266 __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); 01267 __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); 01268 01269 __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); 01270 __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); 01271 __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); 01272 01273 __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); 01274 __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22); 01275 __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22)); 01276 01277 a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); 01278 b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); 01279 c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); 01280 } 01281 01282 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) 01283 { 01284 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... 01285 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... 01286 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... 01287 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... 01288 01289 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... 01290 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... 01291 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... 01292 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ... 01293 01294 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... 01295 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... 01296 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... 01297 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... 01298 01299 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... 01300 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... 01301 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... 01302 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... 
01303 01304 a.val = _mm_unpacklo_epi8(v0, v1); 01305 b.val = _mm_unpackhi_epi8(v0, v1); 01306 c.val = _mm_unpacklo_epi8(v2, v3); 01307 d.val = _mm_unpackhi_epi8(v2, v3); 01308 } 01309 01310 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) 01311 { 01312 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); 01313 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8)); 01314 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16)); 01315 01316 __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01)); 01317 __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02); 01318 __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02)); 01319 01320 __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11)); 01321 __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12); 01322 __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12)); 01323 01324 a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21)); 01325 b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22); 01326 c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22)); 01327 } 01328 01329 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) 01330 { 01331 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 01332 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ... 01333 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... 01334 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ... 01335 01336 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... 01337 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... 01338 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... 01339 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... 01340 01341 u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... 01342 u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... 01343 u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ... 01344 u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ... 
    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const
                                v_uint8x16& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
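    // v0, v2, v1, v3 hold the interleaved pixels 0-1, 2-3, 4-5 and 6-7
    // respectively, hence the store order below.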
    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return
        v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

//! @endcond

}

#endif
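A minimal usage sketch of the interleave/deinterleave and conversion helpers defined above, not part of the header itself. It assumes the universal intrinsics are pulled in through opencv2/core/hal/intrin.hpp on an SSE2-capable target; the function and buffer names are illustrative, and v_setall_u8 plus the saturating operator- for v_uint8x16 are assumed to be provided elsewhere in the same API.

#include "opencv2/core/hal/intrin.hpp"

// Invert 16 interleaved BGR pixels (48 bytes) in place.
void invert_bgr16(uchar* bgr)
{
    cv::v_uint8x16 b, g, r;
    cv::v_load_deinterleave(bgr, b, g, r);            // split into three planes
    cv::v_uint8x16 k = cv::v_setall_u8(255);          // 255 in every lane
    cv::v_store_interleave(bgr, k - b, k - g, k - r); // re-pack the inverted planes
}

// Widen packed ints to doubles with the converters above
// (only the low two lanes are converted by _mm_cvtepi32_pd).
cv::v_float64x2 low_lanes_as_f64(const cv::v_int32x4& v)
{
    return cv::v_cvt_f64(v);
}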
