Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do

intrin_sse.hpp Source File

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                          License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
00016 // Copyright (C) 2015, Itseez Inc., all rights reserved.
00017 // Third party copyrights are property of their respective owners.
00018 //
00019 // Redistribution and use in source and binary forms, with or without modification,
00020 // are permitted provided that the following conditions are met:
00021 //
00022 //   * Redistribution's of source code must retain the above copyright notice,
00023 //     this list of conditions and the following disclaimer.
00024 //
00025 //   * Redistribution's in binary form must reproduce the above copyright notice,
00026 //     this list of conditions and the following disclaimer in the documentation
00027 //     and/or other materials provided with the distribution.
00028 //
00029 //   * The name of the copyright holders may not be used to endorse or promote products
00030 //     derived from this software without specific prior written permission.
00031 //
00032 // This software is provided by the copyright holders and contributors "as is" and
00033 // any express or implied warranties, including, but not limited to, the implied
00034 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00035 // In no event shall the Intel Corporation or contributors be liable for any direct,
00036 // indirect, incidental, special, exemplary, or consequential damages
00037 // (including, but not limited to, procurement of substitute goods or services;
00038 // loss of use, data, or profits; or business interruption) however caused
00039 // and on any theory of liability, whether in contract, strict liability,
00040 // or tort (including negligence or otherwise) arising in any way out of
00041 // the use of this software, even if advised of the possibility of such damage.
00042 //
00043 //M*/
00044 
00045 #ifndef __OPENCV_HAL_SSE_HPP__
00046 #define __OPENCV_HAL_SSE_HPP__
00047 
00048 #include <algorithm>
00049 
00050 #define CV_SIMD128 1
00051 #define CV_SIMD128_64F 1
00052 
00053 namespace cv
00054 {
00055 
00056 //! @cond IGNORED
00057 
00058 struct v_uint8x16
00059 {
00060     typedef uchar lane_type;
00061     enum { nlanes = 16 };
00062 
00063     v_uint8x16() {}
00064     explicit v_uint8x16(__m128i v) : val(v) {}
00065     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
00066                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
00067     {
00068         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
00069                             (char)v4, (char)v5, (char)v6, (char)v7,
00070                             (char)v8, (char)v9, (char)v10, (char)v11,
00071                             (char)v12, (char)v13, (char)v14, (char)v15);
00072     }
00073     uchar get0() const
00074     {
00075         return (uchar)_mm_cvtsi128_si32(val);
00076     }
00077 
00078     __m128i val;
00079 };
00080 
00081 struct v_int8x16
00082 {
00083     typedef schar lane_type;
00084     enum { nlanes = 16 };
00085 
00086     v_int8x16() {}
00087     explicit v_int8x16(__m128i v) : val(v) {}
00088     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
00089               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
00090     {
00091         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
00092                             (char)v4, (char)v5, (char)v6, (char)v7,
00093                             (char)v8, (char)v9, (char)v10, (char)v11,
00094                             (char)v12, (char)v13, (char)v14, (char)v15);
00095     }
00096     schar get0() const
00097     {
00098         return (schar)_mm_cvtsi128_si32(val);
00099     }
00100 
00101     __m128i val;
00102 };
00103 
00104 struct v_uint16x8
00105 {
00106     typedef ushort lane_type;
00107     enum { nlanes = 8 };
00108 
00109     v_uint16x8() {}
00110     explicit v_uint16x8(__m128i v) : val(v) {}
00111     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
00112     {
00113         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
00114                              (short)v4, (short)v5, (short)v6, (short)v7);
00115     }
00116     ushort get0() const
00117     {
00118         return (ushort)_mm_cvtsi128_si32(val);
00119     }
00120 
00121     __m128i val;
00122 };
00123 
00124 struct v_int16x8
00125 {
00126     typedef short lane_type;
00127     enum { nlanes = 8 };
00128 
00129     v_int16x8() {}
00130     explicit v_int16x8(__m128i v) : val(v) {}
00131     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
00132     {
00133         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
00134                              (short)v4, (short)v5, (short)v6, (short)v7);
00135     }
00136     short get0() const
00137     {
00138         return (short)_mm_cvtsi128_si32(val);
00139     }
00140     __m128i val;
00141 };
00142 
00143 struct v_uint32x4
00144 {
00145     typedef unsigned lane_type;
00146     enum { nlanes = 4 };
00147 
00148     v_uint32x4() {}
00149     explicit v_uint32x4(__m128i v) : val(v) {}
00150     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
00151     {
00152         val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
00153     }
00154     unsigned get0() const
00155     {
00156         return (unsigned)_mm_cvtsi128_si32(val);
00157     }
00158     __m128i val;
00159 };
00160 
00161 struct v_int32x4
00162 {
00163     typedef int lane_type;
00164     enum { nlanes = 4 };
00165 
00166     v_int32x4() {}
00167     explicit v_int32x4(__m128i v) : val(v) {}
00168     v_int32x4(int v0, int v1, int v2, int v3)
00169     {
00170         val = _mm_setr_epi32(v0, v1, v2, v3);
00171     }
00172     int get0() const
00173     {
00174         return _mm_cvtsi128_si32(val);
00175     }
00176     __m128i val;
00177 };
00178 
00179 struct v_float32x4
00180 {
00181     typedef float lane_type;
00182     enum { nlanes = 4 };
00183 
00184     v_float32x4() {}
00185     explicit v_float32x4(__m128 v) : val(v) {}
00186     v_float32x4(float v0, float v1, float v2, float v3)
00187     {
00188         val = _mm_setr_ps(v0, v1, v2, v3);
00189     }
00190     float get0() const
00191     {
00192         return _mm_cvtss_f32(val);
00193     }
00194     __m128 val;
00195 };
00196 
00197 struct v_uint64x2
00198 {
00199     typedef uint64 lane_type;
00200     enum { nlanes = 2 };
00201 
00202     v_uint64x2() {}
00203     explicit v_uint64x2(__m128i v) : val(v) {}
00204     v_uint64x2(uint64 v0, uint64 v1)
00205     {
00206         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
00207     }
00208     uint64 get0() const
00209     {
00210         int a = _mm_cvtsi128_si32(val);
00211         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
00212         return (unsigned)a | ((uint64)(unsigned)b << 32);
00213     }
00214     __m128i val;
00215 };
00216 
00217 struct v_int64x2
00218 {
00219     typedef int64 lane_type;
00220     enum { nlanes = 2 };
00221 
00222     v_int64x2() {}
00223     explicit v_int64x2(__m128i v) : val(v) {}
00224     v_int64x2(int64 v0, int64 v1)
00225     {
00226         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
00227     }
00228     int64 get0() const
00229     {
00230         int a = _mm_cvtsi128_si32(val);
00231         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
00232         return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
00233     }
00234     __m128i val;
00235 };
00236 
00237 struct v_float64x2
00238 {
00239     typedef double lane_type;
00240     enum { nlanes = 2 };
00241 
00242     v_float64x2() {}
00243     explicit v_float64x2(__m128d v) : val(v) {}
00244     v_float64x2(double v0, double v1)
00245     {
00246         val = _mm_setr_pd(v0, v1);
00247     }
00248     double get0() const
00249     {
00250         return _mm_cvtsd_f64(val);
00251     }
00252     __m128d val;
00253 };
00254 
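// Usage sketch for the register wrapper types above; the lane values are
// illustrative only, and lanes are listed starting from lane 0.
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);   // four float lanes in one __m128
//     float f0 = v.get0();                 // reads lane 0 -> 1.f
//     v_uint8x16 z;                        // default ctor leaves lanes uninitialized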
00255 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
00256 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
00257 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
00258 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
00259 { return _Tpvec(cast(a.val)); }
00260 
00261 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
00262 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
00263 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
00264 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
00265 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
00266 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
00267 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
00268 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
00269 
00270 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
00271 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
00272 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
00273 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
00274 
00275 template<typename _Tpvec> inline
00276 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
00277 template<typename _Tpvec> inline
00278 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
00279 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
00280 { return v_float32x4(_mm_castsi128_ps(a.val)); }
00281 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
00282 { return v_float32x4(_mm_castsi128_ps(a.val)); }
00283 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
00284 { return v_float64x2(_mm_castsi128_pd(a.val)); }
00285 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
00286 { return v_float64x2(_mm_castsi128_pd(a.val)); }
00287 
00288 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
00289 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
00290 { return _Tpvec(_mm_castps_si128(a.val)); } \
00291 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
00292 { return _Tpvec(_mm_castpd_si128(a.val)); }
00293 
00294 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
00295 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
00296 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
00297 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
00298 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
00299 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
00300 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
00301 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
00302 
00303 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
00304 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
00305 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
00306 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
00307 
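// Usage sketch for the initializers and reinterpret casts above; values are
// illustrative only.
//
//     v_int32x4   zero = v_setzero_s32();           // (0, 0, 0, 0)
//     v_float32x4 half = v_setall_f32(0.5f);        // (0.5f, 0.5f, 0.5f, 0.5f)
//     v_int32x4   bits = v_reinterpret_as_s32(half); // same 128 bits, viewed as s32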
00308 //////////////// PACK ///////////////
00309 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
00310 {
00311     __m128i delta = _mm_set1_epi16(255);
00312     return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
00313                                        _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
00314 }
00315 
00316 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
00317 {
00318     __m128i delta = _mm_set1_epi16(255);
00319     __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
00320     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
00321 }
00322 
00323 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
00324 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
00325 
00326 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
00327 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
00328 
00329 template<int n> inline
00330 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
00331 {
00332     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
00333     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
00334     return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
00335                                        _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
00336 }
00337 
00338 template<int n> inline
00339 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
00340 {
00341     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
00342     __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
00343     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
00344 }
00345 
00346 template<int n> inline
00347 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
00348 {
00349     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
00350     return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
00351                                        _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
00352 }
00353 
00354 template<int n> inline
00355 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
00356 {
00357     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
00358     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
00359     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
00360 }
00361 
00362 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
00363 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
00364 
00365 inline void v_pack_store(schar* ptr, const v_int16x8& a)
00366 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
00367 
00368 template<int n> inline
00369 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
00370 {
00371     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
00372     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
00373     return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
00374                                      _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
00375 }
00376 template<int n> inline
00377 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
00378 {
00379     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
00380     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
00381     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
00382     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
00383 }
00384 
00385 
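// Usage sketch for the 16 -> 8 bit pack helpers above. 'src0' and 'src1' are
// placeholder pointers to 8 ushort values each, 'dst' to 16 uchar slots.
//
//     v_uint16x8 a = v_load(src0), b = v_load(src1);
//     v_store(dst, v_pack(a, b));          // saturate each lane to [0, 255]
//     v_uint8x16 r = v_rshr_pack<2>(a, b); // (x + 2) >> 2 with rounding, then saturate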
00386 // bit-wise "mask ? a : b"
00387 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
00388 {
00389     return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
00390 }
00391 
00392 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
00393 {
00394     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
00395     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
00396     __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
00397     __m128i r = _mm_packs_epi32(a1, b1);
00398     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
00399 }
00400 
00401 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
00402 {
00403     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
00404     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
00405     __m128i r = _mm_packs_epi32(a1, a1);
00406     _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
00407 }
00408 
00409 template<int n> inline
00410 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
00411 {
00412     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
00413     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
00414     __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
00415     return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
00416 }
00417 
00418 template<int n> inline
00419 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
00420 {
00421     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
00422     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
00423     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
00424     _mm_storel_epi64((__m128i*)ptr, a2);
00425 }
00426 
00427 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
00428 {
00429     __m128i delta32 = _mm_set1_epi32(32768);
00430     __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
00431     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
00432 }
00433 
00434 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
00435 {
00436     __m128i delta32 = _mm_set1_epi32(32768);
00437     __m128i a1 = _mm_sub_epi32(a.val, delta32);
00438     __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
00439     _mm_storel_epi64((__m128i*)ptr, r);
00440 }
00441 
00442 template<int n> inline
00443 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
00444 {
00445     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
00446     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
00447     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
00448     __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
00449     __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
00450     return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
00451 }
00452 
00453 template<int n> inline
00454 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
00455 {
00456     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
00457     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
00458     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
00459     _mm_storel_epi64((__m128i*)ptr, a2);
00460 }
00461 
00462 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
00463 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
00464 
00465 inline void v_pack_store(short* ptr, const v_int32x4& a)
00466 {
00467     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
00468 }
00469 
00470 template<int n> inline
00471 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
00472 {
00473     __m128i delta = _mm_set1_epi32(1 << (n-1));
00474     return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
00475                                      _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
00476 }
00477 
00478 template<int n> inline
00479 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
00480 {
00481     __m128i delta = _mm_set1_epi32(1 << (n-1));
00482     __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
00483     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
00484 }
00485 
00486 
00487 // [a0 0 | b0 0]  [a1 0 | b1 0]
00488 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
00489 {
00490     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
00491     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
00492     return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
00493 }
00494 
00495 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
00496 {
00497     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
00498     _mm_storel_epi64((__m128i*)ptr, a1);
00499 }
00500 
00501 // [a0 0 | b0 0]  [a1 0 | b1 0]
00502 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
00503 {
00504     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
00505     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
00506     return v_int32x4(_mm_unpacklo_epi32(v0, v1));
00507 }
00508 
00509 inline void v_pack_store(int* ptr, const v_int64x2& a)
00510 {
00511     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
00512     _mm_storel_epi64((__m128i*)ptr, a1);
00513 }
00514 
00515 template<int n> inline
00516 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
00517 {
00518     uint64 delta = (uint64)1 << (n-1);
00519     v_uint64x2 delta2(delta, delta);
00520     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
00521     __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
00522     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
00523     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
00524     return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
00525 }
00526 
00527 template<int n> inline
00528 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
00529 {
00530     uint64 delta = (uint64)1 << (n-1);
00531     v_uint64x2 delta2(delta, delta);
00532     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
00533     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
00534     _mm_storel_epi64((__m128i*)ptr, a2);
00535 }
00536 
00537 inline __m128i v_sign_epi64(__m128i a)
00538 {
00539     return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
00540 }
00541 
00542 inline __m128i v_srai_epi64(__m128i a, int imm)
00543 {
00544     __m128i smask = v_sign_epi64(a);
00545     return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
00546 }
00547 
00548 template<int n> inline
00549 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
00550 {
00551     int64 delta = (int64)1 << (n-1);
00552     v_int64x2 delta2(delta, delta);
00553     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
00554     __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
00555     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
00556     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
00557     return v_int32x4(_mm_unpacklo_epi32(v0, v1));
00558 }
00559 
00560 template<int n> inline
00561 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
00562 {
00563     int64 delta = (int64)1 << (n-1);
00564     v_int64x2 delta2(delta, delta);
00565     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
00566     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
00567     _mm_storel_epi64((__m128i*)ptr, a2);
00568 }
00569 
00570 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
00571                             const v_float32x4& m1, const v_float32x4& m2,
00572                             const v_float32x4& m3)
00573 {
00574     __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
00575     __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
00576     __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
00577     __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
00578 
00579     return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
00580 }
00581 
00582 
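// Usage sketch for v_matmul: a 4x4 * 4x1 product where m0..m3 hold the four
// matrix columns. 'M' (a column-major array of 16 floats) and 'x' are placeholders.
//
//     v_float32x4 c0 = v_load(M),     c1 = v_load(M + 4),
//                 c2 = v_load(M + 8), c3 = v_load(M + 12);
//     v_float32x4 y = v_matmul(x, c0, c1, c2, c3);  // y = c0*x0 + c1*x1 + c2*x2 + c3*x3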
00583 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
00584     inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
00585     { \
00586         return _Tpvec(intrin(a.val, b.val)); \
00587     } \
00588     inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
00589     { \
00590         a.val = intrin(a.val, b.val); \
00591         return a; \
00592     }
00593 
00594 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
00595 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
00596 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
00597 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
00598 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
00599 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
00600 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
00601 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
00602 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
00603 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
00604 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
00605 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
00606 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
00607 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
00608 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
00609 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
00610 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
00611 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
00612 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
00613 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
00614 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
00615 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
00616 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
00617 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
00618 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
00619 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
00620 
00621 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
00622 {
00623     __m128i c0 = _mm_mul_epu32(a.val, b.val);
00624     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
00625     __m128i d0 = _mm_unpacklo_epi32(c0, c1);
00626     __m128i d1 = _mm_unpackhi_epi32(c0, c1);
00627     return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
00628 }
00629 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
00630 {
00631     __m128i c0 = _mm_mul_epu32(a.val, b.val);
00632     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
00633     __m128i d0 = _mm_unpacklo_epi32(c0, c1);
00634     __m128i d1 = _mm_unpackhi_epi32(c0, c1);
00635     return v_int32x4(_mm_unpacklo_epi64(d0, d1));
00636 }
00637 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
00638 {
00639     a = a * b;
00640     return a;
00641 }
00642 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
00643 {
00644     a = a * b;
00645     return a;
00646 }
00647 
00648 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
00649                          v_int32x4& c, v_int32x4& d)
00650 {
00651     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
00652     __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
00653     c.val = _mm_unpacklo_epi16(v0, v1);
00654     d.val = _mm_unpackhi_epi16(v0, v1);
00655 }
00656 
00657 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
00658                          v_uint32x4& c, v_uint32x4& d)
00659 {
00660     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
00661     __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
00662     c.val = _mm_unpacklo_epi16(v0, v1);
00663     d.val = _mm_unpackhi_epi16(v0, v1);
00664 }
00665 
00666 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
00667                          v_uint64x2& c, v_uint64x2& d)
00668 {
00669     __m128i c0 = _mm_mul_epu32(a.val, b.val);
00670     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
00671     c.val = _mm_unpacklo_epi64(c0, c1);
00672     d.val = _mm_unpackhi_epi64(c0, c1);
00673 }
00674 
00675 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
00676 {
00677     return v_int32x4(_mm_madd_epi16(a.val, b.val));
00678 }
00679 
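// Usage sketch for the element-wise arithmetic and v_dotprod above. 'p' and
// 'q' are placeholder pointers to at least 8 short values each.
//
//     v_int16x8 a = v_load(p), b = v_load(q);
//     v_int16x8 s = a + b;             // saturating per-lane add (_mm_adds_epi16)
//     v_int32x4 d = v_dotprod(a, b);   // d[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]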
00680 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
00681     OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
00682     OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
00683     OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
00684     inline _Tpvec operator ~ (const _Tpvec& a) \
00685     { \
00686         return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
00687     }
00688 
00689 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
00690 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
00691 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
00692 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
00693 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
00694 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
00695 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
00696 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
00697 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
00698 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
00699 
00700 inline v_float32x4 v_sqrt(const v_float32x4& x)
00701 { return v_float32x4(_mm_sqrt_ps(x.val)); }
00702 
00703 inline v_float32x4 v_invsqrt(const v_float32x4& x)
00704 {
00705     static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
00706     __m128 t = x.val;
00707     __m128 h = _mm_mul_ps(t, _0_5);
00708     t = _mm_rsqrt_ps(t);
00709     t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
00710     return v_float32x4(t);
00711 }
00712 
00713 inline v_float64x2 v_sqrt(const v_float64x2& x)
00714 { return v_float64x2(_mm_sqrt_pd(x.val)); }
00715 
00716 inline v_float64x2 v_invsqrt(const v_float64x2& x)
00717 {
00718     static const __m128d v_1 = _mm_set1_pd(1.);
00719     return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
00720 }
00721 
00722 inline v_float32x4 v_abs(const v_float32x4& x)
00723 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
00724 inline v_float64x2 v_abs(const v_float64x2& x)
00725 {
00726     return v_float64x2(_mm_and_pd(x.val,
00727         _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
00728 }
00729 
00730 // TODO: exp, log, sin, cos
00731 
00732 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
00733 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
00734 { \
00735     return _Tpvec(intrin(a.val, b.val)); \
00736 }
00737 
00738 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
00739 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
00740 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
00741 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
00742 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
00743 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
00744 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
00745 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
00746 
00747 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
00748 {
00749     __m128i delta = _mm_set1_epi8((char)-128);
00750     return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
00751                                                        _mm_xor_si128(b.val, delta))));
00752 }
00753 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
00754 {
00755     __m128i delta = _mm_set1_epi8((char)-128);
00756     return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
00757                                                        _mm_xor_si128(b.val, delta))));
00758 }
00759 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
00760 {
00761     return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
00762 }
00763 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
00764 {
00765     return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
00766 }
00767 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
00768 {
00769     __m128i delta = _mm_set1_epi32((int)0x80000000);
00770     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
00771     return v_uint32x4(v_select_si128(mask, b.val, a.val));
00772 }
00773 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
00774 {
00775     __m128i delta = _mm_set1_epi32((int)0x80000000);
00776     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
00777     return v_uint32x4(v_select_si128(mask, a.val, b.val));
00778 }
00779 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
00780 {
00781     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
00782 }
00783 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
00784 {
00785     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
00786 }
00787 
00788 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
00789 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
00790 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
00791 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
00792 { \
00793     __m128i not_mask = _mm_set1_epi32(-1); \
00794     return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
00795 } \
00796 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
00797 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
00798 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
00799 { \
00800     __m128i not_mask = _mm_set1_epi32(-1); \
00801     return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
00802 } \
00803 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
00804 { \
00805     __m128i smask = _mm_set1_##suffix(sbit); \
00806     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
00807 } \
00808 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
00809 { \
00810     __m128i smask = _mm_set1_##suffix(sbit); \
00811     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
00812 } \
00813 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
00814 { \
00815     __m128i smask = _mm_set1_##suffix(sbit); \
00816     __m128i not_mask = _mm_set1_epi32(-1); \
00817     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
00818     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
00819 } \
00820 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
00821 { \
00822     __m128i smask = _mm_set1_##suffix(sbit); \
00823     __m128i not_mask = _mm_set1_epi32(-1); \
00824     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
00825     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
00826 } \
00827 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
00828 { \
00829     return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
00830 } \
00831 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
00832 { \
00833     return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
00834 } \
00835 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
00836 { \
00837     __m128i not_mask = _mm_set1_epi32(-1); \
00838     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
00839 } \
00840 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
00841 { \
00842     __m128i not_mask = _mm_set1_epi32(-1); \
00843     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
00844 }
00845 
00846 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
00847 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
00848 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
00849 
00850 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
00851 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
00852 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
00853 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
00854 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
00855 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
00856 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
00857 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
00858 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
00859 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
00860 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
00861 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
00862 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
00863 
00864 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
00865 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
00866 
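// Usage sketch for the comparison operators above: each result lane is all
// ones (true) or all zeros (false), so it can be used directly as a bit mask.
//
//     v_float32x4 a = v_setall_f32(1.f), b = v_setall_f32(2.f);
//     v_float32x4 m = a < b;           // 0xffffffff in every lane here
//     v_float32x4 r = m & a;           // keeps a where the mask is set, else 0.f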
00867 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
00868 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
00869 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
00870 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
00871 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
00872 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
00873 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
00874 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
00875 
00876 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
00877 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
00878 { \
00879     return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
00880 } \
00881 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
00882 { \
00883     __m128i smask = _mm_set1_epi32(smask32); \
00884     __m128i a1 = _mm_xor_si128(a.val, smask); \
00885     __m128i b1 = _mm_xor_si128(b.val, smask); \
00886     return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
00887 }
00888 
00889 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
00890 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
00891 
00892 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
00893 {
00894     return v_max(a, b) - v_min(a, b);
00895 }
00896 
00897 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
00898 {
00899     __m128i d = _mm_sub_epi32(a.val, b.val);
00900     __m128i m = _mm_cmpgt_epi32(b.val, a.val);
00901     return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
00902 }
00903 
00904 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
00905 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
00906 { \
00907     _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
00908     return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
00909 } \
00910 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
00911 { \
00912     _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
00913     return _Tpvec(_mm_sqrt_##suffix(res)); \
00914 } \
00915 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
00916 { \
00917     _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
00918     return _Tpvec(res); \
00919 } \
00920 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
00921 { \
00922     return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
00923 }
00924 
00925 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
00926 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
00927 
00928 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
00929 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
00930 { \
00931     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
00932 } \
00933 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
00934 { \
00935     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
00936 } \
00937 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
00938 { \
00939     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
00940 } \
00941 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
00942 { \
00943     return _Tpsvec(srai(a.val, imm)); \
00944 } \
00945 template<int imm> \
00946 inline _Tpuvec v_shl(const _Tpuvec& a) \
00947 { \
00948     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
00949 } \
00950 template<int imm> \
00951 inline _Tpsvec v_shl(const _Tpsvec& a) \
00952 { \
00953     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
00954 } \
00955 template<int imm> \
00956 inline _Tpuvec v_shr(const _Tpuvec& a) \
00957 { \
00958     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
00959 } \
00960 template<int imm> \
00961 inline _Tpsvec v_shr(const _Tpsvec& a) \
00962 { \
00963     return _Tpsvec(srai(a.val, imm)); \
00964 }
00965 
00966 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
00967 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
00968 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
00969 
00970 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
00971 inline _Tpvec v_load(const _Tp* ptr) \
00972 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
00973 inline _Tpvec v_load_aligned(const _Tp* ptr) \
00974 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
00975 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
00976 { \
00977     return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
00978                                      _mm_loadl_epi64((const __m128i*)ptr1))); \
00979 } \
00980 inline void v_store(_Tp* ptr, const _Tpvec& a) \
00981 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
00982 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
00983 { _mm_store_si128((__m128i*)ptr, a.val); } \
00984 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
00985 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
00986 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
00987 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
00988 
00989 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
00990 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
00991 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
00992 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
00993 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
00994 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
00995 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
00996 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
00997 
00998 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
00999 inline _Tpvec v_load(const _Tp* ptr) \
01000 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
01001 inline _Tpvec v_load_aligned(const _Tp* ptr) \
01002 { return _Tpvec(_mm_load_##suffix(ptr)); } \
01003 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
01004 { \
01005     return _Tpvec(_mm_castsi128_##suffix( \
01006         _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
01007                            _mm_loadl_epi64((const __m128i*)ptr1)))); \
01008 } \
01009 inline void v_store(_Tp* ptr, const _Tpvec& a) \
01010 { _mm_storeu_##suffix(ptr, a.val); } \
01011 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
01012 { _mm_store_##suffix(ptr, a.val); } \
01013 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
01014 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
01015 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
01016 { \
01017     __m128i a1 = _mm_cast##suffix##_si128(a.val); \
01018     _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
01019 }
01020 
01021 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
01022 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
01023 
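// Usage sketch for the load/store helpers above. 'buf' is a placeholder float
// array; only v_load_aligned/v_store_aligned require 16-byte alignment.
//
//     float buf[4] = { 0.f, 1.f, 2.f, 3.f };
//     v_float32x4 v = v_load(buf);     // unaligned load
//     v_store(buf, v * v);             // stores (0, 1, 4, 9) back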
01024 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
01025 inline scalartype v_reduce_##func(const _Tpvec& a) \
01026 { \
01027     scalartype CV_DECL_ALIGNED(16) buf[4]; \
01028     v_store_aligned(buf, a); \
01029     scalartype s0 = scalar_func(buf[0], buf[1]); \
01030     scalartype s1 = scalar_func(buf[2], buf[3]); \
01031     return scalar_func(s0, s1); \
01032 }
01033 
01034 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
01035 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
01036 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
01037 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
01038 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
01039 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
01040 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
01041 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
01042 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
01043 
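// Usage sketch for the horizontal reductions above.
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//     float s = v_reduce_sum(v);       // 10.f
//     float m = v_reduce_max(v);       // 4.f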
01044 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
01045 inline int v_signmask(const _Tpvec& a) \
01046 { \
01047     return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
01048 } \
01049 inline bool v_check_all(const _Tpvec& a) \
01050 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
01051 inline bool v_check_any(const _Tpvec& a) \
01052 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
01053 
01054 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
01055 inline __m128i v_packq_epi32(__m128i a)
01056 {
01057     __m128i b = _mm_packs_epi32(a, a);
01058     return _mm_packs_epi16(b, b);
01059 }
01060 
01061 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
01062 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
01063 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
01064 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
01065 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
01066 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
01067 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
01068 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
01069 
01070 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
01071 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
01072 { \
01073     return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
01074 }
01075 
01076 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
01077 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
01078 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
01079 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
01080 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
01081 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
01082 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
01083 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
01084 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
01085 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
01086 
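// Usage sketch for v_select: per-lane "mask ? a : b", where the mask normally
// comes from one of the comparison operators above.
//
//     v_int32x4 a = v_setall_s32(1), b = v_setall_s32(-1);
//     v_int32x4 r = v_select(a > b, a, b);   // picks a in every lane here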
01087 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
01088 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
01089 { \
01090     __m128i z = _mm_setzero_si128(); \
01091     b0.val = _mm_unpacklo_##suffix(a.val, z); \
01092     b1.val = _mm_unpackhi_##suffix(a.val, z); \
01093 } \
01094 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
01095 { \
01096     __m128i z = _mm_setzero_si128(); \
01097     return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
01098 } \
01099 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
01100 { \
01101     b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
01102     b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
01103 } \
01104 inline _Tpwsvec v_load_expand(const _Tps* ptr) \
01105 { \
01106     __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
01107     return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
01108 }
01109 
01110 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
01111 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
01112 
01113 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
01114 {
01115     __m128i z = _mm_setzero_si128();
01116     b0.val = _mm_unpacklo_epi32(a.val, z);
01117     b1.val = _mm_unpackhi_epi32(a.val, z);
01118 }
01119 inline v_uint64x2 v_load_expand(const unsigned* ptr)
01120 {
01121     __m128i z = _mm_setzero_si128();
01122     return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
01123 }
01124 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
01125 {
01126     __m128i s = _mm_srai_epi32(a.val, 31);
01127     b0.val = _mm_unpacklo_epi32(a.val, s);
01128     b1.val = _mm_unpackhi_epi32(a.val, s);
01129 }
01130 inline v_int64x2 v_load_expand(const int* ptr)
01131 {
01132     __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
01133     __m128i s = _mm_srai_epi32(a, 31);
01134     return v_int64x2(_mm_unpacklo_epi32(a, s));
01135 }
01136 
01137 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
01138 {
01139     __m128i z = _mm_setzero_si128();
01140     __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
01141     return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
01142 }
01143 
01144 inline v_int32x4 v_load_expand_q(const schar* ptr)
01145 {
01146     __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
01147     a = _mm_unpacklo_epi8(a, a);
01148     a = _mm_unpacklo_epi8(a, a);
01149     return v_int32x4(_mm_srai_epi32(a, 24));
01150 }
01151 
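// Usage sketch for the widening loads above. 'ptr8' is a placeholder pointer
// to at least 8 uchar values.
//
//     v_uint16x8 w = v_load_expand(ptr8);    // 8 x u8  -> 8 x u16 (zero-extended)
//     v_uint32x4 q = v_load_expand_q(ptr8);  // first 4 x u8 -> 4 x u32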
01152 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
01153 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
01154 { \
01155     b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
01156     b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
01157 } \
01158 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
01159 { \
01160     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
01161     return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
01162 } \
01163 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
01164 { \
01165     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
01166     return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
01167 } \
01168 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
01169 { \
01170     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
01171     c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
01172     d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
01173 }
01174 
01175 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01176 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01177 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01178 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01179 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01180 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01181 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
01182 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
01183 
01184 template<int s, typename _Tpvec>
01185 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
01186 {
01187     const int w = sizeof(typename _Tpvec::lane_type);
01188     const int n = _Tpvec::nlanes;
01189     __m128i ra, rb;
01190     ra = _mm_srli_si128(a.val, s*w);
01191     rb = _mm_slli_si128(b.val, (n-s)*w);
01192     return _Tpvec(_mm_or_si128(ra, rb));
01193 }
01194 
01195 inline v_int32x4 v_round(const v_float32x4& a)
01196 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
01197 
01198 inline v_int32x4 v_floor(const v_float32x4& a)
01199 {
01200     __m128i a1 = _mm_cvtps_epi32(a.val);
01201     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
01202     return v_int32x4(_mm_add_epi32(a1, mask));
01203 }
01204 
01205 inline v_int32x4 v_ceil(const v_float32x4& a)
01206 {
01207     __m128i a1 = _mm_cvtps_epi32(a.val);
01208     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
01209     return v_int32x4(_mm_sub_epi32(a1, mask));
01210 }
01211 
01212 inline v_int32x4 v_trunc(const v_float32x4& a)
01213 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
01214 
01215 inline v_int32x4 v_round(const v_float64x2& a)
01216 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
01217 
01218 inline v_int32x4 v_floor(const v_float64x2& a)
01219 {
01220     __m128i a1 = _mm_cvtpd_epi32(a.val);
01221     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
01222     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
01223     return v_int32x4(_mm_add_epi32(a1, mask));
01224 }
01225 
01226 inline v_int32x4 v_ceil(const v_float64x2& a)
01227 {
01228     __m128i a1 = _mm_cvtpd_epi32(a.val);
01229     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
01230     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
01231     return v_int32x4(_mm_sub_epi32(a1, mask));
01232 }
01233 
01234 inline v_int32x4 v_trunc(const v_float64x2& a)
01235 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
01236 
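// Usage sketch for the float -> int conversions above.
//
//     v_float32x4 x(1.5f, -1.5f, 2.7f, -2.7f);
//     v_int32x4 r = v_round(x);        // (2, -2, 3, -3), round to nearest even
//     v_int32x4 t = v_trunc(x);        // (1, -1, 2, -2), toward zero
//     v_int32x4 f = v_floor(x);        // (1, -2, 2, -3)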
01237 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
01238 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
01239                            const _Tpvec& a2, const _Tpvec& a3, \
01240                            _Tpvec& b0, _Tpvec& b1, \
01241                            _Tpvec& b2, _Tpvec& b3) \
01242 { \
01243     __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
01244     __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
01245     __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
01246     __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
01247 \
01248     b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
01249     b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
01250     b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
01251     b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
01252 }
01253 
01254 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01255 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
01256 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
01257 
01258 // adopted from sse_utils.hpp
01259 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
01260 {
01261     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
01262     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
01263     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
01264 
01265     __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
01266     __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
01267     __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
01268 
01269     __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
01270     __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
01271     __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
01272 
01273     __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
01274     __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
01275     __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
01276 
01277     a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
01278     b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
01279     c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
01280 }
01281 
01282 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
01283 {
01284     __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
01285     __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
01286     __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
01287     __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
01288 
01289     __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
01290     __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
01291     __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
01292     __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
01293 
01294     u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
01295     u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
01296     u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
01297     u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
01298 
01299     v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
01300     v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
01301     v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
01302     v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
01303 
01304     a.val = _mm_unpacklo_epi8(v0, v1);
01305     b.val = _mm_unpackhi_epi8(v0, v1);
01306     c.val = _mm_unpacklo_epi8(v2, v3);
01307     d.val = _mm_unpackhi_epi8(v2, v3);
01308 }
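// Usage sketch (editor's note, not part of the original header): the two
// overloads above split packed 3-channel (e.g. BGR) and 4-channel (e.g. BGRA)
// byte data into per-channel vectors using only SSE2 unpack instructions, so
// no SSSE3 shuffle support is required. Buffer names are illustrative:
//
//     uchar bgr[48]; // 16 packed B,G,R triplets, filled elsewhere
//     v_uint8x16 b, g, r;
//     v_load_deinterleave(bgr, b, g, r); // b = B0..B15, g = G0..G15, r = R0..R15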
01309 
01310 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
01311 {
01312     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
01313     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
01314     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
01315 
01316     __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
01317     __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
01318     __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
01319 
01320     __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
01321     __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
01322     __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
01323 
01324     a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
01325     b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
01326     c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
01327 }
01328 
01329 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
01330 {
01331     __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
01332     __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
01333     __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
01334     __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
01335 
01336     __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
01337     __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
01338     __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
01339     __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
01340 
01341     u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
01342     u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
01343     u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
01344     u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
01345 
01346     a.val = _mm_unpacklo_epi16(u0, u1);
01347     b.val = _mm_unpackhi_epi16(u0, u1);
01348     c.val = _mm_unpacklo_epi16(u2, u3);
01349     d.val = _mm_unpackhi_epi16(u2, u3);
01350 }
01351 
01352 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
01353 {
01354     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
01355     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
01356     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
01357 
01358     __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
01359     __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
01360     __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
01361 
01362     a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
01363     b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
01364     c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
01365 }
01366 
01367 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
01368 {
01369     v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
01370     v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
01371     v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
01372     v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
01373 
01374     v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
01375 }
01376 
01377 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
01378                                 const v_uint8x16& c )
01379 {
01380     __m128i z = _mm_setzero_si128();
01381     __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
01382     __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
01383     __m128i c0 = _mm_unpacklo_epi8(c.val, z);
01384     __m128i c1 = _mm_unpackhi_epi8(c.val, z);
01385 
01386     __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
01387     __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
01388     __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
01389     __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
01390 
01391     __m128i p10 = _mm_unpacklo_epi32(p00, p01);
01392     __m128i p11 = _mm_unpackhi_epi32(p00, p01);
01393     __m128i p12 = _mm_unpacklo_epi32(p02, p03);
01394     __m128i p13 = _mm_unpackhi_epi32(p02, p03);
01395 
01396     __m128i p20 = _mm_unpacklo_epi64(p10, p11);
01397     __m128i p21 = _mm_unpackhi_epi64(p10, p11);
01398     __m128i p22 = _mm_unpacklo_epi64(p12, p13);
01399     __m128i p23 = _mm_unpackhi_epi64(p12, p13);
01400 
01401     p20 = _mm_slli_si128(p20, 1);
01402     p22 = _mm_slli_si128(p22, 1);
01403 
01404     __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
01405     __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
01406     __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
01407     __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
01408 
01409     __m128i p40 = _mm_unpacklo_epi64(p30, p31);
01410     __m128i p41 = _mm_unpackhi_epi64(p30, p31);
01411     __m128i p42 = _mm_unpacklo_epi64(p32, p33);
01412     __m128i p43 = _mm_unpackhi_epi64(p32, p33);
01413 
01414     __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
01415     __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
01416     __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
01417 
01418     _mm_storeu_si128((__m128i*)(ptr), v0);
01419     _mm_storeu_si128((__m128i*)(ptr + 16), v1);
01420     _mm_storeu_si128((__m128i*)(ptr + 32), v2);
01421 }
01422 
01423 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
01424                                 const v_uint8x16& c, const v_uint8x16& d)
01425 {
01426     // a0 a1 a2 a3 ....
01427     // b0 b1 b2 b3 ....
01428     // c0 c1 c2 c3 ....
01429     // d0 d1 d2 d3 ....
01430     __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
01431     __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
01432     __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
01433     __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
01434 
01435     __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
01436     __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
01437     __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
01438     __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
01439 
01440     _mm_storeu_si128((__m128i*)ptr, v0);
01441     _mm_storeu_si128((__m128i*)(ptr + 16), v2);
01442     _mm_storeu_si128((__m128i*)(ptr + 32), v1);
01443     _mm_storeu_si128((__m128i*)(ptr + 48), v3);
01444 }
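// Usage sketch (editor's note, not part of the original header): the store
// overloads above repack per-channel vectors into interleaved memory, so a
// typical deinterleave / process / interleave round trip looks like this
// (buffer names are illustrative):
//
//     uchar src[64], dst[64]; // 16 packed 4-byte pixels each, filled elsewhere
//     v_uint8x16 b, g, r, a;
//     v_load_deinterleave(src, b, g, r, a);
//     // ... per-channel processing ...
//     v_store_interleave(dst, b, g, r, a);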
01445 
01446 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
01447                                 const v_uint16x8& b,
01448                                 const v_uint16x8& c )
01449 {
01450     __m128i z = _mm_setzero_si128();
01451     __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
01452     __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
01453     __m128i c0 = _mm_unpacklo_epi16(c.val, z);
01454     __m128i c1 = _mm_unpackhi_epi16(c.val, z);
01455 
01456     __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
01457     __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
01458     __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
01459     __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
01460 
01461     __m128i p20 = _mm_unpacklo_epi64(p10, p11);
01462     __m128i p21 = _mm_unpackhi_epi64(p10, p11);
01463     __m128i p22 = _mm_unpacklo_epi64(p12, p13);
01464     __m128i p23 = _mm_unpackhi_epi64(p12, p13);
01465 
01466     p20 = _mm_slli_si128(p20, 2);
01467     p22 = _mm_slli_si128(p22, 2);
01468 
01469     __m128i p30 = _mm_unpacklo_epi64(p20, p21);
01470     __m128i p31 = _mm_unpackhi_epi64(p20, p21);
01471     __m128i p32 = _mm_unpacklo_epi64(p22, p23);
01472     __m128i p33 = _mm_unpackhi_epi64(p22, p23);
01473 
01474     __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
01475     __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
01476     __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
01477 
01478     _mm_storeu_si128((__m128i*)(ptr), v0);
01479     _mm_storeu_si128((__m128i*)(ptr + 8), v1);
01480     _mm_storeu_si128((__m128i*)(ptr + 16), v2);
01481 }
01482 
01483 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
01484                                 const v_uint16x8& c, const v_uint16x8& d)
01485 {
01486     // a0 a1 a2 a3 ....
01487     // b0 b1 b2 b3 ....
01488     // c0 c1 c2 c3 ....
01489     // d0 d1 d2 d3 ....
01490     __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
01491     __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
01492     __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
01493     __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
01494 
01495     __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
01496     __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
01497     __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
01498     __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
01499 
01500     _mm_storeu_si128((__m128i*)ptr, v0);
01501     _mm_storeu_si128((__m128i*)(ptr + 8), v2);
01502     _mm_storeu_si128((__m128i*)(ptr + 16), v1);
01503     _mm_storeu_si128((__m128i*)(ptr + 24), v3);
01504 }
01505 
01506 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
01507                                 const v_uint32x4& c )
01508 {
01509     v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
01510     v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
01511 
01512     __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
01513     __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
01514     __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
01515 
01516     _mm_storeu_si128((__m128i*)ptr, v0);
01517     _mm_storeu_si128((__m128i*)(ptr + 4), v1);
01518     _mm_storeu_si128((__m128i*)(ptr + 8), v2);
01519 }
01520 
01521 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
01522                                const v_uint32x4& c, const v_uint32x4& d)
01523 {
01524     v_uint32x4 t0, t1, t2, t3;
01525     v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
01526     v_store(ptr, t0);
01527     v_store(ptr + 4, t1);
01528     v_store(ptr + 8, t2);
01529     v_store(ptr + 12, t3);
01530 }
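// Design note (editor's addition): for 32-bit lanes, interleaving is just a
// 4x4 transpose. The 4-channel overload stores the transposed vectors
// directly; the 3-channel overload transposes against a zero vector and then
// closes the gaps with byte shifts before storing 12 elements. A sketch,
// assuming the v_setall_u32 helper defined earlier in this file:
//
//     unsigned xyz[12]; // room for 4 packed (x, y, z) triplets
//     v_uint32x4 x = v_setall_u32(1), y = v_setall_u32(2), z = v_setall_u32(3);
//     v_store_interleave(xyz, x, y, z); // xyz = 1,2,3, 1,2,3, 1,2,3, 1,2,3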
01531 
01532 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
01533 inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
01534                                  _Tpvec& b0, _Tpvec& c0 ) \
01535 { \
01536     _Tpuvec a1, b1, c1; \
01537     v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
01538     a0 = v_reinterpret_as_##suffix(a1); \
01539     b0 = v_reinterpret_as_##suffix(b1); \
01540     c0 = v_reinterpret_as_##suffix(c1); \
01541 } \
01542 inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
01543                                  _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
01544 { \
01545     _Tpuvec a1, b1, c1, d1; \
01546     v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
01547     a0 = v_reinterpret_as_##suffix(a1); \
01548     b0 = v_reinterpret_as_##suffix(b1); \
01549     c0 = v_reinterpret_as_##suffix(c1); \
01550     d0 = v_reinterpret_as_##suffix(d1); \
01551 } \
01552 inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
01553                                const _Tpvec& b0, const _Tpvec& c0 ) \
01554 { \
01555     _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
01556     _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
01557     _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
01558     v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
01559 } \
01560 inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
01561                                const _Tpvec& c0, const _Tpvec& d0 ) \
01562 { \
01563     _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
01564     _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
01565     _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
01566     _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
01567     v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
01568 }
01569 
01570 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
01571 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
01572 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
01573 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
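// Usage sketch (editor's note, not part of the original header): the macro
// above gives the signed and float element types (de)interleave support by
// reinterpreting them as the unsigned vector of the same lane width, so e.g.
// packed 3-channel float data goes through the v_uint32x4 path bit-for-bit:
//
//     float xyz[12]; // 4 packed (x, y, z) points, filled elsewhere
//     v_float32x4 x, y, z;
//     v_load_deinterleave(xyz, x, y, z);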
01574 
01575 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
01576 {
01577     return v_float32x4(_mm_cvtepi32_ps(a.val));
01578 }
01579 
01580 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
01581 {
01582     return v_float32x4(_mm_cvtpd_ps(a.val));
01583 }
01584 
01585 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
01586 {
01587     return v_float64x2(_mm_cvtepi32_pd(a.val));
01588 }
01589 
01590 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
01591 {
01592     return v_float64x2(_mm_cvtps_pd(a.val));
01593 }
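// Usage sketch (editor's note, not part of the original header): the
// conversions above only touch the lanes that fit the destination width.
// v_cvt_f64 converts the two low lanes of its source, and
// v_cvt_f32(const v_float64x2&) writes the two low result lanes and zeroes
// the upper two (the behaviour of _mm_cvtpd_ps). Assuming the element-wise
// v_int32x4 constructor defined earlier in this file:
//
//     v_int32x4 i(1, 2, 3, 4);
//     v_float64x2 d = v_cvt_f64(i); // {1.0, 2.0}
//     v_float32x4 f = v_cvt_f32(d); // {1.0f, 2.0f, 0.0f, 0.0f}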
01594 
01595 //! @endcond
01596 
01597 }
01598 
01599 #endif
01600