OpenCV library for Renesas RZ/A

Dependents:   RZ_A2M_Mbed_samples

Committer:
RyoheiHagimoto
Date:
Fri Jan 29 04:53:38 2021 +0000
Revision:
0:0e0631af0305
copied from https://github.com/d-kato/opencv-lib.

Who changed what in which revision?

User | Revision | Line number | New contents of line
RyoheiHagimoto 0:0e0631af0305 1 /*M///////////////////////////////////////////////////////////////////////////////////////
RyoheiHagimoto 0:0e0631af0305 2 //
RyoheiHagimoto 0:0e0631af0305 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
RyoheiHagimoto 0:0e0631af0305 4 //
RyoheiHagimoto 0:0e0631af0305 5 // By downloading, copying, installing or using the software you agree to this license.
RyoheiHagimoto 0:0e0631af0305 6 // If you do not agree to this license, do not download, install,
RyoheiHagimoto 0:0e0631af0305 7 // copy or use the software.
RyoheiHagimoto 0:0e0631af0305 8 //
RyoheiHagimoto 0:0e0631af0305 9 //
RyoheiHagimoto 0:0e0631af0305 10 // License Agreement
RyoheiHagimoto 0:0e0631af0305 11 // For Open Source Computer Vision Library
RyoheiHagimoto 0:0e0631af0305 12 //
RyoheiHagimoto 0:0e0631af0305 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
RyoheiHagimoto 0:0e0631af0305 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
RyoheiHagimoto 0:0e0631af0305 15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
RyoheiHagimoto 0:0e0631af0305 16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
RyoheiHagimoto 0:0e0631af0305 17 // Third party copyrights are property of their respective owners.
RyoheiHagimoto 0:0e0631af0305 18 //
RyoheiHagimoto 0:0e0631af0305 19 // Redistribution and use in source and binary forms, with or without modification,
RyoheiHagimoto 0:0e0631af0305 20 // are permitted provided that the following conditions are met:
RyoheiHagimoto 0:0e0631af0305 21 //
RyoheiHagimoto 0:0e0631af0305 22 // * Redistribution's of source code must retain the above copyright notice,
RyoheiHagimoto 0:0e0631af0305 23 // this list of conditions and the following disclaimer.
RyoheiHagimoto 0:0e0631af0305 24 //
RyoheiHagimoto 0:0e0631af0305 25 // * Redistribution's in binary form must reproduce the above copyright notice,
RyoheiHagimoto 0:0e0631af0305 26 // this list of conditions and the following disclaimer in the documentation
RyoheiHagimoto 0:0e0631af0305 27 // and/or other materials provided with the distribution.
RyoheiHagimoto 0:0e0631af0305 28 //
RyoheiHagimoto 0:0e0631af0305 29 // * The name of the copyright holders may not be used to endorse or promote products
RyoheiHagimoto 0:0e0631af0305 30 // derived from this software without specific prior written permission.
RyoheiHagimoto 0:0e0631af0305 31 //
RyoheiHagimoto 0:0e0631af0305 32 // This software is provided by the copyright holders and contributors "as is" and
RyoheiHagimoto 0:0e0631af0305 33 // any express or implied warranties, including, but not limited to, the implied
RyoheiHagimoto 0:0e0631af0305 34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
RyoheiHagimoto 0:0e0631af0305 35 // In no event shall the Intel Corporation or contributors be liable for any direct,
RyoheiHagimoto 0:0e0631af0305 36 // indirect, incidental, special, exemplary, or consequential damages
RyoheiHagimoto 0:0e0631af0305 37 // (including, but not limited to, procurement of substitute goods or services;
RyoheiHagimoto 0:0e0631af0305 38 // loss of use, data, or profits; or business interruption) however caused
RyoheiHagimoto 0:0e0631af0305 39 // and on any theory of liability, whether in contract, strict liability,
RyoheiHagimoto 0:0e0631af0305 40 // or tort (including negligence or otherwise) arising in any way out of
RyoheiHagimoto 0:0e0631af0305 41 // the use of this software, even if advised of the possibility of such damage.
RyoheiHagimoto 0:0e0631af0305 42 //
RyoheiHagimoto 0:0e0631af0305 43 //M*/
RyoheiHagimoto 0:0e0631af0305 44
RyoheiHagimoto 0:0e0631af0305 45 #ifndef OPENCV_HAL_SSE_HPP
RyoheiHagimoto 0:0e0631af0305 46 #define OPENCV_HAL_SSE_HPP
RyoheiHagimoto 0:0e0631af0305 47
RyoheiHagimoto 0:0e0631af0305 48 #include <algorithm>
RyoheiHagimoto 0:0e0631af0305 49 #include "opencv2/core/utility.hpp"
RyoheiHagimoto 0:0e0631af0305 50
RyoheiHagimoto 0:0e0631af0305 51 #define CV_SIMD128 1
RyoheiHagimoto 0:0e0631af0305 52 #define CV_SIMD128_64F 1
RyoheiHagimoto 0:0e0631af0305 53
RyoheiHagimoto 0:0e0631af0305 54 namespace cv
RyoheiHagimoto 0:0e0631af0305 55 {
RyoheiHagimoto 0:0e0631af0305 56
RyoheiHagimoto 0:0e0631af0305 57 //! @cond IGNORED
RyoheiHagimoto 0:0e0631af0305 58
// 128-bit SIMD vector: 16 unsigned 8-bit lanes in one SSE register.
struct v_uint8x16
{
    typedef uchar lane_type;  // element type
    enum { nlanes = 16 };     // number of lanes

    v_uint8x16() {}                             // uninitialized
    explicit v_uint8x16(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 16 explicit lane values; v0 is the lowest lane.
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        // _mm_setr_epi8 takes signed char arguments, hence the per-lane casts.
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    // Return lane 0 (low byte of the low 32 bits).
    uchar get0() const
    {
        return (uchar)_mm_cvtsi128_si32(val);
    }

    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 81
// 128-bit SIMD vector: 16 signed 8-bit lanes in one SSE register.
struct v_int8x16
{
    typedef schar lane_type;  // element type
    enum { nlanes = 16 };     // number of lanes

    v_int8x16() {}                             // uninitialized
    explicit v_int8x16(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 16 explicit lane values; v0 is the lowest lane.
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        // _mm_setr_epi8 takes plain char; cast in case schar differs from char.
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    // Return lane 0.
    schar get0() const
    {
        return (schar)_mm_cvtsi128_si32(val);
    }

    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 104
// 128-bit SIMD vector: 8 unsigned 16-bit lanes in one SSE register.
struct v_uint16x8
{
    typedef ushort lane_type;  // element type
    enum { nlanes = 8 };       // number of lanes

    v_uint16x8() {}                             // uninitialized
    explicit v_uint16x8(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 8 explicit lane values; v0 is the lowest lane.
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        // _mm_setr_epi16 takes signed short arguments, hence the casts.
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    // Return lane 0.
    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 124
// 128-bit SIMD vector: 8 signed 16-bit lanes in one SSE register.
struct v_int16x8
{
    typedef short lane_type;  // element type
    enum { nlanes = 8 };      // number of lanes

    v_int16x8() {}                             // uninitialized
    explicit v_int16x8(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 8 explicit lane values; v0 is the lowest lane.
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    // Return lane 0.
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }
    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 143
// 128-bit SIMD vector: 4 unsigned 32-bit lanes in one SSE register.
struct v_uint32x4
{
    typedef unsigned lane_type;  // element type
    enum { nlanes = 4 };         // number of lanes

    v_uint32x4() {}                             // uninitialized
    explicit v_uint32x4(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 4 explicit lane values; v0 is the lowest lane.
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        // _mm_setr_epi32 takes signed int arguments, hence the casts.
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }
    // Return lane 0.
    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }
    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 161
// 128-bit SIMD vector: 4 signed 32-bit lanes in one SSE register.
struct v_int32x4
{
    typedef int lane_type;  // element type
    enum { nlanes = 4 };    // number of lanes

    v_int32x4() {}                             // uninitialized
    explicit v_int32x4(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 4 explicit lane values; v0 is the lowest lane.
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }
    // Return lane 0.
    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }
    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 179
// 128-bit SIMD vector: 4 single-precision float lanes in one SSE register.
struct v_float32x4
{
    typedef float lane_type;  // element type
    enum { nlanes = 4 };      // number of lanes

    v_float32x4() {}                           // uninitialized
    explicit v_float32x4(__m128 v) : val(v) {} // wrap a raw SSE register
    // Build from 4 explicit lane values; v0 is the lowest lane.
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }
    // Return lane 0.
    float get0() const
    {
        return _mm_cvtss_f32(val);
    }
    __m128 val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 197
// 128-bit SIMD vector: 2 unsigned 64-bit lanes in one SSE register.
struct v_uint64x2
{
    typedef uint64 lane_type;  // element type
    enum { nlanes = 2 };       // number of lanes

    v_uint64x2() {}                             // uninitialized
    explicit v_uint64x2(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 2 lane values; each 64-bit lane is split into two 32-bit
    // halves so no 64-bit set intrinsic is required (works on 32-bit targets).
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    // Return lane 0, assembled from two 32-bit extracts; avoids
    // _mm_cvtsi128_si64, which is unavailable on 32-bit builds.
    uint64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        // Casts through unsigned prevent sign-extension of the halves.
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }
    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 217
// 128-bit SIMD vector: 2 signed 64-bit lanes in one SSE register.
struct v_int64x2
{
    typedef int64 lane_type;  // element type
    enum { nlanes = 2 };      // number of lanes

    v_int64x2() {}                             // uninitialized
    explicit v_int64x2(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 2 lane values; each 64-bit lane is split into two 32-bit
    // halves so no 64-bit set intrinsic is required (works on 32-bit targets).
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    // Return lane 0, assembled from two 32-bit extracts (see v_uint64x2::get0).
    int64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }
    __m128i val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 237
// 128-bit SIMD vector: 2 double-precision float lanes in one SSE register.
struct v_float64x2
{
    typedef double lane_type;  // element type
    enum { nlanes = 2 };       // number of lanes

    v_float64x2() {}                            // uninitialized
    explicit v_float64x2(__m128d v) : val(v) {} // wrap a raw SSE register
    // Build from 2 explicit lane values; v0 is the lowest lane.
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }
    // Return lane 0.
    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }
    __m128d val;  // underlying SSE register
};
RyoheiHagimoto 0:0e0631af0305 255
#if defined(HAVE_FP16)
// 64-bit vector of 4 half-precision floats, stored as raw 16-bit bit
// patterns (lane_type short) in the low half of an SSE register.
struct v_float16x4
{
    typedef short lane_type;  // raw FP16 bit pattern per lane
    enum { nlanes = 4 };      // number of lanes

    v_float16x4() {}                             // uninitialized
    explicit v_float16x4(__m128i v) : val(v) {}  // wrap a raw SSE register
    // Build from 4 raw FP16 bit patterns; upper 4 lanes are zeroed.
    v_float16x4(short v0, short v1, short v2, short v3)
    {
        val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
    }
    // Return lane 0 as its raw bit pattern.
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }
    __m128i val;  // underlying SSE register (low 64 bits used)
};
#endif
RyoheiHagimoto 0:0e0631af0305 275
// Generates, for each vector type:
//   v_setzero_<suffix>()        - all-zero vector
//   v_setall_<suffix>(v)        - all lanes set to v (cast to the set1 arg type _Tps)
//   v_reinterpret_as_<suffix>() - bit-level reinterpretation from any other
//                                 vector type, via `cast` (no-op for integer
//                                 types, _mm_castsi128_* for float types)
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
RyoheiHagimoto 0:0e0631af0305 290
// 64-bit lane types are handled outside the macro: there is no _mm_set1 for
// 64-bit on all targets, so v_setall_* goes through the splitting constructor.
inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }

// Bit-level reinterpretation to/from the 64-bit integer vector types.
// The generic templates cover all integer sources (shared __m128i storage);
// explicit overloads cast the float registers.
template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
RyoheiHagimoto 0:0e0631af0305 308
// Generates v_reinterpret_as_<suffix> overloads converting FROM the float
// vector types TO each integer vector type (bit-level register casts only).
#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)

// Float <-> float reinterpretations (identity or register cast).
inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
RyoheiHagimoto 0:0e0631af0305 328
RyoheiHagimoto 0:0e0631af0305 329 //////////////// PACK ///////////////
// Pack two u16 vectors into one u8 vector with unsigned saturation.
// _mm_packus_epi16 treats its input as SIGNED 16-bit, so inputs > 32767
// would be mishandled; the saturating-subtract pair
//   a - max(a - 255, 0)  ==  min(a, 255)
// clamps each unsigned lane to [0,255] first, entirely in unsigned math.
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}

// Saturate the 8 u16 lanes of `a` to u8 and store them (8 bytes) at ptr.
// Same min(a,255) clamp as v_pack above; ptr need not be aligned.
inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
RyoheiHagimoto 0:0e0631af0305 343
// Pack two s16 vectors into one u8 vector with unsigned saturation
// (negative lanes -> 0, lanes > 255 -> 255); direct intrinsic match.
inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

// Saturate the 8 s16 lanes of `a` to u8 and store them (8 bytes) at ptr.
inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
RyoheiHagimoto 0:0e0631af0305 349
// Rounding right shift by n, then pack u16 -> u8 with saturation:
// result lane = saturate_u8((a + 2^(n-1)) >> n).
// The saturating add keeps the bias from wrapping; after the logical shift
// (n > 0 assumed) values fit the signed range _mm_packus_epi16 expects.
template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

// Store variant of the above: 8 rounded/shifted/saturated bytes written at ptr.
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
RyoheiHagimoto 0:0e0631af0305 366
// Rounding right shift by n, then pack s16 -> u8 with unsigned saturation:
// result lane = saturate_u8((a + 2^(n-1)) >> n), arithmetic shift (n > 0).
template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

// Store variant of the above: 8 rounded/shifted/saturated bytes written at ptr.
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
RyoheiHagimoto 0:0e0631af0305 382
RyoheiHagimoto 0:0e0631af0305 383 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
RyoheiHagimoto 0:0e0631af0305 384 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
RyoheiHagimoto 0:0e0631af0305 385
RyoheiHagimoto 0:0e0631af0305 386 inline void v_pack_store(schar* ptr, v_int16x8& a)
RyoheiHagimoto 0:0e0631af0305 387 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
RyoheiHagimoto 0:0e0631af0305 388

// Rounding right shift by n, then pack s16 -> s8 with signed saturation:
// result lane = saturate_s8((a + 2^(n-1)) >> n), arithmetic shift.
template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}
// Store variant of the above: 8 rounded/shifted/saturated bytes written at ptr.
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}
RyoheiHagimoto 0:0e0631af0305 405
RyoheiHagimoto 0:0e0631af0305 406
// bit-wise "mask ? a : b"
// XOR-blend idiom: b ^ ((a ^ b) & mask) selects, per bit, `a` where the
// mask bit is 1 and `b` where it is 0 — three ops, no andnot needed.
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
}
RyoheiHagimoto 0:0e0631af0305 412
// Pack two u32 vectors into one u16 vector with unsigned saturation.
// Only _mm_packs_epi32 (signed) exists in SSE2, so:
//  1. lanes with the sign bit set (i.e. unsigned value > INT_MAX) are
//     replaced by 65535 via v_select_si128 — they must saturate anyway;
//  2. subtract 32768 to shift [0,65535] into the signed range [-32768,32767];
//  3. signed pack, then add 32768 back (as subtraction of -32768).
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    __m128i r = _mm_packs_epi32(a1, b1);
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

// Saturate the 4 u32 lanes of `a` to u16 and store them (8 bytes) at ptr.
// Same bias-around-signed-pack trick as v_pack above.
inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}
RyoheiHagimoto 0:0e0631af0305 429
// Rounding right shift by n, then pack u32 -> u16 with saturation:
// result lane = saturate_u16((a + 2^(n-1)) >> n), logical shift.
// After the shift (n > 0) values fit in 31 bits, so only the +/-32768 bias
// around the signed _mm_packs_epi32 is needed (cf. v_pack for u32).
template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

// Store variant of the above: 4 rounded/shifted/saturated u16 written at ptr.
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}
RyoheiHagimoto 0:0e0631af0305 447
// Pack two s32 vectors into one u16 vector with unsigned saturation
// (negative -> 0, > 65535 -> 65535). SSE2 has no unsigned 32->16 pack
// (_mm_packus_epi32 is SSE4.1), so bias by -32768, signed-pack, re-bias.
inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

// Saturate the 4 s32 lanes of `a` to u16 and store them (8 bytes) at ptr.
inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
}
RyoheiHagimoto 0:0e0631af0305 462
// Rounding right shift by n, then pack s32 -> u16 with unsigned saturation:
// result lane = saturate_u16((a + 2^(n-1)) >> n), arithmetic shift.
// Each half is biased, packed, and re-biased separately, then the two
// low 64-bit results are merged with unpacklo.
template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
}

// Store variant of the above: 4 rounded/shifted/saturated u16 written at ptr.
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}
RyoheiHagimoto 0:0e0631af0305 482
RyoheiHagimoto 0:0e0631af0305 483 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 484 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
RyoheiHagimoto 0:0e0631af0305 485
RyoheiHagimoto 0:0e0631af0305 486 inline void v_pack_store(short* ptr, const v_int32x4& a)
RyoheiHagimoto 0:0e0631af0305 487 {
RyoheiHagimoto 0:0e0631af0305 488 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
RyoheiHagimoto 0:0e0631af0305 489 }
RyoheiHagimoto 0:0e0631af0305 490
RyoheiHagimoto 0:0e0631af0305 491 template<int n> inline
RyoheiHagimoto 0:0e0631af0305 492 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 493 {
RyoheiHagimoto 0:0e0631af0305 494 __m128i delta = _mm_set1_epi32(1 << (n-1));
RyoheiHagimoto 0:0e0631af0305 495 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
RyoheiHagimoto 0:0e0631af0305 496 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
RyoheiHagimoto 0:0e0631af0305 497 }
RyoheiHagimoto 0:0e0631af0305 498
RyoheiHagimoto 0:0e0631af0305 499 template<int n> inline
RyoheiHagimoto 0:0e0631af0305 500 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
RyoheiHagimoto 0:0e0631af0305 501 {
RyoheiHagimoto 0:0e0631af0305 502 __m128i delta = _mm_set1_epi32(1 << (n-1));
RyoheiHagimoto 0:0e0631af0305 503 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
RyoheiHagimoto 0:0e0631af0305 504 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
RyoheiHagimoto 0:0e0631af0305 505 }
RyoheiHagimoto 0:0e0631af0305 506
RyoheiHagimoto 0:0e0631af0305 507
RyoheiHagimoto 0:0e0631af0305 508 // [a0 0 | b0 0] [a1 0 | b1 0]
RyoheiHagimoto 0:0e0631af0305 509 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
RyoheiHagimoto 0:0e0631af0305 510 {
RyoheiHagimoto 0:0e0631af0305 511 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
RyoheiHagimoto 0:0e0631af0305 512 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
RyoheiHagimoto 0:0e0631af0305 513 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
RyoheiHagimoto 0:0e0631af0305 514 }
RyoheiHagimoto 0:0e0631af0305 515
RyoheiHagimoto 0:0e0631af0305 516 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
RyoheiHagimoto 0:0e0631af0305 517 {
RyoheiHagimoto 0:0e0631af0305 518 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
RyoheiHagimoto 0:0e0631af0305 519 _mm_storel_epi64((__m128i*)ptr, a1);
RyoheiHagimoto 0:0e0631af0305 520 }
RyoheiHagimoto 0:0e0631af0305 521
RyoheiHagimoto 0:0e0631af0305 522 // [a0 0 | b0 0] [a1 0 | b1 0]
RyoheiHagimoto 0:0e0631af0305 523 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
RyoheiHagimoto 0:0e0631af0305 524 {
RyoheiHagimoto 0:0e0631af0305 525 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
RyoheiHagimoto 0:0e0631af0305 526 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
RyoheiHagimoto 0:0e0631af0305 527 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
RyoheiHagimoto 0:0e0631af0305 528 }
RyoheiHagimoto 0:0e0631af0305 529
RyoheiHagimoto 0:0e0631af0305 530 inline void v_pack_store(int* ptr, const v_int64x2& a)
RyoheiHagimoto 0:0e0631af0305 531 {
RyoheiHagimoto 0:0e0631af0305 532 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
RyoheiHagimoto 0:0e0631af0305 533 _mm_storel_epi64((__m128i*)ptr, a1);
RyoheiHagimoto 0:0e0631af0305 534 }
RyoheiHagimoto 0:0e0631af0305 535
RyoheiHagimoto 0:0e0631af0305 536 template<int n> inline
RyoheiHagimoto 0:0e0631af0305 537 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
RyoheiHagimoto 0:0e0631af0305 538 {
RyoheiHagimoto 0:0e0631af0305 539 uint64 delta = (uint64)1 << (n-1);
RyoheiHagimoto 0:0e0631af0305 540 v_uint64x2 delta2(delta, delta);
RyoheiHagimoto 0:0e0631af0305 541 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
RyoheiHagimoto 0:0e0631af0305 542 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
RyoheiHagimoto 0:0e0631af0305 543 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
RyoheiHagimoto 0:0e0631af0305 544 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
RyoheiHagimoto 0:0e0631af0305 545 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
RyoheiHagimoto 0:0e0631af0305 546 }
RyoheiHagimoto 0:0e0631af0305 547
RyoheiHagimoto 0:0e0631af0305 548 template<int n> inline
RyoheiHagimoto 0:0e0631af0305 549 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
RyoheiHagimoto 0:0e0631af0305 550 {
RyoheiHagimoto 0:0e0631af0305 551 uint64 delta = (uint64)1 << (n-1);
RyoheiHagimoto 0:0e0631af0305 552 v_uint64x2 delta2(delta, delta);
RyoheiHagimoto 0:0e0631af0305 553 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
RyoheiHagimoto 0:0e0631af0305 554 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
RyoheiHagimoto 0:0e0631af0305 555 _mm_storel_epi64((__m128i*)ptr, a2);
RyoheiHagimoto 0:0e0631af0305 556 }
RyoheiHagimoto 0:0e0631af0305 557
RyoheiHagimoto 0:0e0631af0305 558 inline __m128i v_sign_epi64(__m128i a)
RyoheiHagimoto 0:0e0631af0305 559 {
RyoheiHagimoto 0:0e0631af0305 560 return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
RyoheiHagimoto 0:0e0631af0305 561 }
RyoheiHagimoto 0:0e0631af0305 562
RyoheiHagimoto 0:0e0631af0305 563 inline __m128i v_srai_epi64(__m128i a, int imm)
RyoheiHagimoto 0:0e0631af0305 564 {
RyoheiHagimoto 0:0e0631af0305 565 __m128i smask = v_sign_epi64(a);
RyoheiHagimoto 0:0e0631af0305 566 return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
RyoheiHagimoto 0:0e0631af0305 567 }
RyoheiHagimoto 0:0e0631af0305 568
RyoheiHagimoto 0:0e0631af0305 569 template<int n> inline
RyoheiHagimoto 0:0e0631af0305 570 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
RyoheiHagimoto 0:0e0631af0305 571 {
RyoheiHagimoto 0:0e0631af0305 572 int64 delta = (int64)1 << (n-1);
RyoheiHagimoto 0:0e0631af0305 573 v_int64x2 delta2(delta, delta);
RyoheiHagimoto 0:0e0631af0305 574 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
RyoheiHagimoto 0:0e0631af0305 575 __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
RyoheiHagimoto 0:0e0631af0305 576 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
RyoheiHagimoto 0:0e0631af0305 577 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
RyoheiHagimoto 0:0e0631af0305 578 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
RyoheiHagimoto 0:0e0631af0305 579 }
RyoheiHagimoto 0:0e0631af0305 580
RyoheiHagimoto 0:0e0631af0305 581 template<int n> inline
RyoheiHagimoto 0:0e0631af0305 582 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
RyoheiHagimoto 0:0e0631af0305 583 {
RyoheiHagimoto 0:0e0631af0305 584 int64 delta = (int64)1 << (n-1);
RyoheiHagimoto 0:0e0631af0305 585 v_int64x2 delta2(delta, delta);
RyoheiHagimoto 0:0e0631af0305 586 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
RyoheiHagimoto 0:0e0631af0305 587 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
RyoheiHagimoto 0:0e0631af0305 588 _mm_storel_epi64((__m128i*)ptr, a2);
RyoheiHagimoto 0:0e0631af0305 589 }
RyoheiHagimoto 0:0e0631af0305 590
RyoheiHagimoto 0:0e0631af0305 591 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
RyoheiHagimoto 0:0e0631af0305 592 const v_float32x4& m1, const v_float32x4& m2,
RyoheiHagimoto 0:0e0631af0305 593 const v_float32x4& m3)
RyoheiHagimoto 0:0e0631af0305 594 {
RyoheiHagimoto 0:0e0631af0305 595 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
RyoheiHagimoto 0:0e0631af0305 596 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
RyoheiHagimoto 0:0e0631af0305 597 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
RyoheiHagimoto 0:0e0631af0305 598 __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
RyoheiHagimoto 0:0e0631af0305 599
RyoheiHagimoto 0:0e0631af0305 600 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
RyoheiHagimoto 0:0e0631af0305 601 }
RyoheiHagimoto 0:0e0631af0305 602
RyoheiHagimoto 0:0e0631af0305 603
RyoheiHagimoto 0:0e0631af0305 604 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
RyoheiHagimoto 0:0e0631af0305 605 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 606 { \
RyoheiHagimoto 0:0e0631af0305 607 return _Tpvec(intrin(a.val, b.val)); \
RyoheiHagimoto 0:0e0631af0305 608 } \
RyoheiHagimoto 0:0e0631af0305 609 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 610 { \
RyoheiHagimoto 0:0e0631af0305 611 a.val = intrin(a.val, b.val); \
RyoheiHagimoto 0:0e0631af0305 612 return a; \
RyoheiHagimoto 0:0e0631af0305 613 }
RyoheiHagimoto 0:0e0631af0305 614
RyoheiHagimoto 0:0e0631af0305 615 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
RyoheiHagimoto 0:0e0631af0305 616 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
RyoheiHagimoto 0:0e0631af0305 617 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
RyoheiHagimoto 0:0e0631af0305 618 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
RyoheiHagimoto 0:0e0631af0305 619 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
RyoheiHagimoto 0:0e0631af0305 620 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
RyoheiHagimoto 0:0e0631af0305 621 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
RyoheiHagimoto 0:0e0631af0305 622 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
RyoheiHagimoto 0:0e0631af0305 623 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
RyoheiHagimoto 0:0e0631af0305 624 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
RyoheiHagimoto 0:0e0631af0305 625 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
RyoheiHagimoto 0:0e0631af0305 626 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
RyoheiHagimoto 0:0e0631af0305 627 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
RyoheiHagimoto 0:0e0631af0305 628 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
RyoheiHagimoto 0:0e0631af0305 629 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
RyoheiHagimoto 0:0e0631af0305 630 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
RyoheiHagimoto 0:0e0631af0305 631 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
RyoheiHagimoto 0:0e0631af0305 632 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
RyoheiHagimoto 0:0e0631af0305 633 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
RyoheiHagimoto 0:0e0631af0305 634 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
RyoheiHagimoto 0:0e0631af0305 635 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
RyoheiHagimoto 0:0e0631af0305 636 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
RyoheiHagimoto 0:0e0631af0305 637 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
RyoheiHagimoto 0:0e0631af0305 638 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
RyoheiHagimoto 0:0e0631af0305 639 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
RyoheiHagimoto 0:0e0631af0305 640 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
RyoheiHagimoto 0:0e0631af0305 641
RyoheiHagimoto 0:0e0631af0305 642 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
RyoheiHagimoto 0:0e0631af0305 643 {
RyoheiHagimoto 0:0e0631af0305 644 __m128i c0 = _mm_mul_epu32(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 645 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
RyoheiHagimoto 0:0e0631af0305 646 __m128i d0 = _mm_unpacklo_epi32(c0, c1);
RyoheiHagimoto 0:0e0631af0305 647 __m128i d1 = _mm_unpackhi_epi32(c0, c1);
RyoheiHagimoto 0:0e0631af0305 648 return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
RyoheiHagimoto 0:0e0631af0305 649 }
RyoheiHagimoto 0:0e0631af0305 650 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 651 {
RyoheiHagimoto 0:0e0631af0305 652 __m128i c0 = _mm_mul_epu32(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 653 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
RyoheiHagimoto 0:0e0631af0305 654 __m128i d0 = _mm_unpacklo_epi32(c0, c1);
RyoheiHagimoto 0:0e0631af0305 655 __m128i d1 = _mm_unpackhi_epi32(c0, c1);
RyoheiHagimoto 0:0e0631af0305 656 return v_int32x4(_mm_unpacklo_epi64(d0, d1));
RyoheiHagimoto 0:0e0631af0305 657 }
RyoheiHagimoto 0:0e0631af0305 658 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
RyoheiHagimoto 0:0e0631af0305 659 {
RyoheiHagimoto 0:0e0631af0305 660 a = a * b;
RyoheiHagimoto 0:0e0631af0305 661 return a;
RyoheiHagimoto 0:0e0631af0305 662 }
RyoheiHagimoto 0:0e0631af0305 663 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 664 {
RyoheiHagimoto 0:0e0631af0305 665 a = a * b;
RyoheiHagimoto 0:0e0631af0305 666 return a;
RyoheiHagimoto 0:0e0631af0305 667 }
RyoheiHagimoto 0:0e0631af0305 668
RyoheiHagimoto 0:0e0631af0305 669 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
RyoheiHagimoto 0:0e0631af0305 670 v_int32x4& c, v_int32x4& d)
RyoheiHagimoto 0:0e0631af0305 671 {
RyoheiHagimoto 0:0e0631af0305 672 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 673 __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 674 c.val = _mm_unpacklo_epi16(v0, v1);
RyoheiHagimoto 0:0e0631af0305 675 d.val = _mm_unpackhi_epi16(v0, v1);
RyoheiHagimoto 0:0e0631af0305 676 }
RyoheiHagimoto 0:0e0631af0305 677
RyoheiHagimoto 0:0e0631af0305 678 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
RyoheiHagimoto 0:0e0631af0305 679 v_uint32x4& c, v_uint32x4& d)
RyoheiHagimoto 0:0e0631af0305 680 {
RyoheiHagimoto 0:0e0631af0305 681 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 682 __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 683 c.val = _mm_unpacklo_epi16(v0, v1);
RyoheiHagimoto 0:0e0631af0305 684 d.val = _mm_unpackhi_epi16(v0, v1);
RyoheiHagimoto 0:0e0631af0305 685 }
RyoheiHagimoto 0:0e0631af0305 686
RyoheiHagimoto 0:0e0631af0305 687 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
RyoheiHagimoto 0:0e0631af0305 688 v_uint64x2& c, v_uint64x2& d)
RyoheiHagimoto 0:0e0631af0305 689 {
RyoheiHagimoto 0:0e0631af0305 690 __m128i c0 = _mm_mul_epu32(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 691 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
RyoheiHagimoto 0:0e0631af0305 692 c.val = _mm_unpacklo_epi64(c0, c1);
RyoheiHagimoto 0:0e0631af0305 693 d.val = _mm_unpackhi_epi64(c0, c1);
RyoheiHagimoto 0:0e0631af0305 694 }
RyoheiHagimoto 0:0e0631af0305 695
RyoheiHagimoto 0:0e0631af0305 696 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
RyoheiHagimoto 0:0e0631af0305 697 {
RyoheiHagimoto 0:0e0631af0305 698 return v_int32x4(_mm_madd_epi16(a.val, b.val));
RyoheiHagimoto 0:0e0631af0305 699 }
RyoheiHagimoto 0:0e0631af0305 700
RyoheiHagimoto 0:0e0631af0305 701 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
RyoheiHagimoto 0:0e0631af0305 702 OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
RyoheiHagimoto 0:0e0631af0305 703 OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
RyoheiHagimoto 0:0e0631af0305 704 OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
RyoheiHagimoto 0:0e0631af0305 705 inline _Tpvec operator ~ (const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 706 { \
RyoheiHagimoto 0:0e0631af0305 707 return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
RyoheiHagimoto 0:0e0631af0305 708 }
RyoheiHagimoto 0:0e0631af0305 709
RyoheiHagimoto 0:0e0631af0305 710 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 711 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 712 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 713 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 714 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 715 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 716 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 717 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
RyoheiHagimoto 0:0e0631af0305 718 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
RyoheiHagimoto 0:0e0631af0305 719 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
RyoheiHagimoto 0:0e0631af0305 720
RyoheiHagimoto 0:0e0631af0305 721 inline v_float32x4 v_sqrt(const v_float32x4& x)
RyoheiHagimoto 0:0e0631af0305 722 { return v_float32x4(_mm_sqrt_ps(x.val)); }
RyoheiHagimoto 0:0e0631af0305 723
RyoheiHagimoto 0:0e0631af0305 724 inline v_float32x4 v_invsqrt(const v_float32x4& x)
RyoheiHagimoto 0:0e0631af0305 725 {
RyoheiHagimoto 0:0e0631af0305 726 static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
RyoheiHagimoto 0:0e0631af0305 727 __m128 t = x.val;
RyoheiHagimoto 0:0e0631af0305 728 __m128 h = _mm_mul_ps(t, _0_5);
RyoheiHagimoto 0:0e0631af0305 729 t = _mm_rsqrt_ps(t);
RyoheiHagimoto 0:0e0631af0305 730 t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
RyoheiHagimoto 0:0e0631af0305 731 return v_float32x4(t);
RyoheiHagimoto 0:0e0631af0305 732 }
RyoheiHagimoto 0:0e0631af0305 733
RyoheiHagimoto 0:0e0631af0305 734 inline v_float64x2 v_sqrt(const v_float64x2& x)
RyoheiHagimoto 0:0e0631af0305 735 { return v_float64x2(_mm_sqrt_pd(x.val)); }
RyoheiHagimoto 0:0e0631af0305 736
RyoheiHagimoto 0:0e0631af0305 737 inline v_float64x2 v_invsqrt(const v_float64x2& x)
RyoheiHagimoto 0:0e0631af0305 738 {
RyoheiHagimoto 0:0e0631af0305 739 static const __m128d v_1 = _mm_set1_pd(1.);
RyoheiHagimoto 0:0e0631af0305 740 return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
RyoheiHagimoto 0:0e0631af0305 741 }
RyoheiHagimoto 0:0e0631af0305 742
RyoheiHagimoto 0:0e0631af0305 743 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
RyoheiHagimoto 0:0e0631af0305 744 inline _Tpuvec v_abs(const _Tpsvec& x) \
RyoheiHagimoto 0:0e0631af0305 745 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
RyoheiHagimoto 0:0e0631af0305 746
RyoheiHagimoto 0:0e0631af0305 747 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
RyoheiHagimoto 0:0e0631af0305 748 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
RyoheiHagimoto 0:0e0631af0305 749 inline v_uint32x4 v_abs(const v_int32x4& x)
RyoheiHagimoto 0:0e0631af0305 750 {
RyoheiHagimoto 0:0e0631af0305 751 __m128i s = _mm_srli_epi32(x.val, 31);
RyoheiHagimoto 0:0e0631af0305 752 __m128i f = _mm_srai_epi32(x.val, 31);
RyoheiHagimoto 0:0e0631af0305 753 return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
RyoheiHagimoto 0:0e0631af0305 754 }
RyoheiHagimoto 0:0e0631af0305 755 inline v_float32x4 v_abs(const v_float32x4& x)
RyoheiHagimoto 0:0e0631af0305 756 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
RyoheiHagimoto 0:0e0631af0305 757 inline v_float64x2 v_abs(const v_float64x2& x)
RyoheiHagimoto 0:0e0631af0305 758 {
RyoheiHagimoto 0:0e0631af0305 759 return v_float64x2(_mm_and_pd(x.val,
RyoheiHagimoto 0:0e0631af0305 760 _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
RyoheiHagimoto 0:0e0631af0305 761 }
RyoheiHagimoto 0:0e0631af0305 762
RyoheiHagimoto 0:0e0631af0305 763 // TODO: exp, log, sin, cos
RyoheiHagimoto 0:0e0631af0305 764
RyoheiHagimoto 0:0e0631af0305 765 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
RyoheiHagimoto 0:0e0631af0305 766 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 767 { \
RyoheiHagimoto 0:0e0631af0305 768 return _Tpvec(intrin(a.val, b.val)); \
RyoheiHagimoto 0:0e0631af0305 769 }
RyoheiHagimoto 0:0e0631af0305 770
RyoheiHagimoto 0:0e0631af0305 771 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
RyoheiHagimoto 0:0e0631af0305 772 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
RyoheiHagimoto 0:0e0631af0305 773 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
RyoheiHagimoto 0:0e0631af0305 774 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
RyoheiHagimoto 0:0e0631af0305 775 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
RyoheiHagimoto 0:0e0631af0305 776 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
RyoheiHagimoto 0:0e0631af0305 777 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
RyoheiHagimoto 0:0e0631af0305 778 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
RyoheiHagimoto 0:0e0631af0305 779
RyoheiHagimoto 0:0e0631af0305 780 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
RyoheiHagimoto 0:0e0631af0305 781 {
RyoheiHagimoto 0:0e0631af0305 782 __m128i delta = _mm_set1_epi8((char)-128);
RyoheiHagimoto 0:0e0631af0305 783 return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
RyoheiHagimoto 0:0e0631af0305 784 _mm_xor_si128(b.val, delta))));
RyoheiHagimoto 0:0e0631af0305 785 }
RyoheiHagimoto 0:0e0631af0305 786 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
RyoheiHagimoto 0:0e0631af0305 787 {
RyoheiHagimoto 0:0e0631af0305 788 __m128i delta = _mm_set1_epi8((char)-128);
RyoheiHagimoto 0:0e0631af0305 789 return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
RyoheiHagimoto 0:0e0631af0305 790 _mm_xor_si128(b.val, delta))));
RyoheiHagimoto 0:0e0631af0305 791 }
RyoheiHagimoto 0:0e0631af0305 792 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
RyoheiHagimoto 0:0e0631af0305 793 {
RyoheiHagimoto 0:0e0631af0305 794 return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
RyoheiHagimoto 0:0e0631af0305 795 }
RyoheiHagimoto 0:0e0631af0305 796 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
RyoheiHagimoto 0:0e0631af0305 797 {
RyoheiHagimoto 0:0e0631af0305 798 return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
RyoheiHagimoto 0:0e0631af0305 799 }
RyoheiHagimoto 0:0e0631af0305 800 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
RyoheiHagimoto 0:0e0631af0305 801 {
RyoheiHagimoto 0:0e0631af0305 802 __m128i delta = _mm_set1_epi32((int)0x80000000);
RyoheiHagimoto 0:0e0631af0305 803 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
RyoheiHagimoto 0:0e0631af0305 804 return v_uint32x4(v_select_si128(mask, b.val, a.val));
RyoheiHagimoto 0:0e0631af0305 805 }
RyoheiHagimoto 0:0e0631af0305 806 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
RyoheiHagimoto 0:0e0631af0305 807 {
RyoheiHagimoto 0:0e0631af0305 808 __m128i delta = _mm_set1_epi32((int)0x80000000);
RyoheiHagimoto 0:0e0631af0305 809 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
RyoheiHagimoto 0:0e0631af0305 810 return v_uint32x4(v_select_si128(mask, a.val, b.val));
RyoheiHagimoto 0:0e0631af0305 811 }
RyoheiHagimoto 0:0e0631af0305 812 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 813 {
RyoheiHagimoto 0:0e0631af0305 814 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
RyoheiHagimoto 0:0e0631af0305 815 }
RyoheiHagimoto 0:0e0631af0305 816 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 817 {
RyoheiHagimoto 0:0e0631af0305 818 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
RyoheiHagimoto 0:0e0631af0305 819 }
RyoheiHagimoto 0:0e0631af0305 820
RyoheiHagimoto 0:0e0631af0305 821 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
RyoheiHagimoto 0:0e0631af0305 822 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 823 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 824 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 825 { \
RyoheiHagimoto 0:0e0631af0305 826 __m128i not_mask = _mm_set1_epi32(-1); \
RyoheiHagimoto 0:0e0631af0305 827 return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
RyoheiHagimoto 0:0e0631af0305 828 } \
RyoheiHagimoto 0:0e0631af0305 829 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 830 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 831 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 832 { \
RyoheiHagimoto 0:0e0631af0305 833 __m128i not_mask = _mm_set1_epi32(-1); \
RyoheiHagimoto 0:0e0631af0305 834 return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
RyoheiHagimoto 0:0e0631af0305 835 } \
RyoheiHagimoto 0:0e0631af0305 836 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 837 { \
RyoheiHagimoto 0:0e0631af0305 838 __m128i smask = _mm_set1_##suffix(sbit); \
RyoheiHagimoto 0:0e0631af0305 839 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
RyoheiHagimoto 0:0e0631af0305 840 } \
RyoheiHagimoto 0:0e0631af0305 841 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 842 { \
RyoheiHagimoto 0:0e0631af0305 843 __m128i smask = _mm_set1_##suffix(sbit); \
RyoheiHagimoto 0:0e0631af0305 844 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
RyoheiHagimoto 0:0e0631af0305 845 } \
RyoheiHagimoto 0:0e0631af0305 846 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 847 { \
RyoheiHagimoto 0:0e0631af0305 848 __m128i smask = _mm_set1_##suffix(sbit); \
RyoheiHagimoto 0:0e0631af0305 849 __m128i not_mask = _mm_set1_epi32(-1); \
RyoheiHagimoto 0:0e0631af0305 850 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
RyoheiHagimoto 0:0e0631af0305 851 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
RyoheiHagimoto 0:0e0631af0305 852 } \
RyoheiHagimoto 0:0e0631af0305 853 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 854 { \
RyoheiHagimoto 0:0e0631af0305 855 __m128i smask = _mm_set1_##suffix(sbit); \
RyoheiHagimoto 0:0e0631af0305 856 __m128i not_mask = _mm_set1_epi32(-1); \
RyoheiHagimoto 0:0e0631af0305 857 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
RyoheiHagimoto 0:0e0631af0305 858 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
RyoheiHagimoto 0:0e0631af0305 859 } \
RyoheiHagimoto 0:0e0631af0305 860 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 861 { \
RyoheiHagimoto 0:0e0631af0305 862 return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
RyoheiHagimoto 0:0e0631af0305 863 } \
RyoheiHagimoto 0:0e0631af0305 864 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 865 { \
RyoheiHagimoto 0:0e0631af0305 866 return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
RyoheiHagimoto 0:0e0631af0305 867 } \
RyoheiHagimoto 0:0e0631af0305 868 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 869 { \
RyoheiHagimoto 0:0e0631af0305 870 __m128i not_mask = _mm_set1_epi32(-1); \
RyoheiHagimoto 0:0e0631af0305 871 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
RyoheiHagimoto 0:0e0631af0305 872 } \
RyoheiHagimoto 0:0e0631af0305 873 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 874 { \
RyoheiHagimoto 0:0e0631af0305 875 __m128i not_mask = _mm_set1_epi32(-1); \
RyoheiHagimoto 0:0e0631af0305 876 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
RyoheiHagimoto 0:0e0631af0305 877 }
RyoheiHagimoto 0:0e0631af0305 878
RyoheiHagimoto 0:0e0631af0305 879 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
RyoheiHagimoto 0:0e0631af0305 880 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
RyoheiHagimoto 0:0e0631af0305 881 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
RyoheiHagimoto 0:0e0631af0305 882
RyoheiHagimoto 0:0e0631af0305 883 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
RyoheiHagimoto 0:0e0631af0305 884 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 885 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 886 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 887 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 888 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 889 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 890 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 891 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 892 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 893 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
RyoheiHagimoto 0:0e0631af0305 894 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 895 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
RyoheiHagimoto 0:0e0631af0305 896
RyoheiHagimoto 0:0e0631af0305 897 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
RyoheiHagimoto 0:0e0631af0305 898 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
RyoheiHagimoto 0:0e0631af0305 899
RyoheiHagimoto 0:0e0631af0305 900 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
RyoheiHagimoto 0:0e0631af0305 901 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
RyoheiHagimoto 0:0e0631af0305 902 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
RyoheiHagimoto 0:0e0631af0305 903 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
RyoheiHagimoto 0:0e0631af0305 904 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
RyoheiHagimoto 0:0e0631af0305 905 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
RyoheiHagimoto 0:0e0631af0305 906 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
RyoheiHagimoto 0:0e0631af0305 907 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
RyoheiHagimoto 0:0e0631af0305 908
RyoheiHagimoto 0:0e0631af0305 909 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
RyoheiHagimoto 0:0e0631af0305 910 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
RyoheiHagimoto 0:0e0631af0305 911 { \
RyoheiHagimoto 0:0e0631af0305 912 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
RyoheiHagimoto 0:0e0631af0305 913 } \
RyoheiHagimoto 0:0e0631af0305 914 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
RyoheiHagimoto 0:0e0631af0305 915 { \
RyoheiHagimoto 0:0e0631af0305 916 __m128i smask = _mm_set1_epi32(smask32); \
RyoheiHagimoto 0:0e0631af0305 917 __m128i a1 = _mm_xor_si128(a.val, smask); \
RyoheiHagimoto 0:0e0631af0305 918 __m128i b1 = _mm_xor_si128(b.val, smask); \
RyoheiHagimoto 0:0e0631af0305 919 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
RyoheiHagimoto 0:0e0631af0305 920 }
RyoheiHagimoto 0:0e0631af0305 921
RyoheiHagimoto 0:0e0631af0305 922 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
RyoheiHagimoto 0:0e0631af0305 923 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
RyoheiHagimoto 0:0e0631af0305 924
RyoheiHagimoto 0:0e0631af0305 925 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
RyoheiHagimoto 0:0e0631af0305 926 {
RyoheiHagimoto 0:0e0631af0305 927 return v_max(a, b) - v_min(a, b);
RyoheiHagimoto 0:0e0631af0305 928 }
RyoheiHagimoto 0:0e0631af0305 929
RyoheiHagimoto 0:0e0631af0305 930 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
RyoheiHagimoto 0:0e0631af0305 931 {
RyoheiHagimoto 0:0e0631af0305 932 __m128i d = _mm_sub_epi32(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 933 __m128i m = _mm_cmpgt_epi32(b.val, a.val);
RyoheiHagimoto 0:0e0631af0305 934 return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
RyoheiHagimoto 0:0e0631af0305 935 }
RyoheiHagimoto 0:0e0631af0305 936
RyoheiHagimoto 0:0e0631af0305 937 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
RyoheiHagimoto 0:0e0631af0305 938 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 939 { \
RyoheiHagimoto 0:0e0631af0305 940 _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
RyoheiHagimoto 0:0e0631af0305 941 return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
RyoheiHagimoto 0:0e0631af0305 942 } \
RyoheiHagimoto 0:0e0631af0305 943 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 944 { \
RyoheiHagimoto 0:0e0631af0305 945 _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
RyoheiHagimoto 0:0e0631af0305 946 return _Tpvec(_mm_sqrt_##suffix(res)); \
RyoheiHagimoto 0:0e0631af0305 947 } \
RyoheiHagimoto 0:0e0631af0305 948 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 949 { \
RyoheiHagimoto 0:0e0631af0305 950 _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
RyoheiHagimoto 0:0e0631af0305 951 return _Tpvec(res); \
RyoheiHagimoto 0:0e0631af0305 952 } \
RyoheiHagimoto 0:0e0631af0305 953 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
RyoheiHagimoto 0:0e0631af0305 954 { \
RyoheiHagimoto 0:0e0631af0305 955 return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
RyoheiHagimoto 0:0e0631af0305 956 }
RyoheiHagimoto 0:0e0631af0305 957
RyoheiHagimoto 0:0e0631af0305 958 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
RyoheiHagimoto 0:0e0631af0305 959 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
RyoheiHagimoto 0:0e0631af0305 960
RyoheiHagimoto 0:0e0631af0305 961 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
RyoheiHagimoto 0:0e0631af0305 962 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
RyoheiHagimoto 0:0e0631af0305 963 { \
RyoheiHagimoto 0:0e0631af0305 964 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 965 } \
RyoheiHagimoto 0:0e0631af0305 966 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
RyoheiHagimoto 0:0e0631af0305 967 { \
RyoheiHagimoto 0:0e0631af0305 968 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 969 } \
RyoheiHagimoto 0:0e0631af0305 970 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
RyoheiHagimoto 0:0e0631af0305 971 { \
RyoheiHagimoto 0:0e0631af0305 972 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 973 } \
RyoheiHagimoto 0:0e0631af0305 974 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
RyoheiHagimoto 0:0e0631af0305 975 { \
RyoheiHagimoto 0:0e0631af0305 976 return _Tpsvec(srai(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 977 } \
RyoheiHagimoto 0:0e0631af0305 978 template<int imm> \
RyoheiHagimoto 0:0e0631af0305 979 inline _Tpuvec v_shl(const _Tpuvec& a) \
RyoheiHagimoto 0:0e0631af0305 980 { \
RyoheiHagimoto 0:0e0631af0305 981 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 982 } \
RyoheiHagimoto 0:0e0631af0305 983 template<int imm> \
RyoheiHagimoto 0:0e0631af0305 984 inline _Tpsvec v_shl(const _Tpsvec& a) \
RyoheiHagimoto 0:0e0631af0305 985 { \
RyoheiHagimoto 0:0e0631af0305 986 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 987 } \
RyoheiHagimoto 0:0e0631af0305 988 template<int imm> \
RyoheiHagimoto 0:0e0631af0305 989 inline _Tpuvec v_shr(const _Tpuvec& a) \
RyoheiHagimoto 0:0e0631af0305 990 { \
RyoheiHagimoto 0:0e0631af0305 991 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 992 } \
RyoheiHagimoto 0:0e0631af0305 993 template<int imm> \
RyoheiHagimoto 0:0e0631af0305 994 inline _Tpsvec v_shr(const _Tpsvec& a) \
RyoheiHagimoto 0:0e0631af0305 995 { \
RyoheiHagimoto 0:0e0631af0305 996 return _Tpsvec(srai(a.val, imm)); \
RyoheiHagimoto 0:0e0631af0305 997 }
RyoheiHagimoto 0:0e0631af0305 998
RyoheiHagimoto 0:0e0631af0305 999 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
RyoheiHagimoto 0:0e0631af0305 1000 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
RyoheiHagimoto 0:0e0631af0305 1001 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
RyoheiHagimoto 0:0e0631af0305 1002
RyoheiHagimoto 0:0e0631af0305 1003 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
RyoheiHagimoto 0:0e0631af0305 1004 inline _Tpvec v_load(const _Tp* ptr) \
RyoheiHagimoto 0:0e0631af0305 1005 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
RyoheiHagimoto 0:0e0631af0305 1006 inline _Tpvec v_load_aligned(const _Tp* ptr) \
RyoheiHagimoto 0:0e0631af0305 1007 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
RyoheiHagimoto 0:0e0631af0305 1008 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
RyoheiHagimoto 0:0e0631af0305 1009 { \
RyoheiHagimoto 0:0e0631af0305 1010 return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
RyoheiHagimoto 0:0e0631af0305 1011 _mm_loadl_epi64((const __m128i*)ptr1))); \
RyoheiHagimoto 0:0e0631af0305 1012 } \
RyoheiHagimoto 0:0e0631af0305 1013 inline void v_store(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1014 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
RyoheiHagimoto 0:0e0631af0305 1015 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1016 { _mm_store_si128((__m128i*)ptr, a.val); } \
RyoheiHagimoto 0:0e0631af0305 1017 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1018 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
RyoheiHagimoto 0:0e0631af0305 1019 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1020 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
RyoheiHagimoto 0:0e0631af0305 1021
RyoheiHagimoto 0:0e0631af0305 1022 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
RyoheiHagimoto 0:0e0631af0305 1023 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
RyoheiHagimoto 0:0e0631af0305 1024 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
RyoheiHagimoto 0:0e0631af0305 1025 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
RyoheiHagimoto 0:0e0631af0305 1026 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
RyoheiHagimoto 0:0e0631af0305 1027 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
RyoheiHagimoto 0:0e0631af0305 1028 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
RyoheiHagimoto 0:0e0631af0305 1029 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
RyoheiHagimoto 0:0e0631af0305 1030
RyoheiHagimoto 0:0e0631af0305 1031 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
RyoheiHagimoto 0:0e0631af0305 1032 inline _Tpvec v_load(const _Tp* ptr) \
RyoheiHagimoto 0:0e0631af0305 1033 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
RyoheiHagimoto 0:0e0631af0305 1034 inline _Tpvec v_load_aligned(const _Tp* ptr) \
RyoheiHagimoto 0:0e0631af0305 1035 { return _Tpvec(_mm_load_##suffix(ptr)); } \
RyoheiHagimoto 0:0e0631af0305 1036 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
RyoheiHagimoto 0:0e0631af0305 1037 { \
RyoheiHagimoto 0:0e0631af0305 1038 return _Tpvec(_mm_castsi128_##suffix( \
RyoheiHagimoto 0:0e0631af0305 1039 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
RyoheiHagimoto 0:0e0631af0305 1040 _mm_loadl_epi64((const __m128i*)ptr1)))); \
RyoheiHagimoto 0:0e0631af0305 1041 } \
RyoheiHagimoto 0:0e0631af0305 1042 inline void v_store(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1043 { _mm_storeu_##suffix(ptr, a.val); } \
RyoheiHagimoto 0:0e0631af0305 1044 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1045 { _mm_store_##suffix(ptr, a.val); } \
RyoheiHagimoto 0:0e0631af0305 1046 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1047 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
RyoheiHagimoto 0:0e0631af0305 1048 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1049 { \
RyoheiHagimoto 0:0e0631af0305 1050 __m128i a1 = _mm_cast##suffix##_si128(a.val); \
RyoheiHagimoto 0:0e0631af0305 1051 _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
RyoheiHagimoto 0:0e0631af0305 1052 }
RyoheiHagimoto 0:0e0631af0305 1053
RyoheiHagimoto 0:0e0631af0305 1054 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
RyoheiHagimoto 0:0e0631af0305 1055 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
RyoheiHagimoto 0:0e0631af0305 1056
RyoheiHagimoto 0:0e0631af0305 1057 #if defined(HAVE_FP16)
RyoheiHagimoto 0:0e0631af0305 1058 inline v_float16x4 v_load_f16(const short* ptr)
RyoheiHagimoto 0:0e0631af0305 1059 { return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
RyoheiHagimoto 0:0e0631af0305 1060 inline void v_store_f16(short* ptr, v_float16x4& a)
RyoheiHagimoto 0:0e0631af0305 1061 { _mm_storel_epi64((__m128i*)ptr, a.val); }
RyoheiHagimoto 0:0e0631af0305 1062 #endif
RyoheiHagimoto 0:0e0631af0305 1063
RyoheiHagimoto 0:0e0631af0305 1064 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
RyoheiHagimoto 0:0e0631af0305 1065 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1066 { \
RyoheiHagimoto 0:0e0631af0305 1067 __m128i val = a.val; \
RyoheiHagimoto 0:0e0631af0305 1068 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
RyoheiHagimoto 0:0e0631af0305 1069 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
RyoheiHagimoto 0:0e0631af0305 1070 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
RyoheiHagimoto 0:0e0631af0305 1071 return (scalartype)_mm_cvtsi128_si32(val); \
RyoheiHagimoto 0:0e0631af0305 1072 } \
RyoheiHagimoto 0:0e0631af0305 1073 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1074 { \
RyoheiHagimoto 0:0e0631af0305 1075 __m128i val = a.val; \
RyoheiHagimoto 0:0e0631af0305 1076 __m128i smask = _mm_set1_epi16(sbit); \
RyoheiHagimoto 0:0e0631af0305 1077 val = _mm_xor_si128(val, smask); \
RyoheiHagimoto 0:0e0631af0305 1078 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
RyoheiHagimoto 0:0e0631af0305 1079 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
RyoheiHagimoto 0:0e0631af0305 1080 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
RyoheiHagimoto 0:0e0631af0305 1081 return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
RyoheiHagimoto 0:0e0631af0305 1082 }
RyoheiHagimoto 0:0e0631af0305 1083 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
RyoheiHagimoto 0:0e0631af0305 1084 inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1085 { \
RyoheiHagimoto 0:0e0631af0305 1086 __m128i val = a.val; \
RyoheiHagimoto 0:0e0631af0305 1087 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
RyoheiHagimoto 0:0e0631af0305 1088 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
RyoheiHagimoto 0:0e0631af0305 1089 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
RyoheiHagimoto 0:0e0631af0305 1090 return (scalartype)_mm_cvtsi128_si32(val); \
RyoheiHagimoto 0:0e0631af0305 1091 } \
RyoheiHagimoto 0:0e0631af0305 1092 inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1093 { \
RyoheiHagimoto 0:0e0631af0305 1094 __m128i val = a.val; \
RyoheiHagimoto 0:0e0631af0305 1095 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
RyoheiHagimoto 0:0e0631af0305 1096 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
RyoheiHagimoto 0:0e0631af0305 1097 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
RyoheiHagimoto 0:0e0631af0305 1098 return (unsigned scalartype)_mm_cvtsi128_si32(val); \
RyoheiHagimoto 0:0e0631af0305 1099 }
RyoheiHagimoto 0:0e0631af0305 1100 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
RyoheiHagimoto 0:0e0631af0305 1101 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
RyoheiHagimoto 0:0e0631af0305 1102 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
RyoheiHagimoto 0:0e0631af0305 1103
RyoheiHagimoto 0:0e0631af0305 1104 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
RyoheiHagimoto 0:0e0631af0305 1105 inline scalartype v_reduce_##func(const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1106 { \
RyoheiHagimoto 0:0e0631af0305 1107 scalartype CV_DECL_ALIGNED(16) buf[4]; \
RyoheiHagimoto 0:0e0631af0305 1108 v_store_aligned(buf, a); \
RyoheiHagimoto 0:0e0631af0305 1109 scalartype s0 = scalar_func(buf[0], buf[1]); \
RyoheiHagimoto 0:0e0631af0305 1110 scalartype s1 = scalar_func(buf[2], buf[3]); \
RyoheiHagimoto 0:0e0631af0305 1111 return scalar_func(s0, s1); \
RyoheiHagimoto 0:0e0631af0305 1112 }
RyoheiHagimoto 0:0e0631af0305 1113
RyoheiHagimoto 0:0e0631af0305 1114 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
RyoheiHagimoto 0:0e0631af0305 1115 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
RyoheiHagimoto 0:0e0631af0305 1116 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
RyoheiHagimoto 0:0e0631af0305 1117 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
RyoheiHagimoto 0:0e0631af0305 1118 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
RyoheiHagimoto 0:0e0631af0305 1119 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
RyoheiHagimoto 0:0e0631af0305 1120 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
RyoheiHagimoto 0:0e0631af0305 1121 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
RyoheiHagimoto 0:0e0631af0305 1122 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
RyoheiHagimoto 0:0e0631af0305 1123
RyoheiHagimoto 0:0e0631af0305 1124 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
RyoheiHagimoto 0:0e0631af0305 1125 inline int v_signmask(const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1126 { \
RyoheiHagimoto 0:0e0631af0305 1127 return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
RyoheiHagimoto 0:0e0631af0305 1128 } \
RyoheiHagimoto 0:0e0631af0305 1129 inline bool v_check_all(const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1130 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
RyoheiHagimoto 0:0e0631af0305 1131 inline bool v_check_any(const _Tpvec& a) \
RyoheiHagimoto 0:0e0631af0305 1132 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
RyoheiHagimoto 0:0e0631af0305 1133
RyoheiHagimoto 0:0e0631af0305 1134 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
RyoheiHagimoto 0:0e0631af0305 1135 inline __m128i v_packq_epi32(__m128i a)
RyoheiHagimoto 0:0e0631af0305 1136 {
RyoheiHagimoto 0:0e0631af0305 1137 __m128i b = _mm_packs_epi32(a, a);
RyoheiHagimoto 0:0e0631af0305 1138 return _mm_packs_epi16(b, b);
RyoheiHagimoto 0:0e0631af0305 1139 }
RyoheiHagimoto 0:0e0631af0305 1140
RyoheiHagimoto 0:0e0631af0305 1141 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
RyoheiHagimoto 0:0e0631af0305 1142 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
RyoheiHagimoto 0:0e0631af0305 1143 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
RyoheiHagimoto 0:0e0631af0305 1144 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
RyoheiHagimoto 0:0e0631af0305 1145 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
RyoheiHagimoto 0:0e0631af0305 1146 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
RyoheiHagimoto 0:0e0631af0305 1147 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
RyoheiHagimoto 0:0e0631af0305 1148 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
RyoheiHagimoto 0:0e0631af0305 1149
RyoheiHagimoto 0:0e0631af0305 1150 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
RyoheiHagimoto 0:0e0631af0305 1151 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 1152 { \
RyoheiHagimoto 0:0e0631af0305 1153 return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
RyoheiHagimoto 0:0e0631af0305 1154 }
RyoheiHagimoto 0:0e0631af0305 1155
RyoheiHagimoto 0:0e0631af0305 1156 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
RyoheiHagimoto 0:0e0631af0305 1157 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
RyoheiHagimoto 0:0e0631af0305 1158 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
RyoheiHagimoto 0:0e0631af0305 1159 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
RyoheiHagimoto 0:0e0631af0305 1160 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
RyoheiHagimoto 0:0e0631af0305 1161 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
RyoheiHagimoto 0:0e0631af0305 1162 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
RyoheiHagimoto 0:0e0631af0305 1163 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
RyoheiHagimoto 0:0e0631af0305 1164 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
RyoheiHagimoto 0:0e0631af0305 1165 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
RyoheiHagimoto 0:0e0631af0305 1166
RyoheiHagimoto 0:0e0631af0305 1167 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
RyoheiHagimoto 0:0e0631af0305 1168 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
RyoheiHagimoto 0:0e0631af0305 1169 { \
RyoheiHagimoto 0:0e0631af0305 1170 __m128i z = _mm_setzero_si128(); \
RyoheiHagimoto 0:0e0631af0305 1171 b0.val = _mm_unpacklo_##suffix(a.val, z); \
RyoheiHagimoto 0:0e0631af0305 1172 b1.val = _mm_unpackhi_##suffix(a.val, z); \
RyoheiHagimoto 0:0e0631af0305 1173 } \
RyoheiHagimoto 0:0e0631af0305 1174 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
RyoheiHagimoto 0:0e0631af0305 1175 { \
RyoheiHagimoto 0:0e0631af0305 1176 __m128i z = _mm_setzero_si128(); \
RyoheiHagimoto 0:0e0631af0305 1177 return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
RyoheiHagimoto 0:0e0631af0305 1178 } \
RyoheiHagimoto 0:0e0631af0305 1179 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
RyoheiHagimoto 0:0e0631af0305 1180 { \
RyoheiHagimoto 0:0e0631af0305 1181 b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
RyoheiHagimoto 0:0e0631af0305 1182 b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
RyoheiHagimoto 0:0e0631af0305 1183 } \
RyoheiHagimoto 0:0e0631af0305 1184 inline _Tpwsvec v_load_expand(const _Tps* ptr) \
RyoheiHagimoto 0:0e0631af0305 1185 { \
RyoheiHagimoto 0:0e0631af0305 1186 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
RyoheiHagimoto 0:0e0631af0305 1187 return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
RyoheiHagimoto 0:0e0631af0305 1188 }
RyoheiHagimoto 0:0e0631af0305 1189
RyoheiHagimoto 0:0e0631af0305 1190 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
RyoheiHagimoto 0:0e0631af0305 1191 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
RyoheiHagimoto 0:0e0631af0305 1192
RyoheiHagimoto 0:0e0631af0305 1193 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
RyoheiHagimoto 0:0e0631af0305 1194 {
RyoheiHagimoto 0:0e0631af0305 1195 __m128i z = _mm_setzero_si128();
RyoheiHagimoto 0:0e0631af0305 1196 b0.val = _mm_unpacklo_epi32(a.val, z);
RyoheiHagimoto 0:0e0631af0305 1197 b1.val = _mm_unpackhi_epi32(a.val, z);
RyoheiHagimoto 0:0e0631af0305 1198 }
RyoheiHagimoto 0:0e0631af0305 1199 inline v_uint64x2 v_load_expand(const unsigned* ptr)
RyoheiHagimoto 0:0e0631af0305 1200 {
RyoheiHagimoto 0:0e0631af0305 1201 __m128i z = _mm_setzero_si128();
RyoheiHagimoto 0:0e0631af0305 1202 return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
RyoheiHagimoto 0:0e0631af0305 1203 }
RyoheiHagimoto 0:0e0631af0305 1204 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
RyoheiHagimoto 0:0e0631af0305 1205 {
RyoheiHagimoto 0:0e0631af0305 1206 __m128i s = _mm_srai_epi32(a.val, 31);
RyoheiHagimoto 0:0e0631af0305 1207 b0.val = _mm_unpacklo_epi32(a.val, s);
RyoheiHagimoto 0:0e0631af0305 1208 b1.val = _mm_unpackhi_epi32(a.val, s);
RyoheiHagimoto 0:0e0631af0305 1209 }
RyoheiHagimoto 0:0e0631af0305 1210 inline v_int64x2 v_load_expand(const int* ptr)
RyoheiHagimoto 0:0e0631af0305 1211 {
RyoheiHagimoto 0:0e0631af0305 1212 __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
RyoheiHagimoto 0:0e0631af0305 1213 __m128i s = _mm_srai_epi32(a, 31);
RyoheiHagimoto 0:0e0631af0305 1214 return v_int64x2(_mm_unpacklo_epi32(a, s));
RyoheiHagimoto 0:0e0631af0305 1215 }
RyoheiHagimoto 0:0e0631af0305 1216
RyoheiHagimoto 0:0e0631af0305 1217 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
RyoheiHagimoto 0:0e0631af0305 1218 {
RyoheiHagimoto 0:0e0631af0305 1219 __m128i z = _mm_setzero_si128();
RyoheiHagimoto 0:0e0631af0305 1220 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
RyoheiHagimoto 0:0e0631af0305 1221 return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
RyoheiHagimoto 0:0e0631af0305 1222 }
RyoheiHagimoto 0:0e0631af0305 1223
RyoheiHagimoto 0:0e0631af0305 1224 inline v_int32x4 v_load_expand_q(const schar* ptr)
RyoheiHagimoto 0:0e0631af0305 1225 {
RyoheiHagimoto 0:0e0631af0305 1226 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
RyoheiHagimoto 0:0e0631af0305 1227 a = _mm_unpacklo_epi8(a, a);
RyoheiHagimoto 0:0e0631af0305 1228 a = _mm_unpacklo_epi8(a, a);
RyoheiHagimoto 0:0e0631af0305 1229 return v_int32x4(_mm_srai_epi32(a, 24));
RyoheiHagimoto 0:0e0631af0305 1230 }
RyoheiHagimoto 0:0e0631af0305 1231
RyoheiHagimoto 0:0e0631af0305 1232 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
RyoheiHagimoto 0:0e0631af0305 1233 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
RyoheiHagimoto 0:0e0631af0305 1234 { \
RyoheiHagimoto 0:0e0631af0305 1235 b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
RyoheiHagimoto 0:0e0631af0305 1236 b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
RyoheiHagimoto 0:0e0631af0305 1237 } \
RyoheiHagimoto 0:0e0631af0305 1238 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 1239 { \
RyoheiHagimoto 0:0e0631af0305 1240 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
RyoheiHagimoto 0:0e0631af0305 1241 return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
RyoheiHagimoto 0:0e0631af0305 1242 } \
RyoheiHagimoto 0:0e0631af0305 1243 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
RyoheiHagimoto 0:0e0631af0305 1244 { \
RyoheiHagimoto 0:0e0631af0305 1245 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
RyoheiHagimoto 0:0e0631af0305 1246 return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
RyoheiHagimoto 0:0e0631af0305 1247 } \
RyoheiHagimoto 0:0e0631af0305 1248 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
RyoheiHagimoto 0:0e0631af0305 1249 { \
RyoheiHagimoto 0:0e0631af0305 1250 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
RyoheiHagimoto 0:0e0631af0305 1251 c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
RyoheiHagimoto 0:0e0631af0305 1252 d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
RyoheiHagimoto 0:0e0631af0305 1253 }
RyoheiHagimoto 0:0e0631af0305 1254
RyoheiHagimoto 0:0e0631af0305 1255 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1256 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1257 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1258 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1259 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1260 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1261 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
RyoheiHagimoto 0:0e0631af0305 1262 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
RyoheiHagimoto 0:0e0631af0305 1263
RyoheiHagimoto 0:0e0631af0305 1264 template<int s, typename _Tpvec>
RyoheiHagimoto 0:0e0631af0305 1265 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
RyoheiHagimoto 0:0e0631af0305 1266 {
RyoheiHagimoto 0:0e0631af0305 1267 const int w = sizeof(typename _Tpvec::lane_type);
RyoheiHagimoto 0:0e0631af0305 1268 const int n = _Tpvec::nlanes;
RyoheiHagimoto 0:0e0631af0305 1269 __m128i ra, rb;
RyoheiHagimoto 0:0e0631af0305 1270 ra = _mm_srli_si128(a.val, s*w);
RyoheiHagimoto 0:0e0631af0305 1271 rb = _mm_slli_si128(b.val, (n-s)*w);
RyoheiHagimoto 0:0e0631af0305 1272 return _Tpvec(_mm_or_si128(ra, rb));
RyoheiHagimoto 0:0e0631af0305 1273 }
RyoheiHagimoto 0:0e0631af0305 1274
RyoheiHagimoto 0:0e0631af0305 1275 inline v_int32x4 v_round(const v_float32x4& a)
RyoheiHagimoto 0:0e0631af0305 1276 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
RyoheiHagimoto 0:0e0631af0305 1277
RyoheiHagimoto 0:0e0631af0305 1278 inline v_int32x4 v_floor(const v_float32x4& a)
RyoheiHagimoto 0:0e0631af0305 1279 {
RyoheiHagimoto 0:0e0631af0305 1280 __m128i a1 = _mm_cvtps_epi32(a.val);
RyoheiHagimoto 0:0e0631af0305 1281 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
RyoheiHagimoto 0:0e0631af0305 1282 return v_int32x4(_mm_add_epi32(a1, mask));
RyoheiHagimoto 0:0e0631af0305 1283 }
RyoheiHagimoto 0:0e0631af0305 1284
RyoheiHagimoto 0:0e0631af0305 1285 inline v_int32x4 v_ceil(const v_float32x4& a)
RyoheiHagimoto 0:0e0631af0305 1286 {
RyoheiHagimoto 0:0e0631af0305 1287 __m128i a1 = _mm_cvtps_epi32(a.val);
RyoheiHagimoto 0:0e0631af0305 1288 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
RyoheiHagimoto 0:0e0631af0305 1289 return v_int32x4(_mm_sub_epi32(a1, mask));
RyoheiHagimoto 0:0e0631af0305 1290 }
RyoheiHagimoto 0:0e0631af0305 1291
RyoheiHagimoto 0:0e0631af0305 1292 inline v_int32x4 v_trunc(const v_float32x4& a)
RyoheiHagimoto 0:0e0631af0305 1293 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
RyoheiHagimoto 0:0e0631af0305 1294
RyoheiHagimoto 0:0e0631af0305 1295 inline v_int32x4 v_round(const v_float64x2& a)
RyoheiHagimoto 0:0e0631af0305 1296 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
RyoheiHagimoto 0:0e0631af0305 1297
RyoheiHagimoto 0:0e0631af0305 1298 inline v_int32x4 v_floor(const v_float64x2& a)
RyoheiHagimoto 0:0e0631af0305 1299 {
RyoheiHagimoto 0:0e0631af0305 1300 __m128i a1 = _mm_cvtpd_epi32(a.val);
RyoheiHagimoto 0:0e0631af0305 1301 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
RyoheiHagimoto 0:0e0631af0305 1302 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
RyoheiHagimoto 0:0e0631af0305 1303 return v_int32x4(_mm_add_epi32(a1, mask));
RyoheiHagimoto 0:0e0631af0305 1304 }
RyoheiHagimoto 0:0e0631af0305 1305
RyoheiHagimoto 0:0e0631af0305 1306 inline v_int32x4 v_ceil(const v_float64x2& a)
RyoheiHagimoto 0:0e0631af0305 1307 {
RyoheiHagimoto 0:0e0631af0305 1308 __m128i a1 = _mm_cvtpd_epi32(a.val);
RyoheiHagimoto 0:0e0631af0305 1309 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
RyoheiHagimoto 0:0e0631af0305 1310 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
RyoheiHagimoto 0:0e0631af0305 1311 return v_int32x4(_mm_sub_epi32(a1, mask));
RyoheiHagimoto 0:0e0631af0305 1312 }
RyoheiHagimoto 0:0e0631af0305 1313
RyoheiHagimoto 0:0e0631af0305 1314 inline v_int32x4 v_trunc(const v_float64x2& a)
RyoheiHagimoto 0:0e0631af0305 1315 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
RyoheiHagimoto 0:0e0631af0305 1316
RyoheiHagimoto 0:0e0631af0305 1317 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
RyoheiHagimoto 0:0e0631af0305 1318 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
RyoheiHagimoto 0:0e0631af0305 1319 const _Tpvec& a2, const _Tpvec& a3, \
RyoheiHagimoto 0:0e0631af0305 1320 _Tpvec& b0, _Tpvec& b1, \
RyoheiHagimoto 0:0e0631af0305 1321 _Tpvec& b2, _Tpvec& b3) \
RyoheiHagimoto 0:0e0631af0305 1322 { \
RyoheiHagimoto 0:0e0631af0305 1323 __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
RyoheiHagimoto 0:0e0631af0305 1324 __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
RyoheiHagimoto 0:0e0631af0305 1325 __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
RyoheiHagimoto 0:0e0631af0305 1326 __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
RyoheiHagimoto 0:0e0631af0305 1327 \
RyoheiHagimoto 0:0e0631af0305 1328 b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
RyoheiHagimoto 0:0e0631af0305 1329 b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
RyoheiHagimoto 0:0e0631af0305 1330 b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
RyoheiHagimoto 0:0e0631af0305 1331 b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
RyoheiHagimoto 0:0e0631af0305 1332 }
RyoheiHagimoto 0:0e0631af0305 1333
RyoheiHagimoto 0:0e0631af0305 1334 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1335 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
RyoheiHagimoto 0:0e0631af0305 1336 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
RyoheiHagimoto 0:0e0631af0305 1337
RyoheiHagimoto 0:0e0631af0305 1338 // adopted from sse_utils.hpp
RyoheiHagimoto 0:0e0631af0305 1339 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
RyoheiHagimoto 0:0e0631af0305 1340 {
RyoheiHagimoto 0:0e0631af0305 1341 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
RyoheiHagimoto 0:0e0631af0305 1342 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
RyoheiHagimoto 0:0e0631af0305 1343 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
RyoheiHagimoto 0:0e0631af0305 1344
RyoheiHagimoto 0:0e0631af0305 1345 __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
RyoheiHagimoto 0:0e0631af0305 1346 __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
RyoheiHagimoto 0:0e0631af0305 1347 __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
RyoheiHagimoto 0:0e0631af0305 1348
RyoheiHagimoto 0:0e0631af0305 1349 __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
RyoheiHagimoto 0:0e0631af0305 1350 __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
RyoheiHagimoto 0:0e0631af0305 1351 __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
RyoheiHagimoto 0:0e0631af0305 1352
RyoheiHagimoto 0:0e0631af0305 1353 __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
RyoheiHagimoto 0:0e0631af0305 1354 __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
RyoheiHagimoto 0:0e0631af0305 1355 __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
RyoheiHagimoto 0:0e0631af0305 1356
RyoheiHagimoto 0:0e0631af0305 1357 a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
RyoheiHagimoto 0:0e0631af0305 1358 b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
RyoheiHagimoto 0:0e0631af0305 1359 c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
RyoheiHagimoto 0:0e0631af0305 1360 }
RyoheiHagimoto 0:0e0631af0305 1361
RyoheiHagimoto 0:0e0631af0305 1362 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
RyoheiHagimoto 0:0e0631af0305 1363 {
RyoheiHagimoto 0:0e0631af0305 1364 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
RyoheiHagimoto 0:0e0631af0305 1365 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
RyoheiHagimoto 0:0e0631af0305 1366 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
RyoheiHagimoto 0:0e0631af0305 1367 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
RyoheiHagimoto 0:0e0631af0305 1368
RyoheiHagimoto 0:0e0631af0305 1369 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
RyoheiHagimoto 0:0e0631af0305 1370 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
RyoheiHagimoto 0:0e0631af0305 1371 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
RyoheiHagimoto 0:0e0631af0305 1372 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
RyoheiHagimoto 0:0e0631af0305 1373
RyoheiHagimoto 0:0e0631af0305 1374 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
RyoheiHagimoto 0:0e0631af0305 1375 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
RyoheiHagimoto 0:0e0631af0305 1376 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
RyoheiHagimoto 0:0e0631af0305 1377 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
RyoheiHagimoto 0:0e0631af0305 1378
RyoheiHagimoto 0:0e0631af0305 1379 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
RyoheiHagimoto 0:0e0631af0305 1380 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
RyoheiHagimoto 0:0e0631af0305 1381 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
RyoheiHagimoto 0:0e0631af0305 1382 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
RyoheiHagimoto 0:0e0631af0305 1383
RyoheiHagimoto 0:0e0631af0305 1384 a.val = _mm_unpacklo_epi8(v0, v1);
RyoheiHagimoto 0:0e0631af0305 1385 b.val = _mm_unpackhi_epi8(v0, v1);
RyoheiHagimoto 0:0e0631af0305 1386 c.val = _mm_unpacklo_epi8(v2, v3);
RyoheiHagimoto 0:0e0631af0305 1387 d.val = _mm_unpackhi_epi8(v2, v3);
RyoheiHagimoto 0:0e0631af0305 1388 }
RyoheiHagimoto 0:0e0631af0305 1389
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    // Split 24 interleaved ushorts (a0 b0 c0 a1 b1 c1 ...) into the three
    // channel vectors a, b and c.
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    // Three rounds of the same pattern: unpack the low half of one register
    // against the high half (moved down with _mm_unpackhi_epi64) of another;
    // each round groups same-channel elements until one channel per register
    // remains. (16-bit lanes need one round fewer than the 8-bit version.)
    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}
RyoheiHagimoto 0:0e0631af0305 1408
RyoheiHagimoto 0:0e0631af0305 1409 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
RyoheiHagimoto 0:0e0631af0305 1410 {
RyoheiHagimoto 0:0e0631af0305 1411 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
RyoheiHagimoto 0:0e0631af0305 1412 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
RyoheiHagimoto 0:0e0631af0305 1413 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
RyoheiHagimoto 0:0e0631af0305 1414 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
RyoheiHagimoto 0:0e0631af0305 1415
RyoheiHagimoto 0:0e0631af0305 1416 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
RyoheiHagimoto 0:0e0631af0305 1417 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
RyoheiHagimoto 0:0e0631af0305 1418 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
RyoheiHagimoto 0:0e0631af0305 1419 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
RyoheiHagimoto 0:0e0631af0305 1420
RyoheiHagimoto 0:0e0631af0305 1421 u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
RyoheiHagimoto 0:0e0631af0305 1422 u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
RyoheiHagimoto 0:0e0631af0305 1423 u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
RyoheiHagimoto 0:0e0631af0305 1424 u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
RyoheiHagimoto 0:0e0631af0305 1425
RyoheiHagimoto 0:0e0631af0305 1426 a.val = _mm_unpacklo_epi16(u0, u1);
RyoheiHagimoto 0:0e0631af0305 1427 b.val = _mm_unpackhi_epi16(u0, u1);
RyoheiHagimoto 0:0e0631af0305 1428 c.val = _mm_unpacklo_epi16(u2, u3);
RyoheiHagimoto 0:0e0631af0305 1429 d.val = _mm_unpackhi_epi16(u2, u3);
RyoheiHagimoto 0:0e0631af0305 1430 }
RyoheiHagimoto 0:0e0631af0305 1431
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    // Split 12 interleaved unsigneds (a0 b0 c0 a1 b1 c1 ...) into the three
    // channel vectors a, b and c.
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    // Two rounds of the same pattern: unpack the low half of one register
    // against the high half (moved down with _mm_unpackhi_epi64) of another;
    // after the second round every register holds a single channel.
    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}
RyoheiHagimoto 0:0e0631af0305 1446
RyoheiHagimoto 0:0e0631af0305 1447 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
RyoheiHagimoto 0:0e0631af0305 1448 {
RyoheiHagimoto 0:0e0631af0305 1449 v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
RyoheiHagimoto 0:0e0631af0305 1450 v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
RyoheiHagimoto 0:0e0631af0305 1451 v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
RyoheiHagimoto 0:0e0631af0305 1452 v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
RyoheiHagimoto 0:0e0631af0305 1453
RyoheiHagimoto 0:0e0631af0305 1454 v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
RyoheiHagimoto 0:0e0631af0305 1455 }
RyoheiHagimoto 0:0e0631af0305 1456
RyoheiHagimoto 0:0e0631af0305 1457 // 2-channel, float only
RyoheiHagimoto 0:0e0631af0305 1458 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
RyoheiHagimoto 0:0e0631af0305 1459 {
RyoheiHagimoto 0:0e0631af0305 1460 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
RyoheiHagimoto 0:0e0631af0305 1461
RyoheiHagimoto 0:0e0631af0305 1462 __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
RyoheiHagimoto 0:0e0631af0305 1463 __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
RyoheiHagimoto 0:0e0631af0305 1464
RyoheiHagimoto 0:0e0631af0305 1465 a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
RyoheiHagimoto 0:0e0631af0305 1466 b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
RyoheiHagimoto 0:0e0631af0305 1467 }
RyoheiHagimoto 0:0e0631af0305 1468
RyoheiHagimoto 0:0e0631af0305 1469 inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
RyoheiHagimoto 0:0e0631af0305 1470 {
RyoheiHagimoto 0:0e0631af0305 1471 __m128i t0, t1;
RyoheiHagimoto 0:0e0631af0305 1472 t0 = _mm_unpacklo_epi16(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 1473 t1 = _mm_unpackhi_epi16(a.val, b.val);
RyoheiHagimoto 0:0e0631af0305 1474 _mm_storeu_si128((__m128i*)(ptr), t0);
RyoheiHagimoto 0:0e0631af0305 1475 _mm_storeu_si128((__m128i*)(ptr + 8), t1);
RyoheiHagimoto 0:0e0631af0305 1476 }
RyoheiHagimoto 0:0e0631af0305 1477
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    // Interleave three channel vectors and store 48 bytes as
    // a0 b0 c0 a1 b1 c1 ...  Channel c is paired with zeros (a stand-in for
    // a missing 4th channel) so the regular 4-channel unpack ladder can be
    // used; the padding zeros are then shifted out before the final stores.
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);  // a0 b0 a1 b1 ...
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);  // a8 b8 a9 b9 ...
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);       // c0 0 c1 0 ...
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);       // c8 0 c9 0 ...

    // Build 32-bit groups (a_i b_i c_i 0) ...
    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    // ... and regroup them into 64- and 128-bit blocks.
    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    // Realign with byte/bit shifts so the padding zeros line up at the
    // register edges where they can be discarded.
    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    // Splice adjacent registers across the padding gaps to produce three
    // densely packed 16-byte output registers.
    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}
RyoheiHagimoto 0:0e0631af0305 1523
RyoheiHagimoto 0:0e0631af0305 1524 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
RyoheiHagimoto 0:0e0631af0305 1525 const v_uint8x16& c, const v_uint8x16& d)
RyoheiHagimoto 0:0e0631af0305 1526 {
RyoheiHagimoto 0:0e0631af0305 1527 // a0 a1 a2 a3 ....
RyoheiHagimoto 0:0e0631af0305 1528 // b0 b1 b2 b3 ....
RyoheiHagimoto 0:0e0631af0305 1529 // c0 c1 c2 c3 ....
RyoheiHagimoto 0:0e0631af0305 1530 // d0 d1 d2 d3 ....
RyoheiHagimoto 0:0e0631af0305 1531 __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
RyoheiHagimoto 0:0e0631af0305 1532 __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
RyoheiHagimoto 0:0e0631af0305 1533 __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
RyoheiHagimoto 0:0e0631af0305 1534 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
RyoheiHagimoto 0:0e0631af0305 1535
RyoheiHagimoto 0:0e0631af0305 1536 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
RyoheiHagimoto 0:0e0631af0305 1537 __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
RyoheiHagimoto 0:0e0631af0305 1538 __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
RyoheiHagimoto 0:0e0631af0305 1539 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
RyoheiHagimoto 0:0e0631af0305 1540
RyoheiHagimoto 0:0e0631af0305 1541 _mm_storeu_si128((__m128i*)ptr, v0);
RyoheiHagimoto 0:0e0631af0305 1542 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
RyoheiHagimoto 0:0e0631af0305 1543 _mm_storeu_si128((__m128i*)(ptr + 32), v1);
RyoheiHagimoto 0:0e0631af0305 1544 _mm_storeu_si128((__m128i*)(ptr + 48), v3);
RyoheiHagimoto 0:0e0631af0305 1545 }
RyoheiHagimoto 0:0e0631af0305 1546
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    // Interleave three channel vectors and store 24 ushorts as
    // a0 b0 c0 a1 b1 c1 ...  Channel c is paired with zeros (a stand-in for
    // a missing 4th channel) so the regular 4-channel unpack ladder can be
    // used; the padding zeros are then shifted out before the final stores.
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);  // a0 b0 a1 b1 ...
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);  // a4 b4 a5 b5 ...
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);       // c0 0 c1 0 ...
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);       // c4 0 c5 0 ...

    // Build 64-bit groups (a_i b_i c_i 0) ...
    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    // ... and regroup them into 128-bit blocks.
    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    // Realign with byte shifts so the padding zeros line up at the
    // register edges where they can be discarded.
    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    // Splice adjacent registers across the padding gaps to produce three
    // densely packed 16-byte output registers.
    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}
RyoheiHagimoto 0:0e0631af0305 1583
RyoheiHagimoto 0:0e0631af0305 1584 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
RyoheiHagimoto 0:0e0631af0305 1585 const v_uint16x8& c, const v_uint16x8& d)
RyoheiHagimoto 0:0e0631af0305 1586 {
RyoheiHagimoto 0:0e0631af0305 1587 // a0 a1 a2 a3 ....
RyoheiHagimoto 0:0e0631af0305 1588 // b0 b1 b2 b3 ....
RyoheiHagimoto 0:0e0631af0305 1589 // c0 c1 c2 c3 ....
RyoheiHagimoto 0:0e0631af0305 1590 // d0 d1 d2 d3 ....
RyoheiHagimoto 0:0e0631af0305 1591 __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
RyoheiHagimoto 0:0e0631af0305 1592 __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
RyoheiHagimoto 0:0e0631af0305 1593 __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
RyoheiHagimoto 0:0e0631af0305 1594 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
RyoheiHagimoto 0:0e0631af0305 1595
RyoheiHagimoto 0:0e0631af0305 1596 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
RyoheiHagimoto 0:0e0631af0305 1597 __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
RyoheiHagimoto 0:0e0631af0305 1598 __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
RyoheiHagimoto 0:0e0631af0305 1599 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
RyoheiHagimoto 0:0e0631af0305 1600
RyoheiHagimoto 0:0e0631af0305 1601 _mm_storeu_si128((__m128i*)ptr, v0);
RyoheiHagimoto 0:0e0631af0305 1602 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
RyoheiHagimoto 0:0e0631af0305 1603 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
RyoheiHagimoto 0:0e0631af0305 1604 _mm_storeu_si128((__m128i*)(ptr + 24), v3);
RyoheiHagimoto 0:0e0631af0305 1605 }
RyoheiHagimoto 0:0e0631af0305 1606
RyoheiHagimoto 0:0e0631af0305 1607 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
RyoheiHagimoto 0:0e0631af0305 1608 const v_uint32x4& c )
RyoheiHagimoto 0:0e0631af0305 1609 {
RyoheiHagimoto 0:0e0631af0305 1610 v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
RyoheiHagimoto 0:0e0631af0305 1611 v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
RyoheiHagimoto 0:0e0631af0305 1612
RyoheiHagimoto 0:0e0631af0305 1613 __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
RyoheiHagimoto 0:0e0631af0305 1614 __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
RyoheiHagimoto 0:0e0631af0305 1615 __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
RyoheiHagimoto 0:0e0631af0305 1616
RyoheiHagimoto 0:0e0631af0305 1617 _mm_storeu_si128((__m128i*)ptr, v0);
RyoheiHagimoto 0:0e0631af0305 1618 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
RyoheiHagimoto 0:0e0631af0305 1619 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
RyoheiHagimoto 0:0e0631af0305 1620 }
RyoheiHagimoto 0:0e0631af0305 1621
RyoheiHagimoto 0:0e0631af0305 1622 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
RyoheiHagimoto 0:0e0631af0305 1623 const v_uint32x4& c, const v_uint32x4& d)
RyoheiHagimoto 0:0e0631af0305 1624 {
RyoheiHagimoto 0:0e0631af0305 1625 v_uint32x4 t0, t1, t2, t3;
RyoheiHagimoto 0:0e0631af0305 1626 v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
RyoheiHagimoto 0:0e0631af0305 1627 v_store(ptr, t0);
RyoheiHagimoto 0:0e0631af0305 1628 v_store(ptr + 4, t1);
RyoheiHagimoto 0:0e0631af0305 1629 v_store(ptr + 8, t2);
RyoheiHagimoto 0:0e0631af0305 1630 v_store(ptr + 12, t3);
RyoheiHagimoto 0:0e0631af0305 1631 }
RyoheiHagimoto 0:0e0631af0305 1632
RyoheiHagimoto 0:0e0631af0305 1633 // 2-channel, float only
RyoheiHagimoto 0:0e0631af0305 1634 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
RyoheiHagimoto 0:0e0631af0305 1635 {
RyoheiHagimoto 0:0e0631af0305 1636 // a0 a1 a2 a3 ...
RyoheiHagimoto 0:0e0631af0305 1637 // b0 b1 b2 b3 ...
RyoheiHagimoto 0:0e0631af0305 1638 __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
RyoheiHagimoto 0:0e0631af0305 1639 __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
RyoheiHagimoto 0:0e0631af0305 1640
RyoheiHagimoto 0:0e0631af0305 1641 _mm_storeu_ps(ptr, u0);
RyoheiHagimoto 0:0e0631af0305 1642 _mm_storeu_ps((ptr + 4), u1);
RyoheiHagimoto 0:0e0631af0305 1643 }
RyoheiHagimoto 0:0e0631af0305 1644
// Implement the signed / float flavors of 3- and 4-channel
// v_load_deinterleave / v_store_interleave by reinterpreting the data as
// the matching unsigned vector type and delegating to the unsigned
// implementations above. (Comments cannot appear inside the continued
// macro body, hence this header note.)
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

// Instantiations: signed integer types map to their unsigned counterparts;
// float32 shares the unsigned-32 implementation (same bit width).
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
RyoheiHagimoto 0:0e0631af0305 1687
RyoheiHagimoto 0:0e0631af0305 1688 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
RyoheiHagimoto 0:0e0631af0305 1689 {
RyoheiHagimoto 0:0e0631af0305 1690 return v_float32x4(_mm_cvtepi32_ps(a.val));
RyoheiHagimoto 0:0e0631af0305 1691 }
RyoheiHagimoto 0:0e0631af0305 1692
RyoheiHagimoto 0:0e0631af0305 1693 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
RyoheiHagimoto 0:0e0631af0305 1694 {
RyoheiHagimoto 0:0e0631af0305 1695 return v_float32x4(_mm_cvtpd_ps(a.val));
RyoheiHagimoto 0:0e0631af0305 1696 }
RyoheiHagimoto 0:0e0631af0305 1697
RyoheiHagimoto 0:0e0631af0305 1698 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
RyoheiHagimoto 0:0e0631af0305 1699 {
RyoheiHagimoto 0:0e0631af0305 1700 return v_float64x2(_mm_cvtepi32_pd(a.val));
RyoheiHagimoto 0:0e0631af0305 1701 }
RyoheiHagimoto 0:0e0631af0305 1702
RyoheiHagimoto 0:0e0631af0305 1703 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
RyoheiHagimoto 0:0e0631af0305 1704 {
RyoheiHagimoto 0:0e0631af0305 1705 return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
RyoheiHagimoto 0:0e0631af0305 1706 }
RyoheiHagimoto 0:0e0631af0305 1707
RyoheiHagimoto 0:0e0631af0305 1708 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
RyoheiHagimoto 0:0e0631af0305 1709 {
RyoheiHagimoto 0:0e0631af0305 1710 return v_float64x2(_mm_cvtps_pd(a.val));
RyoheiHagimoto 0:0e0631af0305 1711 }
RyoheiHagimoto 0:0e0631af0305 1712
RyoheiHagimoto 0:0e0631af0305 1713 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
RyoheiHagimoto 0:0e0631af0305 1714 {
RyoheiHagimoto 0:0e0631af0305 1715 return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
RyoheiHagimoto 0:0e0631af0305 1716 }
RyoheiHagimoto 0:0e0631af0305 1717
#if defined(HAVE_FP16)
// Half-precision conversions; only compiled when FP16 (F16C) support is
// available.
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    // Expand four fp16 values to fp32.
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    // Pack four fp32 values to fp16; imm8 = 0 selects round-to-nearest-even.
    return v_float16x4(_mm_cvtps_ph(a.val, 0));
}
#endif
RyoheiHagimoto 0:0e0631af0305 1729
RyoheiHagimoto 0:0e0631af0305 1730 //! @name Check SIMD support
RyoheiHagimoto 0:0e0631af0305 1731 //! @{
RyoheiHagimoto 0:0e0631af0305 1732 //! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    // SSE2 is the baseline instruction set for this 128-bit backend.
    return checkHardwareSupport(CV_CPU_SSE2);
}
RyoheiHagimoto 0:0e0631af0305 1737
RyoheiHagimoto 0:0e0631af0305 1738 //! @}
RyoheiHagimoto 0:0e0631af0305 1739
RyoheiHagimoto 0:0e0631af0305 1740 //! @endcond
RyoheiHagimoto 0:0e0631af0305 1741
RyoheiHagimoto 0:0e0631af0305 1742 }
RyoheiHagimoto 0:0e0631af0305 1743
RyoheiHagimoto 0:0e0631af0305 1744 #endif