OpenCV library for Renesas RZ/A
Dependents: RZ_A2M_Mbed_samples
include/opencv2/core/hal/intrin_sse.hpp@0:0e0631af0305, 2021-01-29 (annotated)
- Committer: RyoheiHagimoto
- Date: Fri Jan 29 04:53:38 2021 +0000
- Revision: 0:0e0631af0305
Copied from https://github.com/d-kato/opencv-lib.
Who changed what in which revision?
| User | Revision | Line number | New contents of line |
|---|---|---|---|
| RyoheiHagimoto | 0:0e0631af0305 | 1 | /*M/////////////////////////////////////////////////////////////////////////////////////// |
| RyoheiHagimoto | 0:0e0631af0305 | 2 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 3 | // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
| RyoheiHagimoto | 0:0e0631af0305 | 4 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 5 | // By downloading, copying, installing or using the software you agree to this license. |
| RyoheiHagimoto | 0:0e0631af0305 | 6 | // If you do not agree to this license, do not download, install, |
| RyoheiHagimoto | 0:0e0631af0305 | 7 | // copy or use the software. |
| RyoheiHagimoto | 0:0e0631af0305 | 8 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 9 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 10 | // License Agreement |
| RyoheiHagimoto | 0:0e0631af0305 | 11 | // For Open Source Computer Vision Library |
| RyoheiHagimoto | 0:0e0631af0305 | 12 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 13 | // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
| RyoheiHagimoto | 0:0e0631af0305 | 14 | // Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
| RyoheiHagimoto | 0:0e0631af0305 | 15 | // Copyright (C) 2013, OpenCV Foundation, all rights reserved. |
| RyoheiHagimoto | 0:0e0631af0305 | 16 | // Copyright (C) 2015, Itseez Inc., all rights reserved. |
| RyoheiHagimoto | 0:0e0631af0305 | 17 | // Third party copyrights are property of their respective owners. |
| RyoheiHagimoto | 0:0e0631af0305 | 18 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 19 | // Redistribution and use in source and binary forms, with or without modification, |
| RyoheiHagimoto | 0:0e0631af0305 | 20 | // are permitted provided that the following conditions are met: |
| RyoheiHagimoto | 0:0e0631af0305 | 21 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 22 | // * Redistribution's of source code must retain the above copyright notice, |
| RyoheiHagimoto | 0:0e0631af0305 | 23 | // this list of conditions and the following disclaimer. |
| RyoheiHagimoto | 0:0e0631af0305 | 24 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 25 | // * Redistribution's in binary form must reproduce the above copyright notice, |
| RyoheiHagimoto | 0:0e0631af0305 | 26 | // this list of conditions and the following disclaimer in the documentation |
| RyoheiHagimoto | 0:0e0631af0305 | 27 | // and/or other materials provided with the distribution. |
| RyoheiHagimoto | 0:0e0631af0305 | 28 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 29 | // * The name of the copyright holders may not be used to endorse or promote products |
| RyoheiHagimoto | 0:0e0631af0305 | 30 | // derived from this software without specific prior written permission. |
| RyoheiHagimoto | 0:0e0631af0305 | 31 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 32 | // This software is provided by the copyright holders and contributors "as is" and |
| RyoheiHagimoto | 0:0e0631af0305 | 33 | // any express or implied warranties, including, but not limited to, the implied |
| RyoheiHagimoto | 0:0e0631af0305 | 34 | // warranties of merchantability and fitness for a particular purpose are disclaimed. |
| RyoheiHagimoto | 0:0e0631af0305 | 35 | // In no event shall the Intel Corporation or contributors be liable for any direct, |
| RyoheiHagimoto | 0:0e0631af0305 | 36 | // indirect, incidental, special, exemplary, or consequential damages |
| RyoheiHagimoto | 0:0e0631af0305 | 37 | // (including, but not limited to, procurement of substitute goods or services; |
| RyoheiHagimoto | 0:0e0631af0305 | 38 | // loss of use, data, or profits; or business interruption) however caused |
| RyoheiHagimoto | 0:0e0631af0305 | 39 | // and on any theory of liability, whether in contract, strict liability, |
| RyoheiHagimoto | 0:0e0631af0305 | 40 | // or tort (including negligence or otherwise) arising in any way out of |
| RyoheiHagimoto | 0:0e0631af0305 | 41 | // the use of this software, even if advised of the possibility of such damage. |
| RyoheiHagimoto | 0:0e0631af0305 | 42 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 43 | //M*/ |
| RyoheiHagimoto | 0:0e0631af0305 | 44 | |
| RyoheiHagimoto | 0:0e0631af0305 | 45 | #ifndef OPENCV_HAL_SSE_HPP |
| RyoheiHagimoto | 0:0e0631af0305 | 46 | #define OPENCV_HAL_SSE_HPP |
| RyoheiHagimoto | 0:0e0631af0305 | 47 | |
| RyoheiHagimoto | 0:0e0631af0305 | 48 | #include <algorithm> |
| RyoheiHagimoto | 0:0e0631af0305 | 49 | #include "opencv2/core/utility.hpp" |
| RyoheiHagimoto | 0:0e0631af0305 | 50 | |
| RyoheiHagimoto | 0:0e0631af0305 | 51 | #define CV_SIMD128 1 |
| RyoheiHagimoto | 0:0e0631af0305 | 52 | #define CV_SIMD128_64F 1 |
| RyoheiHagimoto | 0:0e0631af0305 | 53 | |
| RyoheiHagimoto | 0:0e0631af0305 | 54 | namespace cv |
| RyoheiHagimoto | 0:0e0631af0305 | 55 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 56 | |
| RyoheiHagimoto | 0:0e0631af0305 | 57 | //! @cond IGNORED |
| RyoheiHagimoto | 0:0e0631af0305 | 58 | |
| RyoheiHagimoto | 0:0e0631af0305 | 59 | struct v_uint8x16 |
| RyoheiHagimoto | 0:0e0631af0305 | 60 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 61 | typedef uchar lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 62 | enum { nlanes = 16 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 63 | |
| RyoheiHagimoto | 0:0e0631af0305 | 64 | v_uint8x16() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 65 | explicit v_uint8x16(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 66 | v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, |
| RyoheiHagimoto | 0:0e0631af0305 | 67 | uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) |
| RyoheiHagimoto | 0:0e0631af0305 | 68 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 69 | val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, |
| RyoheiHagimoto | 0:0e0631af0305 | 70 | (char)v4, (char)v5, (char)v6, (char)v7, |
| RyoheiHagimoto | 0:0e0631af0305 | 71 | (char)v8, (char)v9, (char)v10, (char)v11, |
| RyoheiHagimoto | 0:0e0631af0305 | 72 | (char)v12, (char)v13, (char)v14, (char)v15); |
| RyoheiHagimoto | 0:0e0631af0305 | 73 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 74 | uchar get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 75 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 76 | return (uchar)_mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 77 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 78 | |
| RyoheiHagimoto | 0:0e0631af0305 | 79 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 80 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 81 | |
| RyoheiHagimoto | 0:0e0631af0305 | 82 | struct v_int8x16 |
| RyoheiHagimoto | 0:0e0631af0305 | 83 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 84 | typedef schar lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 85 | enum { nlanes = 16 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 86 | |
| RyoheiHagimoto | 0:0e0631af0305 | 87 | v_int8x16() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 88 | explicit v_int8x16(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 89 | v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, |
| RyoheiHagimoto | 0:0e0631af0305 | 90 | schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) |
| RyoheiHagimoto | 0:0e0631af0305 | 91 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 92 | val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, |
| RyoheiHagimoto | 0:0e0631af0305 | 93 | (char)v4, (char)v5, (char)v6, (char)v7, |
| RyoheiHagimoto | 0:0e0631af0305 | 94 | (char)v8, (char)v9, (char)v10, (char)v11, |
| RyoheiHagimoto | 0:0e0631af0305 | 95 | (char)v12, (char)v13, (char)v14, (char)v15); |
| RyoheiHagimoto | 0:0e0631af0305 | 96 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 97 | schar get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 98 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 99 | return (schar)_mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 100 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 101 | |
| RyoheiHagimoto | 0:0e0631af0305 | 102 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 103 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 104 | |
| RyoheiHagimoto | 0:0e0631af0305 | 105 | struct v_uint16x8 |
| RyoheiHagimoto | 0:0e0631af0305 | 106 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 107 | typedef ushort lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 108 | enum { nlanes = 8 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 109 | |
| RyoheiHagimoto | 0:0e0631af0305 | 110 | v_uint16x8() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 111 | explicit v_uint16x8(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 112 | v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) |
| RyoheiHagimoto | 0:0e0631af0305 | 113 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 114 | val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, |
| RyoheiHagimoto | 0:0e0631af0305 | 115 | (short)v4, (short)v5, (short)v6, (short)v7); |
| RyoheiHagimoto | 0:0e0631af0305 | 116 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 117 | ushort get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 118 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 119 | return (ushort)_mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 120 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 121 | |
| RyoheiHagimoto | 0:0e0631af0305 | 122 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 123 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 124 | |
| RyoheiHagimoto | 0:0e0631af0305 | 125 | struct v_int16x8 |
| RyoheiHagimoto | 0:0e0631af0305 | 126 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 127 | typedef short lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 128 | enum { nlanes = 8 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 129 | |
| RyoheiHagimoto | 0:0e0631af0305 | 130 | v_int16x8() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 131 | explicit v_int16x8(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 132 | v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) |
| RyoheiHagimoto | 0:0e0631af0305 | 133 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 134 | val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, |
| RyoheiHagimoto | 0:0e0631af0305 | 135 | (short)v4, (short)v5, (short)v6, (short)v7); |
| RyoheiHagimoto | 0:0e0631af0305 | 136 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 137 | short get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 138 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 139 | return (short)_mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 140 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 141 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 142 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 143 | |
| RyoheiHagimoto | 0:0e0631af0305 | 144 | struct v_uint32x4 |
| RyoheiHagimoto | 0:0e0631af0305 | 145 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 146 | typedef unsigned lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 147 | enum { nlanes = 4 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 148 | |
| RyoheiHagimoto | 0:0e0631af0305 | 149 | v_uint32x4() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 150 | explicit v_uint32x4(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 151 | v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) |
| RyoheiHagimoto | 0:0e0631af0305 | 152 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 153 | val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); |
| RyoheiHagimoto | 0:0e0631af0305 | 154 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 155 | unsigned get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 156 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 157 | return (unsigned)_mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 158 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 159 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 160 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 161 | |
| RyoheiHagimoto | 0:0e0631af0305 | 162 | struct v_int32x4 |
| RyoheiHagimoto | 0:0e0631af0305 | 163 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 164 | typedef int lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 165 | enum { nlanes = 4 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 166 | |
| RyoheiHagimoto | 0:0e0631af0305 | 167 | v_int32x4() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 168 | explicit v_int32x4(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 169 | v_int32x4(int v0, int v1, int v2, int v3) |
| RyoheiHagimoto | 0:0e0631af0305 | 170 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 171 | val = _mm_setr_epi32(v0, v1, v2, v3); |
| RyoheiHagimoto | 0:0e0631af0305 | 172 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 173 | int get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 174 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 175 | return _mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 176 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 177 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 178 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 179 | |
| RyoheiHagimoto | 0:0e0631af0305 | 180 | struct v_float32x4 |
| RyoheiHagimoto | 0:0e0631af0305 | 181 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 182 | typedef float lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 183 | enum { nlanes = 4 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 184 | |
| RyoheiHagimoto | 0:0e0631af0305 | 185 | v_float32x4() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 186 | explicit v_float32x4(__m128 v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 187 | v_float32x4(float v0, float v1, float v2, float v3) |
| RyoheiHagimoto | 0:0e0631af0305 | 188 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 189 | val = _mm_setr_ps(v0, v1, v2, v3); |
| RyoheiHagimoto | 0:0e0631af0305 | 190 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 191 | float get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 192 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 193 | return _mm_cvtss_f32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 194 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 195 | __m128 val; |
| RyoheiHagimoto | 0:0e0631af0305 | 196 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 197 | |
| RyoheiHagimoto | 0:0e0631af0305 | 198 | struct v_uint64x2 |
| RyoheiHagimoto | 0:0e0631af0305 | 199 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 200 | typedef uint64 lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 201 | enum { nlanes = 2 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 202 | |
| RyoheiHagimoto | 0:0e0631af0305 | 203 | v_uint64x2() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 204 | explicit v_uint64x2(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 205 | v_uint64x2(uint64 v0, uint64 v1) |
| RyoheiHagimoto | 0:0e0631af0305 | 206 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 207 | val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 208 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 209 | uint64 get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 210 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 211 | int a = _mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 212 | int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 213 | return (unsigned)a | ((uint64)(unsigned)b << 32); |
| RyoheiHagimoto | 0:0e0631af0305 | 214 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 215 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 216 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 217 | |
| RyoheiHagimoto | 0:0e0631af0305 | 218 | struct v_int64x2 |
| RyoheiHagimoto | 0:0e0631af0305 | 219 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 220 | typedef int64 lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 221 | enum { nlanes = 2 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 222 | |
| RyoheiHagimoto | 0:0e0631af0305 | 223 | v_int64x2() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 224 | explicit v_int64x2(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 225 | v_int64x2(int64 v0, int64 v1) |
| RyoheiHagimoto | 0:0e0631af0305 | 226 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 227 | val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 228 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 229 | int64 get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 230 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 231 | int a = _mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 232 | int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 233 | return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 234 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 235 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 236 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 237 | |
| RyoheiHagimoto | 0:0e0631af0305 | 238 | struct v_float64x2 |
| RyoheiHagimoto | 0:0e0631af0305 | 239 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 240 | typedef double lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 241 | enum { nlanes = 2 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 242 | |
| RyoheiHagimoto | 0:0e0631af0305 | 243 | v_float64x2() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 244 | explicit v_float64x2(__m128d v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 245 | v_float64x2(double v0, double v1) |
| RyoheiHagimoto | 0:0e0631af0305 | 246 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 247 | val = _mm_setr_pd(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 248 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 249 | double get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 250 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 251 | return _mm_cvtsd_f64(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 252 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 253 | __m128d val; |
| RyoheiHagimoto | 0:0e0631af0305 | 254 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 255 | |
| RyoheiHagimoto | 0:0e0631af0305 | 256 | #if defined(HAVE_FP16) |
| RyoheiHagimoto | 0:0e0631af0305 | 257 | struct v_float16x4 |
| RyoheiHagimoto | 0:0e0631af0305 | 258 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 259 | typedef short lane_type; |
| RyoheiHagimoto | 0:0e0631af0305 | 260 | enum { nlanes = 4 }; |
| RyoheiHagimoto | 0:0e0631af0305 | 261 | |
| RyoheiHagimoto | 0:0e0631af0305 | 262 | v_float16x4() {} |
| RyoheiHagimoto | 0:0e0631af0305 | 263 | explicit v_float16x4(__m128i v) : val(v) {} |
| RyoheiHagimoto | 0:0e0631af0305 | 264 | v_float16x4(short v0, short v1, short v2, short v3) |
| RyoheiHagimoto | 0:0e0631af0305 | 265 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 266 | val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0); |
| RyoheiHagimoto | 0:0e0631af0305 | 267 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 268 | short get0() const |
| RyoheiHagimoto | 0:0e0631af0305 | 269 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 270 | return (short)_mm_cvtsi128_si32(val); |
| RyoheiHagimoto | 0:0e0631af0305 | 271 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 272 | __m128i val; |
| RyoheiHagimoto | 0:0e0631af0305 | 273 | }; |
| RyoheiHagimoto | 0:0e0631af0305 | 274 | #endif |
| RyoheiHagimoto | 0:0e0631af0305 | 275 | |
| RyoheiHagimoto | 0:0e0631af0305 | 276 | #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 277 | inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 278 | inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 279 | template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 280 | { return _Tpvec(cast(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 281 | |
| RyoheiHagimoto | 0:0e0631af0305 | 282 | OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 283 | OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 284 | OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 285 | OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 286 | OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 287 | OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 288 | OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 289 | OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 290 | |
| RyoheiHagimoto | 0:0e0631af0305 | 291 | inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); } |
| RyoheiHagimoto | 0:0e0631af0305 | 292 | inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); } |
| RyoheiHagimoto | 0:0e0631af0305 | 293 | inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } |
| RyoheiHagimoto | 0:0e0631af0305 | 294 | inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); } |
| RyoheiHagimoto | 0:0e0631af0305 | 295 | |
| RyoheiHagimoto | 0:0e0631af0305 | 296 | template<typename _Tpvec> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 297 | v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } |
| RyoheiHagimoto | 0:0e0631af0305 | 298 | template<typename _Tpvec> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 299 | v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); } |
| RyoheiHagimoto | 0:0e0631af0305 | 300 | inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 301 | { return v_float32x4(_mm_castsi128_ps(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 302 | inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 303 | { return v_float32x4(_mm_castsi128_ps(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 304 | inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 305 | { return v_float64x2(_mm_castsi128_pd(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 306 | inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 307 | { return v_float64x2(_mm_castsi128_pd(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 308 | |
| RyoheiHagimoto | 0:0e0631af0305 | 309 | #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 310 | inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 311 | { return _Tpvec(_mm_castps_si128(a.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 312 | inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 313 | { return _Tpvec(_mm_castpd_si128(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 314 | |
| RyoheiHagimoto | 0:0e0631af0305 | 315 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8) |
| RyoheiHagimoto | 0:0e0631af0305 | 316 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8) |
| RyoheiHagimoto | 0:0e0631af0305 | 317 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16) |
| RyoheiHagimoto | 0:0e0631af0305 | 318 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16) |
| RyoheiHagimoto | 0:0e0631af0305 | 319 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32) |
| RyoheiHagimoto | 0:0e0631af0305 | 320 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32) |
| RyoheiHagimoto | 0:0e0631af0305 | 321 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64) |
| RyoheiHagimoto | 0:0e0631af0305 | 322 | OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64) |
| RyoheiHagimoto | 0:0e0631af0305 | 323 | |
| RyoheiHagimoto | 0:0e0631af0305 | 324 | inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; } |
| RyoheiHagimoto | 0:0e0631af0305 | 325 | inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; } |
| RyoheiHagimoto | 0:0e0631af0305 | 326 | inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 327 | inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 328 | |
| RyoheiHagimoto | 0:0e0631af0305 | 329 | //////////////// PACK /////////////// |
| RyoheiHagimoto | 0:0e0631af0305 | 330 | inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 331 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 332 | __m128i delta = _mm_set1_epi16(255); |
| RyoheiHagimoto | 0:0e0631af0305 | 333 | return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)), |
| RyoheiHagimoto | 0:0e0631af0305 | 334 | _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta)))); |
| RyoheiHagimoto | 0:0e0631af0305 | 335 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 336 | |
| RyoheiHagimoto | 0:0e0631af0305 | 337 | inline void v_pack_store(uchar* ptr, const v_uint16x8& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 338 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 339 | __m128i delta = _mm_set1_epi16(255); |
| RyoheiHagimoto | 0:0e0631af0305 | 340 | __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)); |
| RyoheiHagimoto | 0:0e0631af0305 | 341 | _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 342 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 343 | |
| RyoheiHagimoto | 0:0e0631af0305 | 344 | inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 345 | { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 346 | |
| RyoheiHagimoto | 0:0e0631af0305 | 347 | inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 348 | { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 349 | |
| RyoheiHagimoto | 0:0e0631af0305 | 350 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 351 | v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 352 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 353 | // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. |
| RyoheiHagimoto | 0:0e0631af0305 | 354 | __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 355 | return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n), |
| RyoheiHagimoto | 0:0e0631af0305 | 356 | _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n))); |
| RyoheiHagimoto | 0:0e0631af0305 | 357 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 358 | |
| RyoheiHagimoto | 0:0e0631af0305 | 359 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 360 | void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 361 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 362 | __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 363 | __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 364 | _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 365 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 366 | |
| RyoheiHagimoto | 0:0e0631af0305 | 367 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 368 | v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 369 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 370 | __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 371 | return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), |
| RyoheiHagimoto | 0:0e0631af0305 | 372 | _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); |
| RyoheiHagimoto | 0:0e0631af0305 | 373 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 374 | |
| RyoheiHagimoto | 0:0e0631af0305 | 375 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 376 | void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 377 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 378 | __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 379 | __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 380 | _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 381 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 382 | |
| RyoheiHagimoto | 0:0e0631af0305 | 383 | inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 384 | { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 385 | |
| RyoheiHagimoto | 0:0e0631af0305 | 386 | inline void v_pack_store(schar* ptr, const v_int16x8& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 387 | { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 388 | |
| RyoheiHagimoto | 0:0e0631af0305 | 389 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 390 | v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 391 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 392 | // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. |
| RyoheiHagimoto | 0:0e0631af0305 | 393 | __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 394 | return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), |
| RyoheiHagimoto | 0:0e0631af0305 | 395 | _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); |
| RyoheiHagimoto | 0:0e0631af0305 | 396 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 397 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 398 | void v_rshr_pack_store(schar* ptr, const v_int16x8& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 399 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 400 | // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. |
| RyoheiHagimoto | 0:0e0631af0305 | 401 | __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 402 | __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 403 | _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 404 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 405 | |
| RyoheiHagimoto | 0:0e0631af0305 | 406 | |
| RyoheiHagimoto | 0:0e0631af0305 | 407 | // bit-wise "mask ? a : b" |
| RyoheiHagimoto | 0:0e0631af0305 | 408 | inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) |
| RyoheiHagimoto | 0:0e0631af0305 | 409 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 410 | return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); |
| RyoheiHagimoto | 0:0e0631af0305 | 411 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 412 | |
| RyoheiHagimoto | 0:0e0631af0305 | 413 | inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 414 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 415 | __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 416 | __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 417 | __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 418 | __m128i r = _mm_packs_epi32(a1, b1); |
| RyoheiHagimoto | 0:0e0631af0305 | 419 | return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); |
| RyoheiHagimoto | 0:0e0631af0305 | 420 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 421 | |
| RyoheiHagimoto | 0:0e0631af0305 | 422 | inline void v_pack_store(ushort* ptr, const v_uint32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 423 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 424 | __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 425 | __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 426 | __m128i r = _mm_packs_epi32(a1, a1); |
| RyoheiHagimoto | 0:0e0631af0305 | 427 | _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768))); |
| RyoheiHagimoto | 0:0e0631af0305 | 428 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 429 | |
| RyoheiHagimoto | 0:0e0631af0305 | 430 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 431 | v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 432 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 433 | __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 434 | __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 435 | __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 436 | return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); |
| RyoheiHagimoto | 0:0e0631af0305 | 437 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 438 | |
| RyoheiHagimoto | 0:0e0631af0305 | 439 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 440 | void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 441 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 442 | __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 443 | __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 444 | __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); |
| RyoheiHagimoto | 0:0e0631af0305 | 445 | _mm_storel_epi64((__m128i*)ptr, a2); |
| RyoheiHagimoto | 0:0e0631af0305 | 446 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 447 | |
| RyoheiHagimoto | 0:0e0631af0305 | 448 | inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 449 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 450 | __m128i delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 451 | __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 452 | return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); |
| RyoheiHagimoto | 0:0e0631af0305 | 453 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 454 | |
| RyoheiHagimoto | 0:0e0631af0305 | 455 | inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 456 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 457 | __m128i delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 458 | __m128i a1 = _mm_sub_epi32(a.val, delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 459 | __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); |
| RyoheiHagimoto | 0:0e0631af0305 | 460 | _mm_storel_epi64((__m128i*)ptr, r); |
| RyoheiHagimoto | 0:0e0631af0305 | 461 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 462 | |
| RyoheiHagimoto | 0:0e0631af0305 | 463 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 464 | v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 465 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 466 | __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 467 | __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 468 | __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); |
| RyoheiHagimoto | 0:0e0631af0305 | 469 | __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 470 | __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768)); |
| RyoheiHagimoto | 0:0e0631af0305 | 471 | return v_uint16x8(_mm_unpacklo_epi64(a2, b2)); |
| RyoheiHagimoto | 0:0e0631af0305 | 472 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 473 | |
| RyoheiHagimoto | 0:0e0631af0305 | 474 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 475 | void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 476 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 477 | __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); |
| RyoheiHagimoto | 0:0e0631af0305 | 478 | __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); |
| RyoheiHagimoto | 0:0e0631af0305 | 479 | __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); |
| RyoheiHagimoto | 0:0e0631af0305 | 480 | _mm_storel_epi64((__m128i*)ptr, a2); |
| RyoheiHagimoto | 0:0e0631af0305 | 481 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 482 | |
| RyoheiHagimoto | 0:0e0631af0305 | 483 | inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 484 | { return v_int16x8(_mm_packs_epi32(a.val, b.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 485 | |
| RyoheiHagimoto | 0:0e0631af0305 | 486 | inline void v_pack_store(short* ptr, const v_int32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 487 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 488 | _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 489 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 490 | |
| RyoheiHagimoto | 0:0e0631af0305 | 491 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 492 | v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 493 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 494 | __m128i delta = _mm_set1_epi32(1 << (n-1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 495 | return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), |
| RyoheiHagimoto | 0:0e0631af0305 | 496 | _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); |
| RyoheiHagimoto | 0:0e0631af0305 | 497 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 498 | |
| RyoheiHagimoto | 0:0e0631af0305 | 499 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 500 | void v_rshr_pack_store(short* ptr, const v_int32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 501 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 502 | __m128i delta = _mm_set1_epi32(1 << (n-1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 503 | __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 504 | _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 505 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 506 | |
| RyoheiHagimoto | 0:0e0631af0305 | 507 | |
| RyoheiHagimoto | 0:0e0631af0305 | 508 | // pack by truncation: keep the low 32 bits of each 64-bit lane (high halves assumed zero) |
| RyoheiHagimoto | 0:0e0631af0305 | 509 | inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 510 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 511 | __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 512 | __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1 b1 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 513 | return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 514 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 515 | |
| RyoheiHagimoto | 0:0e0631af0305 | 516 | inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 517 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 518 | __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); |
| RyoheiHagimoto | 0:0e0631af0305 | 519 | _mm_storel_epi64((__m128i*)ptr, a1); |
| RyoheiHagimoto | 0:0e0631af0305 | 520 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 521 | |
| RyoheiHagimoto | 0:0e0631af0305 | 522 | // pack by truncation: keep the low 32 bits of each 64-bit lane (high halves assumed zero) |
| RyoheiHagimoto | 0:0e0631af0305 | 523 | inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 524 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 525 | __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 526 | __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1 b1 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 527 | return v_int32x4(_mm_unpacklo_epi32(v0, v1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 528 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 529 | |
| RyoheiHagimoto | 0:0e0631af0305 | 530 | inline void v_pack_store(int* ptr, const v_int64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 531 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 532 | __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); |
| RyoheiHagimoto | 0:0e0631af0305 | 533 | _mm_storel_epi64((__m128i*)ptr, a1); |
| RyoheiHagimoto | 0:0e0631af0305 | 534 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 535 | |
| RyoheiHagimoto | 0:0e0631af0305 | 536 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 537 | v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 538 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 539 | uint64 delta = (uint64)1 << (n-1); |
| RyoheiHagimoto | 0:0e0631af0305 | 540 | v_uint64x2 delta2(delta, delta); |
| RyoheiHagimoto | 0:0e0631af0305 | 541 | __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 542 | __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 543 | __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 b0 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 544 | __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1 b1 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 545 | return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 546 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 547 | |
| RyoheiHagimoto | 0:0e0631af0305 | 548 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 549 | void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 550 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 551 | uint64 delta = (uint64)1 << (n-1); |
| RyoheiHagimoto | 0:0e0631af0305 | 552 | v_uint64x2 delta2(delta, delta); |
| RyoheiHagimoto | 0:0e0631af0305 | 553 | __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 554 | __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); |
| RyoheiHagimoto | 0:0e0631af0305 | 555 | _mm_storel_epi64((__m128i*)ptr, a2); |
| RyoheiHagimoto | 0:0e0631af0305 | 556 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 557 | |
| RyoheiHagimoto | 0:0e0631af0305 | 558 | inline __m128i v_sign_epi64(__m128i a) |
| RyoheiHagimoto | 0:0e0631af0305 | 559 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 560 | return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 \| x m1 |
| RyoheiHagimoto | 0:0e0631af0305 | 561 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 562 | |
| RyoheiHagimoto | 0:0e0631af0305 | 563 | inline __m128i v_srai_epi64(__m128i a, int imm) |
| RyoheiHagimoto | 0:0e0631af0305 | 564 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 565 | __m128i smask = v_sign_epi64(a); |
| RyoheiHagimoto | 0:0e0631af0305 | 566 | return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask); |
| RyoheiHagimoto | 0:0e0631af0305 | 567 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 568 | |
| RyoheiHagimoto | 0:0e0631af0305 | 569 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 570 | v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 571 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 572 | int64 delta = (int64)1 << (n-1); |
| RyoheiHagimoto | 0:0e0631af0305 | 573 | v_int64x2 delta2(delta, delta); |
| RyoheiHagimoto | 0:0e0631af0305 | 574 | __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 575 | __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 576 | __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 b0 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 577 | __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1 b1 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 578 | return v_int32x4(_mm_unpacklo_epi32(v0, v1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 579 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 580 | |
| RyoheiHagimoto | 0:0e0631af0305 | 581 | template<int n> inline |
| RyoheiHagimoto | 0:0e0631af0305 | 582 | void v_rshr_pack_store(int* ptr, const v_int64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 583 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 584 | int64 delta = (int64)1 << (n-1); |
| RyoheiHagimoto | 0:0e0631af0305 | 585 | v_int64x2 delta2(delta, delta); |
| RyoheiHagimoto | 0:0e0631af0305 | 586 | __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); |
| RyoheiHagimoto | 0:0e0631af0305 | 587 | __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); |
| RyoheiHagimoto | 0:0e0631af0305 | 588 | _mm_storel_epi64((__m128i*)ptr, a2); |
| RyoheiHagimoto | 0:0e0631af0305 | 589 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 590 | |
| RyoheiHagimoto | 0:0e0631af0305 | 591 | inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, |
| RyoheiHagimoto | 0:0e0631af0305 | 592 | const v_float32x4& m1, const v_float32x4& m2, |
| RyoheiHagimoto | 0:0e0631af0305 | 593 | const v_float32x4& m3) |
| RyoheiHagimoto | 0:0e0631af0305 | 594 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 595 | __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 596 | __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 597 | __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 598 | __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 599 | |
| RyoheiHagimoto | 0:0e0631af0305 | 600 | return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3))); |
| RyoheiHagimoto | 0:0e0631af0305 | 601 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 602 | |
| RyoheiHagimoto | 0:0e0631af0305 | 603 | |
| RyoheiHagimoto | 0:0e0631af0305 | 604 | #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 605 | inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 606 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 607 | return _Tpvec(intrin(a.val, b.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 608 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 609 | inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 610 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 611 | a.val = intrin(a.val, b.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 612 | return a; \ |
| RyoheiHagimoto | 0:0e0631af0305 | 613 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 614 | |
| RyoheiHagimoto | 0:0e0631af0305 | 615 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) |
| RyoheiHagimoto | 0:0e0631af0305 | 616 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) |
| RyoheiHagimoto | 0:0e0631af0305 | 617 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) |
| RyoheiHagimoto | 0:0e0631af0305 | 618 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) |
| RyoheiHagimoto | 0:0e0631af0305 | 619 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) |
| RyoheiHagimoto | 0:0e0631af0305 | 620 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) |
| RyoheiHagimoto | 0:0e0631af0305 | 621 | OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 622 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 623 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 624 | OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 625 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) |
| RyoheiHagimoto | 0:0e0631af0305 | 626 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) |
| RyoheiHagimoto | 0:0e0631af0305 | 627 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) |
| RyoheiHagimoto | 0:0e0631af0305 | 628 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) |
| RyoheiHagimoto | 0:0e0631af0305 | 629 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 630 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 631 | OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 632 | OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 633 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 634 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 635 | OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 636 | OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 637 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) |
| RyoheiHagimoto | 0:0e0631af0305 | 638 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) |
| RyoheiHagimoto | 0:0e0631af0305 | 639 | OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) |
| RyoheiHagimoto | 0:0e0631af0305 | 640 | OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) |
| RyoheiHagimoto | 0:0e0631af0305 | 641 | |
| RyoheiHagimoto | 0:0e0631af0305 | 642 | inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 643 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 644 | __m128i c0 = _mm_mul_epu32(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 645 | __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 646 | __m128i d0 = _mm_unpacklo_epi32(c0, c1); |
| RyoheiHagimoto | 0:0e0631af0305 | 647 | __m128i d1 = _mm_unpackhi_epi32(c0, c1); |
| RyoheiHagimoto | 0:0e0631af0305 | 648 | return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 649 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 650 | inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 651 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 652 | __m128i c0 = _mm_mul_epu32(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 653 | __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 654 | __m128i d0 = _mm_unpacklo_epi32(c0, c1); |
| RyoheiHagimoto | 0:0e0631af0305 | 655 | __m128i d1 = _mm_unpackhi_epi32(c0, c1); |
| RyoheiHagimoto | 0:0e0631af0305 | 656 | return v_int32x4(_mm_unpacklo_epi64(d0, d1)); |
| RyoheiHagimoto | 0:0e0631af0305 | 657 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 658 | inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 659 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 660 | a = a * b; |
| RyoheiHagimoto | 0:0e0631af0305 | 661 | return a; |
| RyoheiHagimoto | 0:0e0631af0305 | 662 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 663 | inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 664 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 665 | a = a * b; |
| RyoheiHagimoto | 0:0e0631af0305 | 666 | return a; |
| RyoheiHagimoto | 0:0e0631af0305 | 667 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 668 | |
| RyoheiHagimoto | 0:0e0631af0305 | 669 | inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, |
| RyoheiHagimoto | 0:0e0631af0305 | 670 | v_int32x4& c, v_int32x4& d) |
| RyoheiHagimoto | 0:0e0631af0305 | 671 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 672 | __m128i v0 = _mm_mullo_epi16(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 673 | __m128i v1 = _mm_mulhi_epi16(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 674 | c.val = _mm_unpacklo_epi16(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 675 | d.val = _mm_unpackhi_epi16(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 676 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 677 | |
| RyoheiHagimoto | 0:0e0631af0305 | 678 | inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, |
| RyoheiHagimoto | 0:0e0631af0305 | 679 | v_uint32x4& c, v_uint32x4& d) |
| RyoheiHagimoto | 0:0e0631af0305 | 680 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 681 | __m128i v0 = _mm_mullo_epi16(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 682 | __m128i v1 = _mm_mulhi_epu16(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 683 | c.val = _mm_unpacklo_epi16(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 684 | d.val = _mm_unpackhi_epi16(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 685 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 686 | |
| RyoheiHagimoto | 0:0e0631af0305 | 687 | inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, |
| RyoheiHagimoto | 0:0e0631af0305 | 688 | v_uint64x2& c, v_uint64x2& d) |
| RyoheiHagimoto | 0:0e0631af0305 | 689 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 690 | __m128i c0 = _mm_mul_epu32(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 691 | __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 692 | c.val = _mm_unpacklo_epi64(c0, c1); |
| RyoheiHagimoto | 0:0e0631af0305 | 693 | d.val = _mm_unpackhi_epi64(c0, c1); |
| RyoheiHagimoto | 0:0e0631af0305 | 694 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 695 | |
| RyoheiHagimoto | 0:0e0631af0305 | 696 | inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 697 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 698 | return v_int32x4(_mm_madd_epi16(a.val, b.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 699 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 700 | |
| RyoheiHagimoto | 0:0e0631af0305 | 701 | #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 702 | OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 703 | OPENCV_HAL_IMPL_SSE_BIN_OP(\|, _Tpvec, _mm_or_##suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 704 | OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 705 | inline _Tpvec operator ~ (const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 706 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 707 | return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 708 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 709 | |
| RyoheiHagimoto | 0:0e0631af0305 | 710 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 711 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 712 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 713 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 714 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 715 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 716 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 717 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1)) |
| RyoheiHagimoto | 0:0e0631af0305 | 718 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) |
| RyoheiHagimoto | 0:0e0631af0305 | 719 | OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) |
| RyoheiHagimoto | 0:0e0631af0305 | 720 | |
| RyoheiHagimoto | 0:0e0631af0305 | 721 | inline v_float32x4 v_sqrt(const v_float32x4& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 722 | { return v_float32x4(_mm_sqrt_ps(x.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 723 | |
| RyoheiHagimoto | 0:0e0631af0305 | 724 | inline v_float32x4 v_invsqrt(const v_float32x4& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 725 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 726 | static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); |
| RyoheiHagimoto | 0:0e0631af0305 | 727 | __m128 t = x.val; |
| RyoheiHagimoto | 0:0e0631af0305 | 728 | __m128 h = _mm_mul_ps(t, _0_5); |
| RyoheiHagimoto | 0:0e0631af0305 | 729 | t = _mm_rsqrt_ps(t); |
| RyoheiHagimoto | 0:0e0631af0305 | 730 | t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); |
| RyoheiHagimoto | 0:0e0631af0305 | 731 | return v_float32x4(t); |
| RyoheiHagimoto | 0:0e0631af0305 | 732 | } |
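
The single-precision `v_invsqrt` refines the roughly 12-bit `_mm_rsqrt_ps` estimate with one Newton–Raphson step, y₁ = y₀·(1.5 − 0.5·x·y₀²), which approximately doubles the number of correct bits; the double-precision version below has no rsqrt estimate to start from and falls back to a true divide and sqrt. A scalar view of the refinement:

```cpp
// One Newton-Raphson step for 1/sqrt(x), starting from a rough estimate y0
// (as _mm_rsqrt_ps provides): y1 = y0 * (1.5 - 0.5 * x * y0 * y0).
float invsqrt_step(float x, float y0)
{
    float h = 0.5f * x;
    return y0 * (1.5f - h * y0 * y0);
}
```
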
| RyoheiHagimoto | 0:0e0631af0305 | 733 | |
| RyoheiHagimoto | 0:0e0631af0305 | 734 | inline v_float64x2 v_sqrt(const v_float64x2& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 735 | { return v_float64x2(_mm_sqrt_pd(x.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 736 | |
| RyoheiHagimoto | 0:0e0631af0305 | 737 | inline v_float64x2 v_invsqrt(const v_float64x2& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 738 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 739 | static const __m128d v_1 = _mm_set1_pd(1.); |
| RyoheiHagimoto | 0:0e0631af0305 | 740 | return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); |
| RyoheiHagimoto | 0:0e0631af0305 | 741 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 742 | |
| RyoheiHagimoto | 0:0e0631af0305 | 743 | #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 744 | inline _Tpuvec v_abs(const _Tpsvec& x) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 745 | { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); } |
| RyoheiHagimoto | 0:0e0631af0305 | 746 | |
| RyoheiHagimoto | 0:0e0631af0305 | 747 | OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8) |
| RyoheiHagimoto | 0:0e0631af0305 | 748 | OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16) |
| RyoheiHagimoto | 0:0e0631af0305 | 749 | inline v_uint32x4 v_abs(const v_int32x4& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 750 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 751 | __m128i s = _mm_srli_epi32(x.val, 31); |
| RyoheiHagimoto | 0:0e0631af0305 | 752 | __m128i f = _mm_srai_epi32(x.val, 31); |
| RyoheiHagimoto | 0:0e0631af0305 | 753 | return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s)); |
| RyoheiHagimoto | 0:0e0631af0305 | 754 | } |
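
The 32-bit integer `v_abs` is the classic branch-free absolute value: `f` is 0 or all-ones (arithmetic shift of the sign bit), `s` is 0 or 1 (logical shift), and `(x ^ f) + s` equals `~x + 1 = -x` exactly when `x` is negative. A scalar equivalent, with the arithmetic done in unsigned to keep the wrap well defined:

```cpp
// Branch-free |x| as used above; note INT_MIN maps to 0x80000000, which is
// the correct magnitude in the unsigned result type.
unsigned abs_ref(int x)
{
    unsigned f = (unsigned)(x >> 31); // 0 or 0xFFFFFFFF (arithmetic shift, as _mm_srai_epi32)
    unsigned s = (unsigned)x >> 31;   // 0 or 1           (logical shift,   as _mm_srli_epi32)
    return ((unsigned)x ^ f) + s;     // == -x for negative x, x otherwise
}
```
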
| RyoheiHagimoto | 0:0e0631af0305 | 755 | inline v_float32x4 v_abs(const v_float32x4& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 756 | { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } |
| RyoheiHagimoto | 0:0e0631af0305 | 757 | inline v_float64x2 v_abs(const v_float64x2& x) |
| RyoheiHagimoto | 0:0e0631af0305 | 758 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 759 | return v_float64x2(_mm_and_pd(x.val, |
| RyoheiHagimoto | 0:0e0631af0305 | 760 | _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); |
| RyoheiHagimoto | 0:0e0631af0305 | 761 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 762 | |
| RyoheiHagimoto | 0:0e0631af0305 | 763 | // TODO: exp, log, sin, cos |
| RyoheiHagimoto | 0:0e0631af0305 | 764 | |
| RyoheiHagimoto | 0:0e0631af0305 | 765 | #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 766 | inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 767 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 768 | return _Tpvec(intrin(a.val, b.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 769 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 770 | |
| RyoheiHagimoto | 0:0e0631af0305 | 771 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) |
| RyoheiHagimoto | 0:0e0631af0305 | 772 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8) |
| RyoheiHagimoto | 0:0e0631af0305 | 773 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 774 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 775 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 776 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 777 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 778 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 779 | |
| RyoheiHagimoto | 0:0e0631af0305 | 780 | inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 781 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 782 | __m128i delta = _mm_set1_epi8((char)-128); |
| RyoheiHagimoto | 0:0e0631af0305 | 783 | return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), |
| RyoheiHagimoto | 0:0e0631af0305 | 784 | _mm_xor_si128(b.val, delta)))); |
| RyoheiHagimoto | 0:0e0631af0305 | 785 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 786 | inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 787 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 788 | __m128i delta = _mm_set1_epi8((char)-128); |
| RyoheiHagimoto | 0:0e0631af0305 | 789 | return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), |
| RyoheiHagimoto | 0:0e0631af0305 | 790 | _mm_xor_si128(b.val, delta)))); |
| RyoheiHagimoto | 0:0e0631af0305 | 791 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 792 | inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 793 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 794 | return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); |
| RyoheiHagimoto | 0:0e0631af0305 | 795 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 796 | inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 797 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 798 | return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 799 | } |
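
SSE2 only provides min/max natively for unsigned 8-bit and signed 16-bit lanes; the four functions above synthesize the missing combinations. Signed 8-bit min/max XORs both operands with 0x80, an order-preserving shift of the signed range onto the unsigned one, so `_mm_min_epu8`/`_mm_max_epu8` apply, and flips back afterwards. Unsigned 16-bit min/max exploits saturating subtraction: `subs(a, b) = max(a - b, 0)`, hence `a - subs(a, b) = min(a, b)` and `subs(a, b) + b = max(a, b)`. Scalar views of both identities:

```cpp
// Reference for the two workarounds above (per-lane semantics).
unsigned short min_u16_ref(unsigned short a, unsigned short b)
{
    unsigned short d = (unsigned short)(a > b ? a - b : 0); // _mm_subs_epu16(a, b)
    return (unsigned short)(a - d);                         // == min(a, b)
}

signed char min_s8_ref(signed char a, signed char b)
{
    unsigned char ua = (unsigned char)a ^ 0x80; // shift range: -128..127 -> 0..255
    unsigned char ub = (unsigned char)b ^ 0x80;
    return (signed char)((ua < ub ? ua : ub) ^ 0x80);
}
```
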
| RyoheiHagimoto | 0:0e0631af0305 | 800 | inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 801 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 802 | __m128i delta = _mm_set1_epi32((int)0x80000000); |
| RyoheiHagimoto | 0:0e0631af0305 | 803 | __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); |
| RyoheiHagimoto | 0:0e0631af0305 | 804 | return v_uint32x4(v_select_si128(mask, b.val, a.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 805 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 806 | inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 807 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 808 | __m128i delta = _mm_set1_epi32((int)0x80000000); |
| RyoheiHagimoto | 0:0e0631af0305 | 809 | __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); |
| RyoheiHagimoto | 0:0e0631af0305 | 810 | return v_uint32x4(v_select_si128(mask, a.val, b.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 811 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 812 | inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 813 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 814 | return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 815 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 816 | inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 817 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 818 | return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 819 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 820 | |
| RyoheiHagimoto | 0:0e0631af0305 | 821 | #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 822 | inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 823 | { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 824 | inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 825 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 826 | __m128i not_mask = _mm_set1_epi32(-1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 827 | return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 828 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 829 | inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 830 | { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 831 | inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 832 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 833 | __m128i not_mask = _mm_set1_epi32(-1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 834 | return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 835 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 836 | inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 837 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 838 | __m128i smask = _mm_set1_##suffix(sbit); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 839 | return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 840 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 841 | inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 842 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 843 | __m128i smask = _mm_set1_##suffix(sbit); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 844 | return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 845 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 846 | inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 847 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 848 | __m128i smask = _mm_set1_##suffix(sbit); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 849 | __m128i not_mask = _mm_set1_epi32(-1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 850 | __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 851 | return _Tpuvec(_mm_xor_si128(res, not_mask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 852 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 853 | inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 854 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 855 | __m128i smask = _mm_set1_##suffix(sbit); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 856 | __m128i not_mask = _mm_set1_epi32(-1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 857 | __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 858 | return _Tpuvec(_mm_xor_si128(res, not_mask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 859 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 860 | inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 861 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 862 | return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 863 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 864 | inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 865 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 866 | return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 867 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 868 | inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 869 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 870 | __m128i not_mask = _mm_set1_epi32(-1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 871 | return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 872 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 873 | inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 874 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 875 | __m128i not_mask = _mm_set1_epi32(-1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 876 | return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 877 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 878 | |
| RyoheiHagimoto | 0:0e0631af0305 | 879 | OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) |
| RyoheiHagimoto | 0:0e0631af0305 | 880 | OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) |
| RyoheiHagimoto | 0:0e0631af0305 | 881 | OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) |
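
SSE2 integer comparisons exist only for signed operands and only as `eq`/`gt`, so the macro derives everything else: `!=`, `<=` and `>=` invert the corresponding mask with an XOR against all-ones (lane masks are 0 or −1), and the unsigned orderings first XOR both operands with the sign bit, an order-preserving map from the unsigned onto the signed range. A scalar view of the unsigned trick:

```cpp
// Unsigned '<' expressed through a signed comparison, as in the macro above.
bool ltu_via_signed(unsigned a, unsigned b)
{
    int sa = (int)(a ^ 0x80000000u);
    int sb = (int)(b ^ 0x80000000u);
    return sa < sb; // equals (a < b) on the original unsigned values
}
```
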
| RyoheiHagimoto | 0:0e0631af0305 | 882 | |
| RyoheiHagimoto | 0:0e0631af0305 | 883 | #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 884 | inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 885 | { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 886 | inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 887 | { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 888 | inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 889 | { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 890 | inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 891 | { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 892 | inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 893 | { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 894 | inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 895 | { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 896 | |
| RyoheiHagimoto | 0:0e0631af0305 | 897 | OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 898 | OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 899 | |
| RyoheiHagimoto | 0:0e0631af0305 | 900 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) |
| RyoheiHagimoto | 0:0e0631af0305 | 901 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) |
| RyoheiHagimoto | 0:0e0631af0305 | 902 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 903 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 904 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) |
| RyoheiHagimoto | 0:0e0631af0305 | 905 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) |
| RyoheiHagimoto | 0:0e0631af0305 | 906 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 907 | OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 908 | |
| RyoheiHagimoto | 0:0e0631af0305 | 909 | #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 910 | inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 911 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 912 | return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 913 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 914 | inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 915 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 916 | __m128i smask = _mm_set1_epi32(smask32); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 917 | __m128i a1 = _mm_xor_si128(a.val, smask); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 918 | __m128i b1 = _mm_xor_si128(b.val, smask); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 919 | return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 920 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 921 | |
| RyoheiHagimoto | 0:0e0631af0305 | 922 | OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) |
| RyoheiHagimoto | 0:0e0631af0305 | 923 | OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) |
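
The 8/16-bit `v_absdiff` rests on the saturating-subtraction identity `|a - b| = subs(a, b) + subs(b, a)`: exactly one of the two terms is non-zero, so the plain add cannot overflow. The signed variants first XOR with the sign bit (the same range shift used by `v_min`/`v_max` above) and reuse the unsigned path, returning an unsigned vector. A scalar reference:

```cpp
// Per-lane model of the saturating-subtraction absdiff above.
unsigned char absdiff_u8_ref(unsigned char a, unsigned char b)
{
    unsigned char d0 = (unsigned char)(a > b ? a - b : 0); // _mm_subs_epu8(a, b)
    unsigned char d1 = (unsigned char)(b > a ? b - a : 0); // _mm_subs_epu8(b, a)
    return (unsigned char)(d0 + d1); // exactly one term is non-zero
}
```
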
| RyoheiHagimoto | 0:0e0631af0305 | 924 | |
| RyoheiHagimoto | 0:0e0631af0305 | 925 | inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 926 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 927 | return v_max(a, b) - v_min(a, b); |
| RyoheiHagimoto | 0:0e0631af0305 | 928 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 929 | |
| RyoheiHagimoto | 0:0e0631af0305 | 930 | inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 931 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 932 | __m128i d = _mm_sub_epi32(a.val, b.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 933 | __m128i m = _mm_cmpgt_epi32(b.val, a.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 934 | return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); |
| RyoheiHagimoto | 0:0e0631af0305 | 935 | } |
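
The 32-bit signed `v_absdiff` has no saturating subtraction to lean on, so it computes the wrapped difference and conditionally negates it: `m` is all-ones exactly where `b > a`, and `(d ^ m) - m` is `~d + 1 = -d` there and `d` elsewhere. Because the arithmetic is modulo 2³², the result is correct as an unsigned value even when `a - b` overflows the signed range. A scalar equivalent:

```cpp
// Conditional-negate absdiff as used above, in modular unsigned arithmetic.
unsigned absdiff_s32_ref(int a, int b)
{
    unsigned d = (unsigned)a - (unsigned)b;
    unsigned m = b > a ? 0xFFFFFFFFu : 0u; // the _mm_cmpgt_epi32 mask
    return (d ^ m) - m;                    // -d where m is all-ones, else d
}
```
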
| RyoheiHagimoto | 0:0e0631af0305 | 936 | |
| RyoheiHagimoto | 0:0e0631af0305 | 937 | #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 938 | inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 939 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 940 | _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 941 | return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 942 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 943 | inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 944 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 945 | _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 946 | return _Tpvec(_mm_sqrt_##suffix(res)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 947 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 948 | inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 949 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 950 | _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 951 | return _Tpvec(res); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 952 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 953 | inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 954 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 955 | return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 956 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 957 | |
| RyoheiHagimoto | 0:0e0631af0305 | 958 | OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) |
| RyoheiHagimoto | 0:0e0631af0305 | 959 | OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) |
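
For the float types, `v_absdiff` simply masks off the sign bit of `a - b`, `v_magnitude` is a plain `sqrt(a*a + b*b)` with no overflow guard, and `v_muladd` is an unfused multiply-then-add: SSE2 predates FMA, so the intermediate product is rounded. A small sketch built on the generated `v_muladd` (the function name is ours):

```cpp
#include <opencv2/core/hal/intrin.hpp>

// One Horner step of polynomial evaluation: acc*x + coef (rounded twice,
// since v_muladd above is not fused on SSE2).
cv::v_float32x4 horner_step(const cv::v_float32x4& acc,
                            const cv::v_float32x4& x,
                            const cv::v_float32x4& coef)
{
    return cv::v_muladd(acc, x, coef);
}
```
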
| RyoheiHagimoto | 0:0e0631af0305 | 960 | |
| RyoheiHagimoto | 0:0e0631af0305 | 961 | #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 962 | inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 963 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 964 | return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 965 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 966 | inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 967 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 968 | return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 969 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 970 | inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 971 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 972 | return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 973 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 974 | inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 975 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 976 | return _Tpsvec(srai(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 977 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 978 | template<int imm> \ |
| RyoheiHagimoto | 0:0e0631af0305 | 979 | inline _Tpuvec v_shl(const _Tpuvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 980 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 981 | return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 982 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 983 | template<int imm> \ |
| RyoheiHagimoto | 0:0e0631af0305 | 984 | inline _Tpsvec v_shl(const _Tpsvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 985 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 986 | return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 987 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 988 | template<int imm> \ |
| RyoheiHagimoto | 0:0e0631af0305 | 989 | inline _Tpuvec v_shr(const _Tpuvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 990 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 991 | return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 992 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 993 | template<int imm> \ |
| RyoheiHagimoto | 0:0e0631af0305 | 994 | inline _Tpsvec v_shr(const _Tpsvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 995 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 996 | return _Tpsvec(srai(a.val, imm)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 997 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 998 | |
| RyoheiHagimoto | 0:0e0631af0305 | 999 | OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16) |
| RyoheiHagimoto | 0:0e0631af0305 | 1000 | OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32) |
| RyoheiHagimoto | 0:0e0631af0305 | 1001 | OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) |
| RyoheiHagimoto | 0:0e0631af0305 | 1002 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1003 | #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1004 | inline _Tpvec v_load(const _Tp* ptr) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1005 | { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1006 | inline _Tpvec v_load_aligned(const _Tp* ptr) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1007 | { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1008 | inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1009 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1010 | return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1011 | _mm_loadl_epi64((const __m128i*)ptr1))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1012 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1013 | inline void v_store(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1014 | { _mm_storeu_si128((__m128i*)ptr, a.val); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1015 | inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1016 | { _mm_store_si128((__m128i*)ptr, a.val); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1017 | inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1018 | { _mm_storel_epi64((__m128i*)ptr, a.val); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1019 | inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1020 | { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 1021 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1022 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) |
| RyoheiHagimoto | 0:0e0631af0305 | 1023 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar) |
| RyoheiHagimoto | 0:0e0631af0305 | 1024 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort) |
| RyoheiHagimoto | 0:0e0631af0305 | 1025 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short) |
| RyoheiHagimoto | 0:0e0631af0305 | 1026 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned) |
| RyoheiHagimoto | 0:0e0631af0305 | 1027 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int) |
| RyoheiHagimoto | 0:0e0631af0305 | 1028 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64) |
| RyoheiHagimoto | 0:0e0631af0305 | 1029 | OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64) |
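
The load/store family makes the alignment contract explicit: `v_load`/`v_store` use the unaligned `movdqu` forms and accept any address, while the `_aligned` variants require 16-byte alignment and may fault otherwise; `v_load_halves` stitches two independent 64-bit loads, and `v_store_low`/`v_store_high` write one half each. A minimal sketch, again assuming the usual umbrella header:

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Copy 16 bytes through a register; both pointers may be unaligned.
void copy16(const unsigned char* src, unsigned char* dst)
{
    cv::v_uint8x16 v = cv::v_load(src); // _mm_loadu_si128
    cv::v_store(dst, v);                // _mm_storeu_si128
}
```
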
| RyoheiHagimoto | 0:0e0631af0305 | 1030 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1031 | #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1032 | inline _Tpvec v_load(const _Tp* ptr) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1033 | { return _Tpvec(_mm_loadu_##suffix(ptr)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1034 | inline _Tpvec v_load_aligned(const _Tp* ptr) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1035 | { return _Tpvec(_mm_load_##suffix(ptr)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1036 | inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1037 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1038 | return _Tpvec(_mm_castsi128_##suffix( \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1039 | _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1040 | _mm_loadl_epi64((const __m128i*)ptr1)))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1041 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1042 | inline void v_store(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1043 | { _mm_storeu_##suffix(ptr, a.val); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1044 | inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1045 | { _mm_store_##suffix(ptr, a.val); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1046 | inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1047 | { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1048 | inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1049 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1050 | __m128i a1 = _mm_cast##suffix##_si128(a.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1051 | _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1052 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1053 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1054 | OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 1055 | OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 1056 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1057 | #if defined(HAVE_FP16) |
| RyoheiHagimoto | 0:0e0631af0305 | 1058 | inline v_float16x4 v_load_f16(const short* ptr) |
| RyoheiHagimoto | 0:0e0631af0305 | 1059 | { return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 1060 | inline void v_store_f16(short* ptr, v_float16x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1061 | { _mm_storel_epi64((__m128i*)ptr, a.val); } |
| RyoheiHagimoto | 0:0e0631af0305 | 1062 | #endif |
| RyoheiHagimoto | 0:0e0631af0305 | 1063 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1064 | #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1065 | inline scalartype v_reduce_##func(const v_##_Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1066 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1067 | __m128i val = a.val; \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1068 | val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1069 | val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1070 | val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1071 | return (scalartype)_mm_cvtsi128_si32(val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1072 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1073 | inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1074 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1075 | __m128i val = a.val; \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1076 | __m128i smask = _mm_set1_epi16(sbit); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1077 | val = _mm_xor_si128(val, smask); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1078 | val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1079 | val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1080 | val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1081 | return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1082 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1083 | #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1084 | inline scalartype v_reduce_sum(const v_##_Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1085 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1086 | __m128i val = a.val; \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1087 | val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1088 | val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1089 | val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1090 | return (scalartype)_mm_cvtsi128_si32(val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1091 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1092 | inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1093 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1094 | __m128i val = a.val; \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1095 | val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1096 | val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1097 | val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1098 | return (unsigned scalartype)_mm_cvtsi128_si32(val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1099 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1100 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768) |
| RyoheiHagimoto | 0:0e0631af0305 | 1101 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768) |
| RyoheiHagimoto | 0:0e0631af0305 | 1102 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16) |
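
The 8-lane reductions fold the register in halves: combining with copies shifted right by 8, 4 and then 2 bytes leaves `op(a0..a7)` in lane 0, which `_mm_cvtsi128_si32` extracts. Two details are worth noting from the code: the unsigned min/max variants range-shift through `smask` and shift back on the way out, and `v_reduce_sum` for 16-bit lanes uses the saturating adds (`_mm_adds_epi16`/`_mm_adds_epu16`), so a sum that exceeds the 16-bit range clamps rather than wrapping. A scalar model of what the folding computes:

```cpp
#include <algorithm>

// What v_reduce_max(v_int16x8) computes, lane by lane.
short reduce_max_ref(const short a[8])
{
    short r = a[0];
    for (int i = 1; i < 8; i++)
        r = std::max(r, a[i]);
    return r;
}
```
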
| RyoheiHagimoto | 0:0e0631af0305 | 1103 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1104 | #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1105 | inline scalartype v_reduce_##func(const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1106 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1107 | scalartype CV_DECL_ALIGNED(16) buf[4]; \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1108 | v_store_aligned(buf, a); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1109 | scalartype s0 = scalar_func(buf[0], buf[1]); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1110 | scalartype s1 = scalar_func(buf[2], buf[3]); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1111 | return scalar_func(s0, s1); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1112 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1113 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1114 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) |
| RyoheiHagimoto | 0:0e0631af0305 | 1115 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) |
| RyoheiHagimoto | 0:0e0631af0305 | 1116 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) |
| RyoheiHagimoto | 0:0e0631af0305 | 1117 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) |
| RyoheiHagimoto | 0:0e0631af0305 | 1118 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) |
| RyoheiHagimoto | 0:0e0631af0305 | 1119 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) |
| RyoheiHagimoto | 0:0e0631af0305 | 1120 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) |
| RyoheiHagimoto | 0:0e0631af0305 | 1121 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) |
| RyoheiHagimoto | 0:0e0631af0305 | 1122 | OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) |
| RyoheiHagimoto | 0:0e0631af0305 | 1123 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1124 | #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1125 | inline int v_signmask(const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1126 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1127 | return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1128 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1129 | inline bool v_check_all(const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1130 | { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1131 | inline bool v_check_any(const _Tpvec& a) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1132 | { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } |
| RyoheiHagimoto | 0:0e0631af0305 | 1133 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1134 | #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1135 | inline __m128i v_packq_epi32(__m128i a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1136 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1137 | __m128i b = _mm_packs_epi32(a, a); |
| RyoheiHagimoto | 0:0e0631af0305 | 1138 | return _mm_packs_epi16(b, b); |
| RyoheiHagimoto | 0:0e0631af0305 | 1139 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1140 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1141 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) |
| RyoheiHagimoto | 0:0e0631af0305 | 1142 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) |
| RyoheiHagimoto | 0:0e0631af0305 | 1143 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) |
| RyoheiHagimoto | 0:0e0631af0305 | 1144 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) |
| RyoheiHagimoto | 0:0e0631af0305 | 1145 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) |
| RyoheiHagimoto | 0:0e0631af0305 | 1146 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) |
| RyoheiHagimoto | 0:0e0631af0305 | 1147 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) |
| RyoheiHagimoto | 0:0e0631af0305 | 1148 | OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) |
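
`v_signmask` is built on `movemask`, which gathers the most significant bit of each lane into an integer. Since SSE2 has no 16- or 32-bit integer movemask, those lane widths are first narrowed with `packs` (once for 16-bit, twice for 32-bit via `v_packq_epi32`) so the byte-wise `_mm_movemask_epi8` can be reused; `v_check_all`/`v_check_any` test the raw sign-bit pattern instead. A minimal sketch (the function name is ours):

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Count lanes with the sign bit set (e.g. negative floats).
int count_negative(const cv::v_float32x4& v)
{
    int m = cv::v_signmask(v); // bit i == sign bit of lane i, 4 bits total
    return (m & 1) + ((m >> 1) & 1) + ((m >> 2) & 1) + ((m >> 3) & 1);
}
```
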
| RyoheiHagimoto | 0:0e0631af0305 | 1149 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1150 | #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1151 | inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1152 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1153 | return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1154 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1155 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1156 | OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1157 | OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1158 | OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1159 | OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1160 | OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1161 | OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1162 | // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1163 | // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) |
| RyoheiHagimoto | 0:0e0631af0305 | 1164 | OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 1165 | OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) |
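
`v_select` is the standard bitwise blend `b ^ ((b ^ a) & mask)`: where a mask lane is all-ones the XORs cancel to `a`, and where it is all-zeros the AND kills the difference so `b` survives. It therefore assumes mask lanes are fully set or fully clear, which is exactly what the comparison operators above produce. A typical pairing (a sketch; `relu` is our name, and `v_setzero_s32` is the zero constructor defined earlier in this header):

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Clamp negative lanes to zero using a comparison mask and v_select.
cv::v_int32x4 relu(const cv::v_int32x4& x)
{
    cv::v_int32x4 zero = cv::v_setzero_s32();
    return cv::v_select(x > zero, x, zero);
}
```
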
| RyoheiHagimoto | 0:0e0631af0305 | 1166 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1167 | #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1168 | inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1169 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1170 | __m128i z = _mm_setzero_si128(); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1171 | b0.val = _mm_unpacklo_##suffix(a.val, z); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1172 | b1.val = _mm_unpackhi_##suffix(a.val, z); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1173 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1174 | inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1175 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1176 | __m128i z = _mm_setzero_si128(); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1177 | return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1178 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1179 | inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1180 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1181 | b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1182 | b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1183 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1184 | inline _Tpwsvec v_load_expand(const _Tps* ptr) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1185 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1186 | __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1187 | return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1188 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1189 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1190 | OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) |
| RyoheiHagimoto | 0:0e0631af0305 | 1191 | OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) |
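
The expand macro widens lanes in two flavors: unsigned values are zero-extended by unpacking against a zero register, while signed values are unpacked against themselves, duplicating each lane into a double-width slot, and then shifted arithmetically right by the original lane width so the top half fills with sign copies. A minimal usage sketch:

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Widen 8 pixels to 16-bit lanes before accumulating, to avoid overflow.
cv::v_uint16x8 widen8(const unsigned char* ptr)
{
    return cv::v_load_expand(ptr); // 8 bytes -> 8 zero-extended u16 lanes
}
```
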
| RyoheiHagimoto | 0:0e0631af0305 | 1192 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1193 | inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1) |
| RyoheiHagimoto | 0:0e0631af0305 | 1194 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1195 | __m128i z = _mm_setzero_si128(); |
| RyoheiHagimoto | 0:0e0631af0305 | 1196 | b0.val = _mm_unpacklo_epi32(a.val, z); |
| RyoheiHagimoto | 0:0e0631af0305 | 1197 | b1.val = _mm_unpackhi_epi32(a.val, z); |
| RyoheiHagimoto | 0:0e0631af0305 | 1198 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1199 | inline v_uint64x2 v_load_expand(const unsigned* ptr) |
| RyoheiHagimoto | 0:0e0631af0305 | 1200 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1201 | __m128i z = _mm_setzero_si128(); |
| RyoheiHagimoto | 0:0e0631af0305 | 1202 | return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1203 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1204 | inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1) |
| RyoheiHagimoto | 0:0e0631af0305 | 1205 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1206 | __m128i s = _mm_srai_epi32(a.val, 31); |
| RyoheiHagimoto | 0:0e0631af0305 | 1207 | b0.val = _mm_unpacklo_epi32(a.val, s); |
| RyoheiHagimoto | 0:0e0631af0305 | 1208 | b1.val = _mm_unpackhi_epi32(a.val, s); |
| RyoheiHagimoto | 0:0e0631af0305 | 1209 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1210 | inline v_int64x2 v_load_expand(const int* ptr) |
| RyoheiHagimoto | 0:0e0631af0305 | 1211 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1212 | __m128i a = _mm_loadl_epi64((const __m128i*)ptr); |
| RyoheiHagimoto | 0:0e0631af0305 | 1213 | __m128i s = _mm_srai_epi32(a, 31); |
| RyoheiHagimoto | 0:0e0631af0305 | 1214 | return v_int64x2(_mm_unpacklo_epi32(a, s)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1215 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1216 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1217 | inline v_uint32x4 v_load_expand_q(const uchar* ptr) |
| RyoheiHagimoto | 0:0e0631af0305 | 1218 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1219 | __m128i z = _mm_setzero_si128(); |
| RyoheiHagimoto | 0:0e0631af0305 | 1220 | __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); |
| RyoheiHagimoto | 0:0e0631af0305 | 1221 | return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1222 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1223 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1224 | inline v_int32x4 v_load_expand_q(const schar* ptr) |
| RyoheiHagimoto | 0:0e0631af0305 | 1225 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1226 | __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); |
| RyoheiHagimoto | 0:0e0631af0305 | 1227 | a = _mm_unpacklo_epi8(a, a); |
| RyoheiHagimoto | 0:0e0631af0305 | 1228 | a = _mm_unpacklo_epi8(a, a); |
| RyoheiHagimoto | 0:0e0631af0305 | 1229 | return v_int32x4(_mm_srai_epi32(a, 24)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1230 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1231 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1232 | #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1233 | inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1234 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1235 | b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1236 | b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1237 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1238 | inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1239 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1240 | __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1241 | return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1242 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1243 | inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1244 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1245 | __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1246 | return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1247 | } \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1248 | inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1249 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1250 | __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1251 | c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1252 | d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1253 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1254 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1255 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1256 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1257 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1258 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1259 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1260 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1261 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) |
| RyoheiHagimoto | 0:0e0631af0305 | 1262 | OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) |
| RyoheiHagimoto | 0:0e0631af0305 | 1263 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1264 | template<int s, typename _Tpvec> |
| RyoheiHagimoto | 0:0e0631af0305 | 1265 | inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) |
| RyoheiHagimoto | 0:0e0631af0305 | 1266 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1267 | const int w = sizeof(typename _Tpvec::lane_type); |
| RyoheiHagimoto | 0:0e0631af0305 | 1268 | const int n = _Tpvec::nlanes; |
| RyoheiHagimoto | 0:0e0631af0305 | 1269 | __m128i ra, rb; |
| RyoheiHagimoto | 0:0e0631af0305 | 1270 | ra = _mm_srli_si128(a.val, s*w); |
| RyoheiHagimoto | 0:0e0631af0305 | 1271 | rb = _mm_slli_si128(b.val, (n-s)*w); |
| RyoheiHagimoto | 0:0e0631af0305 | 1272 | return _Tpvec(_mm_or_si128(ra, rb)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1273 | } |
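
`v_extract<s>` returns lanes `s .. s+n-1` of the 2n-lane concatenation `(a, b)`, implemented as a byte shift of each register plus an OR; `s` must be a compile-time constant because `_mm_srli_si128`/`_mm_slli_si128` take immediate operands. This is the usual building block for sliding a window across consecutive registers, e.g. in separable filters. A sketch:

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Lane i of the result is lane i+1 of the pair (cur, next): a one-lane
// sliding window over two adjacent registers.
cv::v_uint8x16 shift_window(const cv::v_uint8x16& cur, const cv::v_uint8x16& next)
{
    return cv::v_extract<1>(cur, next);
}
```
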
| RyoheiHagimoto | 0:0e0631af0305 | 1274 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1275 | inline v_int32x4 v_round(const v_float32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1276 | { return v_int32x4(_mm_cvtps_epi32(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 1277 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1278 | inline v_int32x4 v_floor(const v_float32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1279 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1280 | __m128i a1 = _mm_cvtps_epi32(a.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 1281 | __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1282 | return v_int32x4(_mm_add_epi32(a1, mask)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1283 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1284 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1285 | inline v_int32x4 v_ceil(const v_float32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1286 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1287 | __m128i a1 = _mm_cvtps_epi32(a.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 1288 | __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 1289 | return v_int32x4(_mm_sub_epi32(a1, mask)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1290 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1291 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1292 | inline v_int32x4 v_trunc(const v_float32x4& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1293 | { return v_int32x4(_mm_cvttps_epi32(a.val)); } |
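
`_mm_cvtps_epi32` rounds to nearest (even) under the default MXCSR state, which gives `v_round` directly; `v_floor` then repairs the overshoot: wherever the rounded value is greater than the input, the comparison mask is −1 and adding it decrements that lane. `v_ceil` is symmetric, subtracting the −1 mask to increment. A scalar model of the floor repair (assuming the default rounding mode):

```cpp
#include <cmath>

// v_floor, lane by lane: round to nearest, then step down where it overshot.
int floor_ref(float x)
{
    int r = (int)std::lrintf(x);        // like _mm_cvtps_epi32
    return r + (((float)r > x) ? -1 : 0);
}
```
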
| RyoheiHagimoto | 0:0e0631af0305 | 1294 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1295 | inline v_int32x4 v_round(const v_float64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1296 | { return v_int32x4(_mm_cvtpd_epi32(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 1297 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1298 | inline v_int32x4 v_floor(const v_float64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1299 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1300 | __m128i a1 = _mm_cvtpd_epi32(a.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 1301 | __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1302 | mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 1303 | return v_int32x4(_mm_add_epi32(a1, mask)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1304 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1305 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1306 | inline v_int32x4 v_ceil(const v_float64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1307 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1308 | __m128i a1 = _mm_cvtpd_epi32(a.val); |
| RyoheiHagimoto | 0:0e0631af0305 | 1309 | __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); |
| RyoheiHagimoto | 0:0e0631af0305 | 1310 | mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 |
| RyoheiHagimoto | 0:0e0631af0305 | 1311 | return v_int32x4(_mm_sub_epi32(a1, mask)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1312 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1313 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1314 | inline v_int32x4 v_trunc(const v_float64x2& a) |
| RyoheiHagimoto | 0:0e0631af0305 | 1315 | { return v_int32x4(_mm_cvttpd_epi32(a.val)); } |
| RyoheiHagimoto | 0:0e0631af0305 | 1316 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1317 | #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1318 | inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1319 | const _Tpvec& a2, const _Tpvec& a3, \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1320 | _Tpvec& b0, _Tpvec& b1, \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1321 | _Tpvec& b2, _Tpvec& b3) \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1322 | { \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1323 | __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1324 | __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1325 | __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1326 | __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1327 | \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1328 | b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1329 | b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1330 | b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1331 | b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ |
| RyoheiHagimoto | 0:0e0631af0305 | 1332 | } |
| RyoheiHagimoto | 0:0e0631af0305 | 1333 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1334 | OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1335 | OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) |
| RyoheiHagimoto | 0:0e0631af0305 | 1336 | OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) |
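
`v_transpose4x4` is the classic two-stage transpose: the first round of 32-bit unpacks interleaves row pairs, and the second round of 64-bit unpacks gathers full columns. A minimal usage sketch for a row-major 4×4 float tile (the function name is ours):

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Transpose a 4x4 float tile: dst[i][j] = src[j][i].
void transpose_tile(const float* src, float* dst)
{
    cv::v_float32x4 r0 = cv::v_load(src),     r1 = cv::v_load(src + 4),
                    r2 = cv::v_load(src + 8), r3 = cv::v_load(src + 12);
    cv::v_float32x4 c0, c1, c2, c3;
    cv::v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);
    cv::v_store(dst,     c0); cv::v_store(dst + 4,  c1);
    cv::v_store(dst + 8, c2); cv::v_store(dst + 12, c3);
}
```
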
| RyoheiHagimoto | 0:0e0631af0305 | 1337 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1338 | // adopted from sse_utils.hpp |
| RyoheiHagimoto | 0:0e0631af0305 | 1339 | inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) |
| RyoheiHagimoto | 0:0e0631af0305 | 1340 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1341 | __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); |
| RyoheiHagimoto | 0:0e0631af0305 | 1342 | __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1343 | __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1344 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1345 | __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1346 | __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); |
| RyoheiHagimoto | 0:0e0631af0305 | 1347 | __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1348 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1349 | __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1350 | __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); |
| RyoheiHagimoto | 0:0e0631af0305 | 1351 | __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1352 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1353 | __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1354 | __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22); |
| RyoheiHagimoto | 0:0e0631af0305 | 1355 | __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1356 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1357 | a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1358 | b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); |
| RyoheiHagimoto | 0:0e0631af0305 | 1359 | c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); |
| RyoheiHagimoto | 0:0e0631af0305 | 1360 | } |
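
Without a variable byte shuffle (SSSE3's `pshufb` is unavailable in this SSE2 path), the 3-channel deinterleave above runs four rounds of `unpacklo` against half-swapped copies, progressively regrouping the bytes until the 16 `a`, `b` and `c` values land in separate registers. A minimal usage sketch, e.g. for splitting packed BGR pixels into planes (the function name is ours):

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Split 16 packed BGR pixels (48 bytes) into three 16-byte planes.
void split_bgr16(const unsigned char* bgr,
                 unsigned char* b, unsigned char* g, unsigned char* r)
{
    cv::v_uint8x16 vb, vg, vr;
    cv::v_load_deinterleave(bgr, vb, vg, vr);
    cv::v_store(b, vb);
    cv::v_store(g, vg);
    cv::v_store(r, vr);
}
```
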
| RyoheiHagimoto | 0:0e0631af0305 | 1361 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1362 | inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) |
| RyoheiHagimoto | 0:0e0631af0305 | 1363 | { |
| RyoheiHagimoto | 0:0e0631af0305 | 1364 | __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1365 | __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1366 | __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1367 | __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1368 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1369 | __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1370 | __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1371 | __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1372 | __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1373 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1374 | u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1375 | u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1376 | u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1377 | u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1378 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1379 | v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1380 | v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1381 | v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1382 | v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... |
| RyoheiHagimoto | 0:0e0631af0305 | 1383 | |
| RyoheiHagimoto | 0:0e0631af0305 | 1384 | a.val = _mm_unpacklo_epi8(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 1385 | b.val = _mm_unpackhi_epi8(v0, v1); |
| RyoheiHagimoto | 0:0e0631af0305 | 1386 | c.val = _mm_unpacklo_epi8(v2, v3); |
| RyoheiHagimoto | 0:0e0631af0305 | 1387 | d.val = _mm_unpackhi_epi8(v2, v3); |
| RyoheiHagimoto | 0:0e0631af0305 | 1388 | } |

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 a1 b1 c1 a2 b2
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));  // c2 a3 b3 c3 a4 b4 c4 a5
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // b5 c5 a6 b6 c6 a7 b7 c7

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01)); // a0 a4 b0 b4 c0 c4 a1 a5
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02); // b1 b5 c1 c5 a2 a6 b2 b6
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02)); // c2 c6 a3 a7 b3 b7 c3 c7

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11)); // a0 a2 a4 a6 b0 b2 b4 b6
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12); // c0 c2 c4 c6 a1 a3 a5 a7
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12)); // b1 b3 b5 b7 c1 c3 c5 c7

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21)); // a0 a1 a2 a3 a4 a5 a6 a7
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22); // b0 b1 b2 b3 b4 b5 b6 b7
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22)); // c0 c1 c2 c3 c4 c5 c6 c7
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8));  // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);       // a0 b0 c0 a1
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1 c1 a2 b2
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // c2 a3 b3 c3

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01)); // a0 a2 b0 b2
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02); // c0 c2 a1 a3
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02)); // b1 b3 c1 c3

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11)); // a0 a1 a2 a3
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12); // b0 b1 b2 b3
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12)); // c0 c1 c2 c3
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}
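
// Note: for 32-bit elements a 4-channel deinterleave is exactly a 4x4 transpose.
// Each loaded register holds one pixel (a_i b_i c_i d_i), and transposing the
// four registers groups the a's, b's, c's and d's into rows. A minimal sketch
// of the resulting layout (buffer name "quad" is hypothetical):
//
//   unsigned quad[16];                      // 4 pixels x 4 channels
//   v_uint32x4 a, b, c, d;
//   v_load_deinterleave(quad, a, b, c, d);  // a = {quad[0], quad[4], quad[8], quad[12]}, ...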

// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}
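
// How the masks pick lanes: _MM_SHUFFLE(z, y, x, w) packs four 2-bit lane
// indices, and _mm_shuffle_ps fills its two low result lanes from the first
// operand and its two high result lanes from the second. A worked expansion
// of mask_lo:
//
//   _MM_SHUFFLE(2, 0, 2, 0) == 0b10001000         // lane indices 0 and 2, twice
//   dst = { u0[0], u0[2], u1[0], u1[2] }          // = a0 a1 a2 a3
//
// mask_hi picks the odd lanes { u0[1], u0[3], u1[1], u1[3] } = b0 b1 b2 b3.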

inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
{
    __m128i t0, t1;
    t0 = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
    t1 = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 a5 b5 a6 b6 a7 b7
    _mm_storeu_si128((__m128i*)(ptr), t0);
    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    // Interleave a, b and c with a zero lane into (a,b,c,0) quads, then squeeze
    // the zero bytes out with 64-bit shifts and recombine into three vectors.
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}
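
// Usage sketch (illustrative only): writing three colour planes back to packed
// BGR. This round-trips with the 3-channel v_load_deinterleave above; the
// buffer name "bgr" is hypothetical.
//
//   uchar bgr[48];                       // 16 pixels x 3 interleaved bytes
//   v_uint8x16 b, g, r;
//   v_load_deinterleave(bgr, b, g, r);   // split into planes
//   v_store_interleave(bgr, b, g, r);    // ...and repack: bgr is unchanged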

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    // Same scheme as the 3-channel uchar store: build (a,b,c,0) quads with a
    // zero lane, then shift the zero words out while repacking.
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    // Transpose (a, b, c, 0) into per-pixel quads, then shift the zero element
    // out of each quad while packing the result into three vectors.
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
    // a0 a1 a2 a3 ...
    // b0 b1 b2 b3 ...
    __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    _mm_storeu_ps(ptr, u0);
    _mm_storeu_ps((ptr + 4), u1);
}
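
// Usage sketch (illustrative only): the 2-channel pair is handy for packed
// complex data. Variable names below are hypothetical.
//
//   float cplx[8];                      // 4 complex numbers: re0 im0 re1 im1 ...
//   v_float32x4 re, im;
//   v_load_deinterleave(cplx, re, im);  // re = re0..re3, im = im0..im3
//   v_store_interleave(cplx, re, im);   // repack into (re, im) pairs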

// Signed and float variants delegate to the unsigned implementations above via
// bit-exact reinterpret casts.
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
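
// For example, the f32 instantiation above expands (in sketch form) to:
//
//   inline void v_load_deinterleave(const float* ptr, v_float32x4& a0,
//                                   v_float32x4& b0, v_float32x4& c0)
//   {
//       v_uint32x4 a1, b1, c1;
//       v_load_deinterleave((const unsigned*)ptr, a1, b1, c1);
//       a0 = v_reinterpret_as_f32(a1);
//       b0 = v_reinterpret_as_f32(b1);
//       c0 = v_reinterpret_as_f32(c1);
//   }
//
// so float pixels reuse the integer shuffle cascades unchanged.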

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    // shift lanes 2,3 down to 0,1 before widening
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val, 8)));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    // shift lanes 2,3 down to 0,1 before widening
    return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val), 8))));
}
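
// Usage sketch (illustrative only): widening all four int32 lanes to double
// takes one v_cvt_f64 for the low pair and one v_cvt_f64_high for the high pair.
//
//   v_int32x4 v = v_setall_s32(42);
//   v_float64x2 lo = v_cvt_f64(v);       // (double)v[0], (double)v[1]
//   v_float64x2 hi = v_cvt_f64_high(v);  // (double)v[2], (double)v[3]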

#if defined(HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    return v_float16x4(_mm_cvtps_ph(a.val, 0)); // imm8 = 0: round to nearest even
}
#endif

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return checkHardwareSupport(CV_CPU_SSE2);
}
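
// Usage sketch (illustrative only): a typical runtime-dispatch guard around a
// SIMD kernel; process_simd and process_scalar are hypothetical helpers.
//
//   if (hasSIMD128())
//       process_simd(src, dst, n);    // path built on the v_* types above
//   else
//       process_scalar(src, dst, n);  // plain C++ fallback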

//! @}

//! @endcond

}

#endif