Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository's Zip archive or clone it locally using Mercurial.
You can also export all of your personal repositories from the account settings page.
Fork of gr-peach-opencv-project-sd-card
intrin_neon.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
#define __OPENCV_HAL_INTRIN_NEON_HPP__

#include <algorithm>

namespace cv
{
//! @cond IGNORED

#define CV_SIMD128 1

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }
    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }
    float32x4_t val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }
    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }
    int64x2_t val;
};

#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
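
// The pack family generated below narrows two wide vectors into one:
// v_pack / v_pack_store saturate while narrowing (vqmovn, or vqmovun for
// the signed-to-unsigned pack_u variants), and the v_rshr_* forms first
// apply a rounding right shift by the template parameter n (vqrshrn/vqrshrun).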
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}

#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
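
// ARMv7 NEON has no vector divide instruction, so operator / below computes
// a * (1/b): vrecpeq_f32 produces a coarse reciprocal estimate and each
// Newton-Raphson step refines it by multiplying with vrecpsq_f32(b, r),
// which evaluates to 2 - b*r.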
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vuzpq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}

#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
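
// v_sqrt uses the same scheme with the reciprocal square root: vrsqrteq_f32
// estimates 1/sqrt(x) and vrsqrtsq_f32 supplies the (3 - x*r*r)/2 refinement
// factor; the input is clamped to FLT_MIN first so a zero lane cannot turn
// the estimate into an infinity before the final multiply by x.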
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)


#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
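
// Note the asymmetry with the arithmetic operators above: for 8- and 16-bit
// lanes, operator + and operator - saturate (vqaddq/vqsubq), while the
// v_add_wrap / v_sub_wrap functions just defined wrap around modulo 2^N
// (plain vaddq/vsubq).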
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
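
// vld1q/vst1q have no alignment requirement on NEON, so the "aligned"
// load/store entry points generated below are simple aliases of the
// unaligned ones rather than separate fast paths.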
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)

inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
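
// v_check_all / v_check_any collapse a comparison-result vector into one
// bool: a shift isolates each lane's MSB (the "true" bit), the register is
// reinterpreted as two 64-bit lanes, and the two halves are OR-ed together.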
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)

inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)

#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}

OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
    return v_uint32x4(vmovl_u16(v1));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    return v_int32x4(vmovl_s16(v1));
}
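
// The unpack helpers below are thin wrappers over register rearrangement:
// v_zip interleaves two vectors lane by lane via vzipq, while v_combine_low,
// v_combine_high and v_recombine splice new vectors out of the 64-bit halves
// obtained with vget_low / vget_high.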
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)

#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
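
// Rounding helpers: v_round adds +/-0.5 with the sign bit copied from the
// input, then truncates with vcvtq_s32_f32; v_floor and v_ceil correct plain
// truncation by comparing it with the original value, exploiting the fact
// that a true comparison lane is all ones, i.e. -1 when reinterpreted as int.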
inline v_int32x4 v_round(const v_float32x4& a)
{
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }

#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)

#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}

OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
}

//! @endcond

}

#endif
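
For orientation, a short usage sketch follows. It is not part of the header: it assumes a NEON-enabled OpenCV build in which this file is reached through the umbrella header opencv2/core/hal/intrin.hpp (which supplies the uchar/uint64-style typedefs and <arm_neon.h>), and the saxpy function name is purely illustrative.

#include <opencv2/core/hal/intrin.hpp>

// Illustrative sketch: dst[i] = a*x[i] + y[i], four floats per iteration.
void saxpy(const float* x, const float* y, float* dst, int n, float a)
{
    using namespace cv;
    v_float32x4 va = v_setall_f32(a);            // broadcast a into all 4 lanes
    int i = 0;
    for (; i <= n - v_float32x4::nlanes; i += v_float32x4::nlanes)
    {
        v_float32x4 vx = v_load(x + i);          // vld1q_f32 under the hood
        v_float32x4 vy = v_load(y + i);
        v_store(dst + i, v_muladd(va, vx, vy));  // vmlaq_f32: vy + va*vx
    }
    for (; i < n; ++i)                           // scalar tail for n % 4 leftovers
        dst[i] = a * x[i] + y[i];
}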
Generated on Tue Jul 12 2022 14:47:12 by doxygen 1.7.2
