Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all of your personal repositories from the account settings page.
Dependents: RZ_A2M_Mbed_samples
intrin_neon.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

#define CV_SIMD128 1
#if defined(__aarch64__)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
#endif

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }
    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }
    float32x4_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }
    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }
    int64x2_t val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(float64x2_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = vld1q_f64(v);
    }
    double get0() const
    {
        return vgetq_lane_f64(val, 0);
    }
    float64x2_t val;
};
#endif

#if defined (HAVE_FP16)
// Workaround for old compilers
template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
{ return (int16x4_t)a; }
template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
{ return (float16x4_t)a; }
template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
{ return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
{ vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }

struct v_float16x4
{
    typedef short lane_type;
    enum { nlanes = 4 };

    v_float16x4() {}
    explicit v_float16x4(float16x4_t v) : val(v) {}
    v_float16x4(short v0, short v1, short v2, short v3)
    {
        short v[] = {v0, v1, v2, v3};
        val = vld1_f16(v);
    }
    short get0() const
    {
        return vget_lane_s16(vreinterpret_s16_f16(val), 0);
    }
    float16x4_t val;
};
#endif

#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif

#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}

#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vuzpq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}

#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}

#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vsqrtq_f32(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}
#else
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
#endif

#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0f);
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif

#if CV_SIMD128_64F
inline int64x2_t vmvnq_s64(int64x2_t a)
{
    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_s64(a, vx);
}
inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)

// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}

#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
}
#endif

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)

#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif

#if defined (HAVE_FP16)
// Workaround for old compilers
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(vld1_f16(ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ vst1_f16(ptr, a.val); }
#endif

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_uint64x2& a)
{
    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
}
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif

#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
#endif

inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

#if CV_SIMD128_64F
inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#endif

#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
#endif

#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}

OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
    return v_uint32x4(vmovl_u16(v1));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    return v_int32x4(vmovl_s16(v1));
}

#if defined(__aarch64__)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    b0.val = vzip1q_##suffix(a0.val, a1.val); \
    b1.val = vzip2q_##suffix(a0.val, a1.val); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#else
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#endif

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif

#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif

inline v_int32x4 v_round(const v_float32x4& a)
{
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }

#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
    a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
    a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero));
}
#endif

#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)

#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}

OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
}

#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    float32x2_t zero = vdup_n_f32(0.0f);
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
}
#endif

#if defined (HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    return v_float32x4(vcvt_f32_f16(a.val));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    return v_float16x4(vcvt_f16_f32(a.val));
}
#endif

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return checkHardwareSupport(CV_CPU_NEON);
}

//! @}

//! @endcond

}

#endif
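For orientation, here is a brief usage sketch of the universal intrinsics declared above. The function names (add_saturate, dot), the buffer handling and the include set are illustrative assumptions, not part of this header; in an OpenCV build this file is normally reached through opencv2/core/hal/intrin.hpp rather than included directly, and the NEON path is only taken when that dispatch header selects it.

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>   // selects intrin_neon.hpp on NEON-capable targets

// Saturating add of two byte buffers, 16 lanes per iteration (illustrative helper).
static void add_saturate(const unsigned char* src1, const unsigned char* src2,
                         unsigned char* dst, int n)
{
    int i = 0;
#if CV_SIMD128
    for (; i <= n - cv::v_uint8x16::nlanes; i += cv::v_uint8x16::nlanes)
    {
        cv::v_uint8x16 a = cv::v_load(src1 + i);
        cv::v_uint8x16 b = cv::v_load(src2 + i);
        cv::v_store(dst + i, a + b);             // operator+ maps to vqaddq_u8 (saturating)
    }
#endif
    for (; i < n; ++i)                           // scalar tail, also the fallback without SIMD
        dst[i] = cv::saturate_cast<unsigned char>(src1[i] + src2[i]);
}

// Dot product of two float buffers; assumes n is a multiple of 4 for brevity.
static float dot(const float* a, const float* b, int n)
{
    cv::v_float32x4 s = cv::v_setzero_f32();
    for (int i = 0; i < n; i += cv::v_float32x4::nlanes)
        s = cv::v_muladd(cv::v_load(a + i), cv::v_load(b + i), s);   // s += a*b per lane
    return cv::v_reduce_sum(s);                  // horizontal sum of the four lanes
}

The same source compiles against the other CV_SIMD128 backends (SSE, VSX, or the C fallback), which is the point of wrapping the raw NEON intrinsics in these v_ types.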
