Renesas / opencv-lib

Dependents:   RZ_A2M_Mbed_samples

intrin_neon.hpp Source File

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                          License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
00016 // Copyright (C) 2015, Itseez Inc., all rights reserved.
00017 // Third party copyrights are property of their respective owners.
00018 //
00019 // Redistribution and use in source and binary forms, with or without modification,
00020 // are permitted provided that the following conditions are met:
00021 //
00022 //   * Redistribution's of source code must retain the above copyright notice,
00023 //     this list of conditions and the following disclaimer.
00024 //
00025 //   * Redistribution's in binary form must reproduce the above copyright notice,
00026 //     this list of conditions and the following disclaimer in the documentation
00027 //     and/or other materials provided with the distribution.
00028 //
00029 //   * The name of the copyright holders may not be used to endorse or promote products
00030 //     derived from this software without specific prior written permission.
00031 //
00032 // This software is provided by the copyright holders and contributors "as is" and
00033 // any express or implied warranties, including, but not limited to, the implied
00034 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00035 // In no event shall the Intel Corporation or contributors be liable for any direct,
00036 // indirect, incidental, special, exemplary, or consequential damages
00037 // (including, but not limited to, procurement of substitute goods or services;
00038 // loss of use, data, or profits; or business interruption) however caused
00039 // and on any theory of liability, whether in contract, strict liability,
00040 // or tort (including negligence or otherwise) arising in any way out of
00041 // the use of this software, even if advised of the possibility of such damage.
00042 //
00043 //M*/
00044 
00045 #ifndef OPENCV_HAL_INTRIN_NEON_HPP
00046 #define OPENCV_HAL_INTRIN_NEON_HPP
00047 
00048 #include <algorithm>
00049 #include "opencv2/core/utility.hpp"
00050 
00051 namespace cv
00052 {
00053 
00054 //! @cond IGNORED
00055 
00056 #define CV_SIMD128 1
00057 #if defined(__aarch64__)
00058 #define CV_SIMD128_64F 1
00059 #else
00060 #define CV_SIMD128_64F 0
00061 #endif
00062 
00063 #if CV_SIMD128_64F
00064 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
00065 template <typename T> static inline \
00066 _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
00067 template <typename T> static inline \
00068 float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
00069 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
00070 OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
00071 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
00072 OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
00073 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
00074 OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
00075 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
00076 OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
00077 OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
00078 #endif
00079 
00080 struct v_uint8x16
00081 {
00082     typedef uchar lane_type;
00083     enum { nlanes = 16 };
00084 
00085     v_uint8x16() {}
00086     explicit v_uint8x16(uint8x16_t v) : val(v) {}
00087     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
00088                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
00089     {
00090         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
00091         val = vld1q_u8(v);
00092     }
00093     uchar get0() const
00094     {
00095         return vgetq_lane_u8(val, 0);
00096     }
00097 
00098     uint8x16_t val;
00099 };
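// Each v_* struct below wraps one 128-bit NEON register (val) together with its
// lane type and lane count; the element-wise constructor loads a stack array with
// vld1q_* and get0() reads lane 0. Illustrative sketch, assuming the header is
// included through the usual OpenCV universal-intrinsics path:
//
//     v_uint8x16 a(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
//     uchar lane0 = a.get0();   // == 0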
00100 
00101 struct v_int8x16
00102 {
00103     typedef schar lane_type;
00104     enum { nlanes = 16 };
00105 
00106     v_int8x16() {}
00107     explicit v_int8x16(int8x16_t v) : val(v) {}
00108     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
00109                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
00110     {
00111         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
00112         val = vld1q_s8(v);
00113     }
00114     schar get0() const
00115     {
00116         return vgetq_lane_s8(val, 0);
00117     }
00118 
00119     int8x16_t val;
00120 };
00121 
00122 struct v_uint16x8
00123 {
00124     typedef ushort lane_type;
00125     enum { nlanes = 8 };
00126 
00127     v_uint16x8() {}
00128     explicit v_uint16x8(uint16x8_t v) : val(v) {}
00129     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
00130     {
00131         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
00132         val = vld1q_u16(v);
00133     }
00134     ushort get0() const
00135     {
00136         return vgetq_lane_u16(val, 0);
00137     }
00138 
00139     uint16x8_t val;
00140 };
00141 
00142 struct v_int16x8
00143 {
00144     typedef short lane_type;
00145     enum { nlanes = 8 };
00146 
00147     v_int16x8() {}
00148     explicit v_int16x8(int16x8_t v) : val(v) {}
00149     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
00150     {
00151         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
00152         val = vld1q_s16(v);
00153     }
00154     short get0() const
00155     {
00156         return vgetq_lane_s16(val, 0);
00157     }
00158 
00159     int16x8_t val;
00160 };
00161 
00162 struct v_uint32x4
00163 {
00164     typedef unsigned lane_type;
00165     enum { nlanes = 4 };
00166 
00167     v_uint32x4() {}
00168     explicit v_uint32x4(uint32x4_t v) : val(v) {}
00169     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
00170     {
00171         unsigned v[] = {v0, v1, v2, v3};
00172         val = vld1q_u32(v);
00173     }
00174     unsigned get0() const
00175     {
00176         return vgetq_lane_u32(val, 0);
00177     }
00178 
00179     uint32x4_t val;
00180 };
00181 
00182 struct v_int32x4
00183 {
00184     typedef int lane_type;
00185     enum { nlanes = 4 };
00186 
00187     v_int32x4() {}
00188     explicit v_int32x4(int32x4_t v) : val(v) {}
00189     v_int32x4(int v0, int v1, int v2, int v3)
00190     {
00191         int v[] = {v0, v1, v2, v3};
00192         val = vld1q_s32(v);
00193     }
00194     int get0() const
00195     {
00196         return vgetq_lane_s32(val, 0);
00197     }
00198     int32x4_t val;
00199 };
00200 
00201 struct v_float32x4
00202 {
00203     typedef float lane_type;
00204     enum { nlanes = 4 };
00205 
00206     v_float32x4() {}
00207     explicit v_float32x4(float32x4_t v) : val(v) {}
00208     v_float32x4(float v0, float v1, float v2, float v3)
00209     {
00210         float v[] = {v0, v1, v2, v3};
00211         val = vld1q_f32(v);
00212     }
00213     float get0() const
00214     {
00215         return vgetq_lane_f32(val, 0);
00216     }
00217     float32x4_t val;
00218 };
00219 
00220 struct v_uint64x2
00221 {
00222     typedef uint64 lane_type;
00223     enum { nlanes = 2 };
00224 
00225     v_uint64x2() {}
00226     explicit v_uint64x2(uint64x2_t v) : val(v) {}
00227     v_uint64x2(uint64 v0, uint64 v1)
00228     {
00229         uint64 v[] = {v0, v1};
00230         val = vld1q_u64(v);
00231     }
00232     uint64 get0() const
00233     {
00234         return vgetq_lane_u64(val, 0);
00235     }
00236     uint64x2_t val;
00237 };
00238 
00239 struct v_int64x2
00240 {
00241     typedef int64 lane_type;
00242     enum { nlanes = 2 };
00243 
00244     v_int64x2() {}
00245     explicit v_int64x2(int64x2_t v) : val(v) {}
00246     v_int64x2(int64 v0, int64 v1)
00247     {
00248         int64 v[] = {v0, v1};
00249         val = vld1q_s64(v);
00250     }
00251     int64 get0() const
00252     {
00253         return vgetq_lane_s64(val, 0);
00254     }
00255     int64x2_t val;
00256 };
00257 
00258 #if CV_SIMD128_64F
00259 struct v_float64x2
00260 {
00261     typedef double lane_type;
00262     enum { nlanes = 2 };
00263 
00264     v_float64x2() {}
00265     explicit v_float64x2(float64x2_t v) : val(v) {}
00266     v_float64x2(double v0, double v1)
00267     {
00268         double v[] = {v0, v1};
00269         val = vld1q_f64(v);
00270     }
00271     double get0() const
00272     {
00273         return vgetq_lane_f64(val, 0);
00274     }
00275     float64x2_t val;
00276 };
00277 #endif
00278 
00279 #if defined (HAVE_FP16)
00280 // Workaround for old compilers
00281 template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
00282 { return (int16x4_t)a; }
00283 template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
00284 { return (float16x4_t)a; }
00285 template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
00286 { return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
00287 template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
00288 { vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }
00289 
00290 struct v_float16x4
00291 {
00292     typedef short lane_type;
00293     enum { nlanes = 4 };
00294 
00295     v_float16x4() {}
00296     explicit v_float16x4(float16x4_t v) : val(v) {}
00297     v_float16x4(short v0, short v1, short v2, short v3)
00298     {
00299         short v[] = {v0, v1, v2, v3};
00300         val = vld1_f16(v);
00301     }
00302     short get0() const
00303     {
00304         return vget_lane_s16(vreinterpret_s16_f16(val), 0);
00305     }
00306     float16x4_t val;
00307 };
00308 #endif
00309 
00310 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
00311 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
00312 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
00313 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
00314 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
00315 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
00316 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
00317 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
00318 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
00319 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
00320 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
00321 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
00322 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
00323 
00324 OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
00325 OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
00326 OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
00327 OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
00328 OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
00329 OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
00330 OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
00331 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
00332 OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
00333 #if CV_SIMD128_64F
00334 #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
00335 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
00336 OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
00337 OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
00338 OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
00339 OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
00340 OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
00341 OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
00342 OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
00343 OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
00344 OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
00345 OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
00346 OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
00347 #endif
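// The INIT macros above generate, for every register type, v_setzero_<suffix>()
// and v_setall_<suffix>(x) (both via vdupq_n_*) plus bitwise
// v_reinterpret_as_<suffix>() casts. Illustrative sketch:
//
//     v_float32x4 ones = v_setall_f32(1.f);            // broadcast 1.0f
//     v_int32x4   bits = v_reinterpret_as_s32(ones);   // each lane == 0x3f800000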
00348 
00349 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
00350 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
00351 { \
00352     hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
00353     return _Tpvec(vcombine_##suffix(a1, b1)); \
00354 } \
00355 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
00356 { \
00357     hreg a1 = vqmov##op##_##wsuffix(a.val); \
00358     vst1_##suffix(ptr, a1); \
00359 } \
00360 template<int n> inline \
00361 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
00362 { \
00363     hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
00364     hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
00365     return _Tpvec(vcombine_##suffix(a1, b1)); \
00366 } \
00367 template<int n> inline \
00368 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
00369 { \
00370     hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
00371     vst1_##suffix(ptr, a1); \
00372 }
00373 
00374 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
00375 OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
00376 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
00377 OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
00378 OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
00379 OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
00380 
00381 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
00382 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
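// v_pack (and v_pack_u for signed-to-unsigned) narrows two wide vectors into one
// with saturation via vqmovn/vqmovun, while the v_rshr_pack<n> forms first apply
// a rounding right shift by n bits (vqrshrn_n). Illustrative sketch:
//
//     v_uint16x8 w0 = v_setall_u16(300), w1 = v_setall_u16(5);
//     v_uint8x16 p  = v_pack(w0, w1);          // lanes 0..7 saturate to 255, lanes 8..15 == 5
//     v_uint8x16 q  = v_rshr_pack<2>(w0, w1);  // ((x + 2) >> 2) per lane, then saturate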
00383 
00384 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
00385                             const v_float32x4& m1, const v_float32x4& m2,
00386                             const v_float32x4& m3)
00387 {
00388     float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
00389     float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
00390     res = vmlaq_lane_f32(res, m1.val, vl, 1);
00391     res = vmlaq_lane_f32(res, m2.val, vh, 0);
00392     res = vmlaq_lane_f32(res, m3.val, vh, 1);
00393     return v_float32x4(res);
00394 }
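// v_matmul computes the 4x4 matrix-vector product res = m0*v[0] + m1*v[1] +
// m2*v[2] + m3*v[3], i.e. M*v with m0..m3 as the columns of M, using one
// vmulq_lane and three vmlaq_lane (multiply-accumulate by a single lane).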
00395 
00396 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
00397 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
00398 { \
00399     return _Tpvec(intrin(a.val, b.val)); \
00400 } \
00401 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
00402 { \
00403     a.val = intrin(a.val, b.val); \
00404     return a; \
00405 }
00406 
00407 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
00408 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
00409 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
00410 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
00411 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
00412 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
00413 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
00414 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
00415 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
00416 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
00417 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
00418 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
00419 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
00420 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
00421 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
00422 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
00423 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
00424 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
00425 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
00426 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
00427 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
00428 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
00429 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
00430 #if CV_SIMD128_64F
00431 OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
00432 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
00433 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
00434 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
00435 OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
00436 #else
00437 inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
00438 {
00439     float32x4_t reciprocal = vrecpeq_f32(b.val);
00440     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
00441     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
00442     return v_float32x4(vmulq_f32(a.val, reciprocal));
00443 }
00444 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
00445 {
00446     float32x4_t reciprocal = vrecpeq_f32(b.val);
00447     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
00448     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
00449     a.val = vmulq_f32(a.val, reciprocal);
00450     return a;
00451 }
00452 #endif
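// AArch32 NEON has no vdivq_f32, so the operator/ fallback above approximates
// 1/b with the vrecpeq_f32 estimate and refines it twice with vrecpsq_f32
// (each step computes r * (2 - b*r), one Newton-Raphson iteration), then
// multiplies by a. Two iterations give roughly single-precision accuracy for
// normalised inputs.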
00453 
00454 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
00455                          v_int32x4& c, v_int32x4& d)
00456 {
00457     c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
00458     d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
00459 }
00460 
00461 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
00462                          v_uint32x4& c, v_uint32x4& d)
00463 {
00464     c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
00465     d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
00466 }
00467 
00468 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
00469                          v_uint64x2& c, v_uint64x2& d)
00470 {
00471     c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
00472     d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
00473 }
00474 
00475 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
00476 {
00477     int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
00478     int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
00479     int32x4x2_t cd = vuzpq_s32(c, d);
00480     return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
00481 }
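// v_dotprod multiplies corresponding s16 lanes and adds adjacent products into
// s32 lanes: c[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1]. Illustrative sketch:
//
//     v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//     v_int32x4 c = v_dotprod(a, b);   // every lane == 3*4 + 3*4 == 24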
00482 
00483 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
00484     OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
00485     OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
00486     OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
00487     inline _Tpvec operator ~ (const _Tpvec& a) \
00488     { \
00489         return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
00490     }
00491 
00492 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
00493 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
00494 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
00495 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
00496 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
00497 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
00498 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
00499 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
00500 
00501 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
00502 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
00503 { \
00504     return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
00505 } \
00506 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
00507 { \
00508     a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
00509     return a; \
00510 }
00511 
00512 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
00513 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
00514 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
00515 
00516 inline v_float32x4 operator ~ (const v_float32x4& a)
00517 {
00518     return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
00519 }
00520 
00521 #if CV_SIMD128_64F
00522 inline v_float32x4 v_sqrt(const v_float32x4& x)
00523 {
00524     return v_float32x4(vsqrtq_f32(x.val));
00525 }
00526 
00527 inline v_float32x4 v_invsqrt(const v_float32x4& x)
00528 {
00529     v_float32x4 one = v_setall_f32(1.0f);
00530     return one / v_sqrt(x);
00531 }
00532 #else
00533 inline v_float32x4 v_sqrt(const v_float32x4& x)
00534 {
00535     float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
00536     float32x4_t e = vrsqrteq_f32(x1);
00537     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
00538     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
00539     return v_float32x4(vmulq_f32(x.val, e));
00540 }
00541 
00542 inline v_float32x4 v_invsqrt(const v_float32x4& x)
00543 {
00544     float32x4_t e = vrsqrteq_f32(x.val);
00545     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
00546     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
00547     return v_float32x4(e);
00548 }
00549 #endif
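// Without vsqrtq_f32 (AArch32), v_invsqrt starts from the vrsqrteq_f32 estimate
// of 1/sqrt(x) and refines it with two vrsqrtsq_f32 Newton steps; v_sqrt reuses
// the same refinement and multiplies the result by x, clamping the input to
// FLT_MIN first so that x == 0 does not yield 0 * inf = NaN.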
00550 
00551 #define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
00552 inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
00553 
00554 OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
00555 OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
00556 OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
00557 
00558 inline v_float32x4 v_abs(v_float32x4 x)
00559 { return v_float32x4(vabsq_f32(x.val)); }
00560 
00561 #if CV_SIMD128_64F
00562 #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
00563 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
00564 { \
00565     return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
00566 } \
00567 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
00568 { \
00569     a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
00570     return a; \
00571 }
00572 
00573 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
00574 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
00575 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
00576 
00577 inline v_float64x2 operator ~ (const v_float64x2& a)
00578 {
00579     return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
00580 }
00581 
00582 inline v_float64x2 v_sqrt(const v_float64x2& x)
00583 {
00584     return v_float64x2(vsqrtq_f64(x.val));
00585 }
00586 
00587 inline v_float64x2 v_invsqrt(const v_float64x2& x)
00588 {
00589     v_float64x2 one = v_setall_f64(1.0);
00590     return one / v_sqrt(x);
00591 }
00592 
00593 inline v_float64x2 v_abs(v_float64x2 x)
00594 { return v_float64x2(vabsq_f64(x.val)); }
00595 #endif
00596 
00597 // TODO: exp, log, sin, cos
00598 
00599 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
00600 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
00601 { \
00602     return _Tpvec(intrin(a.val, b.val)); \
00603 }
00604 
00605 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
00606 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
00607 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
00608 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
00609 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
00610 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
00611 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
00612 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
00613 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
00614 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
00615 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
00616 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
00617 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
00618 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
00619 #if CV_SIMD128_64F
00620 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
00621 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
00622 #endif
00623 
00624 #if CV_SIMD128_64F
00625 inline int64x2_t vmvnq_s64(int64x2_t a)
00626 {
00627     int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
00628     return veorq_s64(a, vx);
00629 }
00630 inline uint64x2_t vmvnq_u64(uint64x2_t a)
00631 {
00632     uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
00633     return veorq_u64(a, vx);
00634 }
00635 #endif
00636 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
00637 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
00638 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
00639 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
00640 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
00641 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
00642 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
00643 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
00644 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
00645 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
00646 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
00647 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
00648 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
00649 
00650 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
00651 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
00652 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
00653 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
00654 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
00655 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
00656 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
00657 #if CV_SIMD128_64F
00658 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
00659 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
00660 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
00661 #endif
00662 
00663 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
00664 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
00665 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
00666 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
00667 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
00668 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
00669 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
00670 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
00671 
00672 // TODO: absdiff for signed integers
00673 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
00674 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
00675 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
00676 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
00677 #if CV_SIMD128_64F
00678 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
00679 #endif
00680 
00681 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
00682 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
00683 { \
00684     return _Tpvec2(cast(intrin(a.val, b.val))); \
00685 }
00686 
00687 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
00688 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
00689 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
00690 
00691 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
00692 {
00693     v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
00694     return v_sqrt(x);
00695 }
00696 
00697 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
00698 {
00699     return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
00700 }
00701 
00702 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
00703 {
00704     return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
00705 }
00706 
00707 #if CV_SIMD128_64F
00708 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
00709 {
00710     v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
00711     return v_sqrt(x);
00712 }
00713 
00714 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
00715 {
00716     return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
00717 }
00718 
00719 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
00720 {
00721     return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
00722 }
00723 #endif
00724 
00725 // trade efficiency for convenience
00726 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
00727 inline _Tpvec operator << (const _Tpvec& a, int n) \
00728 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
00729 inline _Tpvec operator >> (const _Tpvec& a, int n) \
00730 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
00731 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
00732 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
00733 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
00734 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
00735 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
00736 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
00737 
00738 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
00739 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
00740 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
00741 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
00742 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
00743 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
00744 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
00745 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
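// The run-time << and >> above go through vshlq_* with a broadcast shift count
// (negated for right shifts), while the v_shl/v_shr/v_rshr templates map to the
// immediate-count instructions. Illustrative sketch:
//
//     v_uint32x4 x = v_setall_u32(0x10);
//     v_uint32x4 a = x << 4;        // vshlq_u32 by +4  -> 0x100 per lane
//     v_uint32x4 b = v_shr<4>(x);   // vshrq_n_u32 by 4 -> 0x1 per lane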
00746 
00747 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
00748 inline _Tpvec v_load(const _Tp* ptr) \
00749 { return _Tpvec(vld1q_##suffix(ptr)); } \
00750 inline _Tpvec v_load_aligned(const _Tp* ptr) \
00751 { return _Tpvec(vld1q_##suffix(ptr)); } \
00752 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
00753 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
00754 inline void v_store(_Tp* ptr, const _Tpvec& a) \
00755 { vst1q_##suffix(ptr, a.val); } \
00756 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
00757 { vst1q_##suffix(ptr, a.val); } \
00758 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
00759 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
00760 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
00761 { vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
00762 
00763 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
00764 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
00765 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
00766 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
00767 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
00768 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
00769 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
00770 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
00771 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
00772 #if CV_SIMD128_64F
00773 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
00774 #endif
00775 
00776 #if defined (HAVE_FP16)
00777 // Workaround for old compilers
00778 inline v_float16x4 v_load_f16(const short* ptr)
00779 { return v_float16x4(vld1_f16(ptr)); }
00780 inline void v_store_f16(short* ptr, v_float16x4& a)
00781 { vst1_f16(ptr, a.val); }
00782 #endif
00783 
00784 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
00785 inline scalartype v_reduce_##func(const _Tpvec& a) \
00786 { \
00787     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
00788     a0 = vp##vectorfunc##_##suffix(a0, a0); \
00789     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
00790 }
00791 
00792 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
00793 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
00794 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
00795 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
00796 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
00797 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
00798 
00799 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
00800 inline scalartype v_reduce_##func(const _Tpvec& a) \
00801 { \
00802     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
00803     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
00804 }
00805 
00806 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
00807 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
00808 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
00809 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
00810 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
00811 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
00812 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
00813 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
00814 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
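// v_reduce_sum/min/max fold a register into a single scalar with pairwise NEON
// ops (vpadd/vpmin/vpmax), two levels for 4-lane types and three for 8-lane
// types. Illustrative sketch:
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//     float s = v_reduce_sum(v);   // 10.f
//     float m = v_reduce_max(v);   // 4.f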
00815 
00816 inline int v_signmask(const v_uint8x16& a)
00817 {
00818     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
00819     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
00820     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
00821     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
00822 }
00823 inline int v_signmask(const v_int8x16& a)
00824 { return v_signmask(v_reinterpret_as_u8(a)); }
00825 
00826 inline int v_signmask(const v_uint16x8& a)
00827 {
00828     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
00829     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
00830     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
00831     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
00832 }
00833 inline int v_signmask(const v_int16x8& a)
00834 { return v_signmask(v_reinterpret_as_u16(a)); }
00835 
00836 inline int v_signmask(const v_uint32x4& a)
00837 {
00838     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
00839     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
00840     uint64x2_t v1 = vpaddlq_u32(v0);
00841     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
00842 }
00843 inline int v_signmask(const v_int32x4& a)
00844 { return v_signmask(v_reinterpret_as_u32(a)); }
00845 inline int v_signmask(const v_float32x4& a)
00846 { return v_signmask(v_reinterpret_as_u32(a)); }
00847 #if CV_SIMD128_64F
00848 inline int v_signmask(const v_uint64x2& a)
00849 {
00850     int64x1_t m0 = vdup_n_s64(0);
00851     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
00852     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
00853 }
00854 inline int v_signmask(const v_float64x2& a)
00855 { return v_signmask(v_reinterpret_as_u64(a)); }
00856 #endif
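// v_signmask collects the sign (top) bit of every lane into the low bits of an
// int, with lane 0 in bit 0 (the SSE movemask convention): each lane's sign bit
// is shifted down to bit 0, weighted by its lane index with a per-lane left
// shift, and the two halves of the register are then summed and recombined.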
00857 
00858 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
00859 inline bool v_check_all(const v_##_Tpvec& a) \
00860 { \
00861     _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
00862     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
00863     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
00864 } \
00865 inline bool v_check_any(const v_##_Tpvec& a) \
00866 { \
00867     _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
00868     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
00869     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
00870 }
00871 
00872 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
00873 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
00874 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
00875 #if CV_SIMD128_64F
00876 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
00877 #endif
00878 
00879 inline bool v_check_all(const v_int8x16& a)
00880 { return v_check_all(v_reinterpret_as_u8(a)); }
00881 inline bool v_check_all(const v_int16x8& a)
00882 { return v_check_all(v_reinterpret_as_u16(a)); }
00883 inline bool v_check_all(const v_int32x4& a)
00884 { return v_check_all(v_reinterpret_as_u32(a)); }
00885 inline bool v_check_all(const v_float32x4& a)
00886 { return v_check_all(v_reinterpret_as_u32(a)); }
00887 
00888 inline bool v_check_any(const v_int8x16& a)
00889 { return v_check_any(v_reinterpret_as_u8(a)); }
00890 inline bool v_check_any(const v_int16x8& a)
00891 { return v_check_any(v_reinterpret_as_u16(a)); }
00892 inline bool v_check_any(const v_int32x4& a)
00893 { return v_check_any(v_reinterpret_as_u32(a)); }
00894 inline bool v_check_any(const v_float32x4& a)
00895 { return v_check_any(v_reinterpret_as_u32(a)); }
00896 
00897 #if CV_SIMD128_64F
00898 inline bool v_check_all(const v_int64x2& a)
00899 { return v_check_all(v_reinterpret_as_u64(a)); }
00900 inline bool v_check_all(const v_float64x2& a)
00901 { return v_check_all(v_reinterpret_as_u64(a)); }
00902 inline bool v_check_any(const v_int64x2& a)
00903 { return v_check_any(v_reinterpret_as_u64(a)); }
00904 inline bool v_check_any(const v_float64x2& a)
00905 { return v_check_any(v_reinterpret_as_u64(a)); }
00906 #endif
00907 
00908 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
00909 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
00910 { \
00911     return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
00912 }
00913 
00914 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
00915 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
00916 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
00917 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
00918 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
00919 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
00920 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
00921 #if CV_SIMD128_64F
00922 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
00923 #endif
00924 
00925 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
00926 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
00927 { \
00928     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
00929     b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
00930 } \
00931 inline _Tpwvec v_load_expand(const _Tp* ptr) \
00932 { \
00933     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
00934 }
00935 
00936 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
00937 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
00938 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
00939 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
00940 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
00941 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
00942 
00943 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
00944 {
00945     uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
00946     uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
00947     return v_uint32x4(vmovl_u16(v1));
00948 }
00949 
00950 inline v_int32x4 v_load_expand_q(const schar* ptr)
00951 {
00952     int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
00953     int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
00954     return v_int32x4(vmovl_s16(v1));
00955 }
00956 
00957 #if defined(__aarch64__)
00958 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
00959 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
00960 { \
00961     b0.val = vzip1q_##suffix(a0.val, a1.val); \
00962     b1.val = vzip2q_##suffix(a0.val, a1.val); \
00963 } \
00964 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
00965 { \
00966     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
00967 } \
00968 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
00969 { \
00970     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
00971 } \
00972 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
00973 { \
00974     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
00975     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
00976 }
00977 #else
00978 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
00979 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
00980 { \
00981     _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
00982     b0.val = p.val[0]; \
00983     b1.val = p.val[1]; \
00984 } \
00985 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
00986 { \
00987     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
00988 } \
00989 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
00990 { \
00991     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
00992 } \
00993 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
00994 { \
00995     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
00996     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
00997 }
00998 #endif
00999 
01000 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
01001 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
01002 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
01003 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
01004 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
01005 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
01006 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
01007 #if CV_SIMD128_64F
01008 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
01009 #endif
01010 
01011 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
01012 template <int s> \
01013 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
01014 { \
01015     return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
01016 }
01017 
01018 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
01019 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
01020 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
01021 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
01022 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
01023 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
01024 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
01025 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
01026 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
01027 #if CV_SIMD128_64F
01028 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
01029 #endif
01030 
01031 inline v_int32x4 v_round(const v_float32x4& a)
01032 {
01033     static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
01034         v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
01035 
01036     int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
01037     return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
01038 }
01039 
01040 inline v_int32x4 v_floor(const v_float32x4& a)
01041 {
01042     int32x4_t a1 = vcvtq_s32_f32(a.val);
01043     uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
01044     return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
01045 }
01046 
01047 inline v_int32x4 v_ceil(const v_float32x4& a)
01048 {
01049     int32x4_t a1 = vcvtq_s32_f32(a.val);
01050     uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
01051     return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
01052 }
01053 
01054 inline v_int32x4 v_trunc(const v_float32x4& a)
01055 { return v_int32x4(vcvtq_s32_f32(a.val)); }
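// vcvtq_s32_f32 truncates toward zero, so v_round first adds +/-0.5 with the
// sign copied from the input (via the 1<<31 mask), while v_floor and v_ceil
// convert, compare the converted value back against the input, and add (floor)
// or subtract (ceil) the resulting 0/-1 mask. Illustrative sketch:
//
//     v_int32x4 r = v_round(v_setall_f32(-2.5f));   // lanes == -3
//     v_int32x4 f = v_floor(v_setall_f32( 1.9f));   // lanes ==  1
//     v_int32x4 c = v_ceil (v_setall_f32( 1.1f));   // lanes ==  2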
01056 
01057 #if CV_SIMD128_64F
01058 inline v_int32x4 v_round(const v_float64x2& a)
01059 {
01060     static const int32x2_t zero = vdup_n_s32(0);
01061     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
01062 }
01063 
01064 inline v_int32x4 v_floor(const v_float64x2& a)
01065 {
01066     static const int32x2_t zero = vdup_n_s32(0);
01067     int64x2_t a1 = vcvtq_s64_f64(a.val);
01068     uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
01069     a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
01070     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
01071 }
01072 
01073 inline v_int32x4 v_ceil(const v_float64x2& a)
01074 {
01075     static const int32x2_t zero = vdup_n_s32(0);
01076     int64x2_t a1 = vcvtq_s64_f64(a.val);
01077     uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
01078     a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
01079     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
01080 }
01081 
01082 inline v_int32x4 v_trunc(const v_float64x2& a)
01083 {
01084     static const int32x2_t zero = vdup_n_s32(0);
01085     return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero));
01086 }
01087 #endif
01088 
01089 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
01090 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
01091                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
01092                          v_##_Tpvec& b0, v_##_Tpvec& b1, \
01093                          v_##_Tpvec& b2, v_##_Tpvec& b3) \
01094 { \
01095     /* m00 m01 m02 m03 */ \
01096     /* m10 m11 m12 m13 */ \
01097     /* m20 m21 m22 m23 */ \
01098     /* m30 m31 m32 m33 */ \
01099     _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
01100     _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
01101     /* m00 m10 m02 m12 */ \
01102     /* m01 m11 m03 m13 */ \
01103     /* m20 m30 m22 m32 */ \
01104     /* m21 m31 m23 m33 */ \
01105     b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
01106     b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
01107     b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
01108     b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
01109 }
01110 
01111 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
01112 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
01113 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
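// v_transpose4x4 transposes a 4x4 block held in four registers: two vtrnq steps
// interleave pairs of rows, and the vcombine/vget_low/vget_high recombination
// finishes the job, so b0..b3 receive the columns of a0..a3.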
01114 
01115 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
01116 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
01117 { \
01118     _Tpvec##x2_t v = vld2q_##suffix(ptr); \
01119     a.val = v.val[0]; \
01120     b.val = v.val[1]; \
01121 } \
01122 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
01123 { \
01124     _Tpvec##x3_t v = vld3q_##suffix(ptr); \
01125     a.val = v.val[0]; \
01126     b.val = v.val[1]; \
01127     c.val = v.val[2]; \
01128 } \
01129 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
01130                                 v_##_Tpvec& c, v_##_Tpvec& d) \
01131 { \
01132     _Tpvec##x4_t v = vld4q_##suffix(ptr); \
01133     a.val = v.val[0]; \
01134     b.val = v.val[1]; \
01135     c.val = v.val[2]; \
01136     d.val = v.val[3]; \
01137 } \
01138 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
01139 { \
01140     _Tpvec##x2_t v; \
01141     v.val[0] = a.val; \
01142     v.val[1] = b.val; \
01143     vst2q_##suffix(ptr, v); \
01144 } \
01145 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
01146 { \
01147     _Tpvec##x3_t v; \
01148     v.val[0] = a.val; \
01149     v.val[1] = b.val; \
01150     v.val[2] = c.val; \
01151     vst3q_##suffix(ptr, v); \
01152 } \
01153 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
01154                                const v_##_Tpvec& c, const v_##_Tpvec& d) \
01155 { \
01156     _Tpvec##x4_t v; \
01157     v.val[0] = a.val; \
01158     v.val[1] = b.val; \
01159     v.val[2] = c.val; \
01160     v.val[3] = d.val; \
01161     vst4q_##suffix(ptr, v); \
01162 }
01163 
01164 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
01165 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
01166 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
01167 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
01168 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
01169 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
01170 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
01171 #if CV_SIMD128_64F
01172 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
01173 #endif
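// These helpers wrap vld2q/vld3q/vld4q and vst2q/vst3q/vst4q, which de-interleave
// or re-interleave channel-packed data in a single instruction. Illustrative
// sketch for 16 packed 3-channel 8-bit pixels (bgr is assumed to be a uchar*
// with at least 48 valid bytes):
//
//     v_uint8x16 b, g, r;
//     v_load_deinterleave(bgr, b, g, r);   // 48 interleaved bytes -> 3 planes
//     v_store_interleave(bgr, b, g, r);    // and back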
01174 
01175 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
01176 {
01177     return v_float32x4(vcvtq_f32_s32(a.val));
01178 }
01179 
01180 #if CV_SIMD128_64F
01181 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
01182 {
01183     float32x2_t zero = vdup_n_f32(0.0f);
01184     return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
01185 }
01186 
01187 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
01188 {
01189     return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
01190 }
01191 
01192 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
01193 {
01194     return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
01195 }
01196 
01197 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
01198 {
01199     return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
01200 }
01201 
01202 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
01203 {
01204     return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
01205 }
01206 #endif
01207 
01208 #if defined (HAVE_FP16)
01209 inline v_float32x4 v_cvt_f32(const v_float16x4& a)
01210 {
01211     return v_float32x4(vcvt_f32_f16(a.val));
01212 }
01213 
01214 inline v_float16x4 v_cvt_f16(const v_float32x4& a)
01215 {
01216     return v_float16x4(vcvt_f16_f32(a.val));
01217 }
01218 #endif
01219 
01220 //! @name Check SIMD support
01221 //! @{
01222 //! @brief Check CPU capability of SIMD operation
01223 static inline bool hasSIMD128()
01224 {
01225     return checkHardwareSupport(CV_CPU_NEON);
01226 }
01227 
01228 //! @}
01229 
01230 //! @endcond
01231 
01232 }
01233 
01234 #endif