Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do

intrin_neon.hpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
#define __OPENCV_HAL_INTRIN_NEON_HPP__

#include <algorithm>
// The NEON intrinsics (arm_neon.h) and FLT_MIN (float.h, used by v_sqrt) are
// normally provided by the parent intrin.hpp before this header is included;
// they are repeated here so the file is self-contained. The uchar/schar/
// uint64/int64 typedefs and the CV_* macros come from the OpenCV base headers.
#include <arm_neon.h>
#include <float.h>

namespace cv
{

//! @cond IGNORED

#define CV_SIMD128 1

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }
    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }
    float32x4_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }
    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }
    int64x2_t val;
};
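
// Usage sketch (illustrative comment, not part of the original header): each
// v_* struct wraps a single 128-bit NEON Q register, so the lane count is
// 128 / (8 * sizeof(lane_type)).
//
//     v_int32x4 a(1, 2, 3, 4);   // four 32-bit lanes
//     int first = a.get0();      // 1, the lowest lane
//     int32x4_t raw = a.val;     // the underlying NEON register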

#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
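
// Usage sketch (illustrative): the generated v_setzero_*/v_setall_* helpers
// broadcast a scalar, and the v_reinterpret_as_* casts are zero-cost views
// of the same register bits.
//
//     v_float32x4 f = v_setall_f32(-0.f);
//     v_uint32x4 bits = v_reinterpret_as_u32(f); // every lane = 0x80000000
//     v_int16x8 z = v_setzero_s16();             // all lanes 0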

#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
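
// Usage sketch (illustrative): v_pack narrows two wide vectors into one with
// saturation (vqmovn), v_pack_u narrows signed lanes to unsigned (vqmovun),
// and the v_rshr_* variants apply a rounding right shift by n first.
//
//     v_int16x8 w0 = v_setall_s16(300), w1 = v_setall_s16(-5);
//     v_uint8x16 p = v_pack_u(w0, w1); // first 8 lanes clamp to 255, last 8 to 0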

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}
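
// Usage sketch (illustrative): v_matmul returns m0*v[0] + m1*v[1] + m2*v[2]
// + m3*v[3], i.e. a 4x4 matrix given by columns m0..m3 times the vector v.
//
//     v_float32x4 c0(1, 0, 0, 0), c1(0, 1, 0, 0), c2(0, 0, 1, 0), c3(0, 0, 0, 1);
//     v_float32x4 v(1, 2, 3, 4);
//     v_float32x4 r = v_matmul(v, c0, c1, c2, c3); // identity matrix: r == v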

#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
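
// Note (illustrative): + and - on the 8- and 16-bit types map to the
// saturating vqaddq/vqsubq intrinsics, while the 32/64-bit and float types
// use plain modular/IEEE arithmetic.
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 s = a + b; // every lane saturates to 255, not (300 & 0xff) = 44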

inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
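
// Note (illustrative): ARMv7 NEON has no vector divide instruction, so
// operator/ multiplies by a vrecpeq_f32 reciprocal estimate refined with two
// vrecpsq_f32 Newton-Raphson steps; the quotient is a close approximation
// rather than an exactly rounded IEEE result.
//
//     v_float32x4 num = v_setall_f32(1.f), den = v_setall_f32(3.f);
//     v_float32x4 q = num / den; // each lane ~0.333333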

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vuzpq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
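
// Usage sketch (illustrative): v_dotprod multiplies adjacent lane pairs and
// sums each pair into a 32-bit lane: result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
//
//     v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
//     v_int16x8 b = v_setall_s16(1);
//     v_int32x4 d = v_dotprod(a, b); // lanes: 3, 7, 11, 15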

#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
    }

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}

inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
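
// Note (illustrative): like division, square root is emulated: v_sqrt
// computes x * (1/sqrt(x)) from a vrsqrteq_f32 estimate plus two vrsqrtsq_f32
// refinement steps. Clamping to FLT_MIN keeps the reciprocal estimate finite
// so that v_sqrt(0) yields 0 instead of 0 * inf = NaN.
//
//     v_float32x4 x(0.f, 1.f, 4.f, 9.f);
//     v_float32x4 r = v_sqrt(x); // approximately (0, 1, 2, 3)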

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)

#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)

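// Usage sketch (illustrative): the *_wrap variants use the plain modulo-2^n
// intrinsics, in contrast to the saturating +/- operators defined earlier.
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 w = v_add_wrap(a, b); // lanes wrap to 44; a + b would give 255
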
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)

#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
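
// Usage sketch (illustrative): the 4-lane reductions spill the register to an
// aligned buffer and fold it with the scalar operation.
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//     float s = v_reduce_sum(v); // 10.f
//     float m = v_reduce_max(v); // 4.f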

inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
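
// Usage sketch (illustrative): v_signmask packs the top bit of every lane
// into the low bits of an int, lowest lane first (like SSE movemask; the
// vcreate_* constants above build the per-lane shift amounts 0, 1, 2, ...).
//
//     v_float32x4 v(-1.f, 2.f, -3.f, 4.f);
//     int mask = v_signmask(v); // binary 0101 = 5 (lanes 0 and 2 negative)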

#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)

inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
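
// Usage sketch (illustrative): v_select is a branchless per-lane merge via
// vbslq; mask lanes must be all-ones or all-zeros, which is exactly what the
// comparison operators above produce.
//
//     v_int32x4 a(1, 2, 3, 4), b(4, 3, 2, 1);
//     v_int32x4 m = a > b;             // lanes: 0, 0, ~0, ~0
//     v_int32x4 r = v_select(m, a, b); // (4, 3, 3, 4), the per-lane maximum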

#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}

OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
    return v_uint32x4(vmovl_u16(v1));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    return v_int32x4(vmovl_s16(v1));
}

#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)

#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)

inline v_int32x4 v_round(const v_float32x4& a)
{
    // 0x80000000 = the IEEE float sign bit of each lane
    static const int32x4_t v_sign = vdupq_n_s32((int)0x80000000),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
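
// Behaviour sketch (illustrative): vcvtq_s32_f32 truncates toward zero, so
// v_round first adds 0.5 with the sign copied from each lane (round half away
// from zero), while v_floor/v_ceil correct the truncated value using the
// comparison mask, whose true lanes read as -1 when reinterpreted as s32.
//
//     v_float32x4 x(1.5f, -1.5f, 1.2f, -1.2f);
//     v_int32x4 r = v_round(x); // ( 2, -2,  1, -1)
//     v_int32x4 f = v_floor(x); // ( 1, -2,  1, -2)
//     v_int32x4 c = v_ceil(x);  // ( 2, -1,  2, -1)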

#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
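
// Usage sketch (illustrative): v_transpose4x4 transposes a 4x4 tile held in
// four registers using vtrnq plus half-register recombines.
//
//     v_int32x4 r0(0, 1, 2, 3), r1(4, 5, 6, 7), r2(8, 9, 10, 11), r3(12, 13, 14, 15);
//     v_int32x4 c0, c1, c2, c3;
//     v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3); // c0 == (0, 4, 8, 12), etc.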

#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}

OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
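
// Usage sketch (illustrative): the deinterleaving loads map straight onto
// vld3q/vld4q, the idiomatic NEON way to split packed pixels into planes.
//
//     uchar pixels[48]; // 16 BGR pixels, interleaved B0 G0 R0 B1 G1 R1 ...
//     v_uint8x16 bch, gch, rch;
//     v_load_deinterleave(pixels, bch, gch, rch); // bch = the 16 B bytes, etc.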

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
}

//! @endcond

}

#endif