Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do

arithm_simd.hpp Source File

arithm_simd.hpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                          License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
00016 // Copyright (C) 2015, Itseez Inc., all rights reserved.
00017 // Third party copyrights are property of their respective owners.
00018 //
00019 // Redistribution and use in source and binary forms, with or without modification,
00020 // are permitted provided that the following conditions are met:
00021 //
00022 //   * Redistribution's of source code must retain the above copyright notice,
00023 //     this list of conditions and the following disclaimer.
00024 //
00025 //   * Redistribution's in binary form must reproduce the above copyright notice,
00026 //     this list of conditions and the following disclaimer in the documentation
00027 //     and/or other materials provided with the distribution.
00028 //
00029 //   * The name of the copyright holders may not be used to endorse or promote products
00030 //     derived from this software without specific prior written permission.
00031 //
00032 // This software is provided by the copyright holders and contributors "as is" and
00033 // any express or implied warranties, including, but not limited to, the implied
00034 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00035 // In no event shall the Intel Corporation or contributors be liable for any direct,
00036 // indirect, incidental, special, exemplary, or consequential damages
00037 // (including, but not limited to, procurement of substitute goods or services;
00038 // loss of use, data, or profits; or business interruption) however caused
00039 // and on any theory of liability, whether in contract, strict liability,
00040 // or tort (including negligence or otherwise) arising in any way out of
00041 // the use of this software, even if advised of the possibility of such damage.
00042 //
00043 //M*/
00044 
00045 #ifndef __OPENCV_ARITHM_SIMD_HPP__
00046 #define __OPENCV_ARITHM_SIMD_HPP__
00047 
00048 namespace cv {
00049 
00050 struct NOP {};
00051 
00052 #if CV_SSE2 || CV_NEON
00053 #define IF_SIMD(op) op
00054 #else
00055 #define IF_SIMD(op) NOP
00056 #endif
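// IF_SIMD(op) lets arithmetic kernels name a vector functor unconditionally:
// with SSE2 or NEON available it expands to the functor itself (for example
// IF_SIMD(VAdd<uchar>) becomes VAdd<uchar>), otherwise to the empty NOP
// struct, which calling code can treat as "no vector path".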
00057 
00058 
00059 #if CV_SSE2 || CV_NEON
00060 
00061 #define FUNCTOR_TEMPLATE(name)          \
00062     template<typename T> struct name {}
00063 
00064 FUNCTOR_TEMPLATE(VLoadStore128);
00065 #if CV_SSE2
00066 FUNCTOR_TEMPLATE(VLoadStore64);
00067 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
00068 #if CV_AVX2
00069 FUNCTOR_TEMPLATE(VLoadStore256);
00070 FUNCTOR_TEMPLATE(VLoadStore256Aligned);
00071 #endif
00072 #endif
00073 
00074 #endif
00075 
00076 #if CV_AVX2
00077 
00078 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
00079     template <>                                                                                  \
00080     struct name<template_arg>{                                                                   \
00081         typedef register_type reg_type;                                                          \
00082         static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
00083         static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
00084     }
00085 
00086 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
00087     template <>                                                                     \
00088     struct name<template_arg>{                                                      \
00089         typedef register_type reg_type;                                             \
00090         static reg_type load(const template_arg * p) { return load_body (p); }      \
00091         static void store(template_arg * p, reg_type v) { store_body (p, v); }      \
00092     }
00093 
00094 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                         \
00095     template<>                                                                 \
00096     struct name<template_arg>                                                  \
00097     {                                                                          \
00098         VLoadStore256<template_arg>::reg_type operator()(                      \
00099                         const VLoadStore256<template_arg>::reg_type & a,       \
00100                         const VLoadStore256<template_arg>::reg_type & b) const \
00101         {                                                                      \
00102             body;                                                              \
00103         }                                                                      \
00104     }
00105 
00106 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                         \
00107     template<>                                                                 \
00108     struct name<template_arg>                                                  \
00109     {                                                                          \
00110         VLoadStore256<template_arg>::reg_type operator()(                      \
00111                         const VLoadStore256<template_arg>::reg_type & a,       \
00112                         const VLoadStore256<template_arg>::reg_type &  ) const \
00113         {                                                                      \
00114             body;                                                              \
00115         }                                                                      \
00116     }
00117 
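// The macros above stamp out explicit specializations: FUNCTOR_LOADSTORE(_CAST)
// wraps the unaligned/aligned AVX load and store intrinsics for one element
// type, and FUNCTOR_CLOSURE_1arg/2arg define callable functors over the
// corresponding register type. For illustration, the instantiation
// FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps(a, b)) below expands
// to roughly:
//
//   template<> struct VAdd<float>
//   {
//       VLoadStore256<float>::reg_type operator()(
//                       const VLoadStore256<float>::reg_type & a,
//                       const VLoadStore256<float>::reg_type & b) const
//       { return _mm256_add_ps(a, b); }
//   };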
00118 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
00119 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
00120 FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
00121 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
00122 FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
00123 FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
00124 FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );
00125 
00126 FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
00127 FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
00128 FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );
00129 
00130 FUNCTOR_TEMPLATE(VAdd);
00131 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
00132 FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
00133 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
00134 FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
00135 FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
00136 FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
00137 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));
00138 
00139 FUNCTOR_TEMPLATE(VSub);
00140 FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
00141 FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
00142 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
00143 FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
00144 FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
00145 FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
00146 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));
00147 
00148 FUNCTOR_TEMPLATE(VMin);
00149 FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
00150 FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
00151 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b));
00152 FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
00153 FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
00154 FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
00155 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));
00156 
00157 FUNCTOR_TEMPLATE(VMax);
00158 FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
00159 FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
00160 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
00161 FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
00162 FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
00163 FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
00164 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));
00165 
00166 
00167 static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
00168                                                            0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
00169 static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
00170                                                            0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
00171 
00172 FUNCTOR_TEMPLATE(VAbsDiff);
00173 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
00174         return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
00175     );
00176 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
00177         __m256i d = _mm256_subs_epi8(a, b);
00178         __m256i m = _mm256_cmpgt_epi8(b, a);
00179         return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
00180     );
00181 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
00182         return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
00183     );
00184 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
00185         __m256i M = _mm256_max_epi16(a, b);
00186         __m256i m = _mm256_min_epi16(a, b);
00187         return _mm256_subs_epi16(M, m);
00188     );
00189 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
00190         __m256i d = _mm256_sub_epi32(a, b);
00191         __m256i m = _mm256_cmpgt_epi32(b, a);
00192         return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
00193     );
00194 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
00195         return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
00196     );
00197 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
00198         return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
00199     );
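// Notes on the absolute-difference closures above: for the unsigned types the
// sum of the two saturating subtractions (a -| b) + (b -| a) equals |a - b|,
// since one of the terms is always zero. For schar/int, m is all ones where
// b > a, and (d ^ m) - m negates the difference exactly where it is negative.
// For float/double the subtraction result is ANDed with v32f_absmask /
// v64f_absmask, which clear the IEEE 754 sign bit of each lane (the 64-bit
// mask is written as two 32-bit words per element, little-endian).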
00200 
00201 FUNCTOR_TEMPLATE(VAnd);
00202 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
00203 FUNCTOR_TEMPLATE(VOr);
00204 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
00205 FUNCTOR_TEMPLATE(VXor);
00206 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
00207 FUNCTOR_TEMPLATE(VNot);
00208 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
00209 
00210 #elif CV_SSE2
00211 
00212 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
00213     template <>                                                                                  \
00214     struct name<template_arg>{                                                                   \
00215         typedef register_type reg_type;                                                          \
00216         static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
00217         static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
00218     }
00219 
00220 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
00221     template <>                                                                \
00222     struct name<template_arg>{                                                 \
00223         typedef register_type reg_type;                                        \
00224         static reg_type load(const template_arg * p) { return load_body (p); } \
00225         static void store(template_arg * p, reg_type v) { store_body (p, v); } \
00226     }
00227 
00228 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
00229     template<>                                                                 \
00230     struct name<template_arg>                                                  \
00231     {                                                                          \
00232         VLoadStore128<template_arg>::reg_type operator()(                      \
00233                         const VLoadStore128<template_arg>::reg_type & a,       \
00234                         const VLoadStore128<template_arg>::reg_type & b) const \
00235         {                                                                      \
00236             body;                                                              \
00237         }                                                                      \
00238     }
00239 
00240 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
00241     template<>                                                                 \
00242     struct name<template_arg>                                                  \
00243     {                                                                          \
00244         VLoadStore128<template_arg>::reg_type operator()(                      \
00245                         const VLoadStore128<template_arg>::reg_type & a,       \
00246                         const VLoadStore128<template_arg>::reg_type &  ) const \
00247         {                                                                      \
00248             body;                                                              \
00249         }                                                                      \
00250     }
00251 
00252 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
00253 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
00254 FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
00255 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
00256 FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
00257 FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
00258 FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );
00259 
00260 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
00261 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
00262 FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
00263 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
00264 
00265 FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
00266 FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
00267 FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );
00268 
00269 FUNCTOR_TEMPLATE(VAdd);
00270 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
00271 FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
00272 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
00273 FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
00274 FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
00275 FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
00276 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));
00277 
00278 FUNCTOR_TEMPLATE(VSub);
00279 FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
00280 FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
00281 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
00282 FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
00283 FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
00284 FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
00285 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));
00286 
00287 FUNCTOR_TEMPLATE(VMin);
00288 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
00289 FUNCTOR_CLOSURE_2arg(VMin, schar,
00290         __m128i m = _mm_cmpgt_epi8(a, b);
00291         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
00292     );
00293 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
00294 FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm_min_epi16(a, b));
00295 FUNCTOR_CLOSURE_2arg(VMin,    int,
00296         __m128i m = _mm_cmpgt_epi32(a, b);
00297         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
00298     );
00299 FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm_min_ps(a, b));
00300 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
00301 
00302 FUNCTOR_TEMPLATE(VMax);
00303 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
00304 FUNCTOR_CLOSURE_2arg(VMax, schar,
00305         __m128i m = _mm_cmpgt_epi8(b, a);
00306         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
00307     );
00308 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
00309 FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm_max_epi16(a, b));
00310 FUNCTOR_CLOSURE_2arg(VMax,    int,
00311         __m128i m = _mm_cmpgt_epi32(b, a);
00312         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
00313     );
00314 FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm_max_ps(a, b));
00315 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
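// SSE2 only provides native min/max for uchar (epu8) and short (epi16); the
// closures above emulate the rest: for schar/int the cmpgt mask m blends in b
// via a ^ ((a ^ b) & m), and for ushort the saturating identities
// min(a, b) = a -| (a -| b) and max(a, b) = (a -| b) +| b avoid the unsigned
// 16-bit comparisons that only arrive with SSE4.1.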
00316 
00317 
00318 static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
00319 static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
00320 
00321 FUNCTOR_TEMPLATE(VAbsDiff);
00322 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
00323         return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
00324     );
00325 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
00326         __m128i d = _mm_subs_epi8(a, b);
00327         __m128i m = _mm_cmpgt_epi8(b, a);
00328         return _mm_subs_epi8(_mm_xor_si128(d, m), m);
00329     );
00330 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
00331         return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
00332     );
00333 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
00334         __m128i M = _mm_max_epi16(a, b);
00335         __m128i m = _mm_min_epi16(a, b);
00336         return _mm_subs_epi16(M, m);
00337     );
00338 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
00339         __m128i d = _mm_sub_epi32(a, b);
00340         __m128i m = _mm_cmpgt_epi32(b, a);
00341         return _mm_sub_epi32(_mm_xor_si128(d, m), m);
00342     );
00343 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
00344         return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
00345     );
00346 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
00347         return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
00348     );
00349 
00350 FUNCTOR_TEMPLATE(VAnd);
00351 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
00352 FUNCTOR_TEMPLATE(VOr);
00353 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
00354 FUNCTOR_TEMPLATE(VXor);
00355 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
00356 FUNCTOR_TEMPLATE(VNot);
00357 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
00358 #endif
00359 
00360 #if CV_NEON
00361 
00362 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
00363     template <>                                                                \
00364     struct name<template_arg>{                                                 \
00365         typedef register_type reg_type;                                        \
00366         static reg_type load(const template_arg * p) { return load_body (p);}; \
00367         static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
00368     }
00369 
00370 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
00371     template<>                                                         \
00372     struct name<template_arg>                                          \
00373     {                                                                  \
00374         VLoadStore128<template_arg>::reg_type operator()(              \
00375                         VLoadStore128<template_arg>::reg_type a,       \
00376                         VLoadStore128<template_arg>::reg_type b) const \
00377         {                                                              \
00378             return body;                                               \
00379         };                                                             \
00380     }
00381 
00382 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
00383     template<>                                                         \
00384     struct name<template_arg>                                          \
00385     {                                                                  \
00386         VLoadStore128<template_arg>::reg_type operator()(              \
00387                         VLoadStore128<template_arg>::reg_type a,       \
00388                         VLoadStore128<template_arg>::reg_type  ) const \
00389         {                                                              \
00390             return body;                                               \
00391         };                                                             \
00392     }
00393 
00394 FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
00395 FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
00396 FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
00397 FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
00398 FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
00399 FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);
00400 
00401 FUNCTOR_TEMPLATE(VAdd);
00402 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
00403 FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
00404 FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
00405 FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
00406 FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
00407 FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));
00408 
00409 FUNCTOR_TEMPLATE(VSub);
00410 FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
00411 FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
00412 FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
00413 FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
00414 FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
00415 FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));
00416 
00417 FUNCTOR_TEMPLATE(VMin);
00418 FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
00419 FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
00420 FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
00421 FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
00422 FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
00423 FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));
00424 
00425 FUNCTOR_TEMPLATE(VMax);
00426 FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
00427 FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
00428 FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
00429 FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
00430 FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
00431 FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));
00432 
00433 FUNCTOR_TEMPLATE(VAbsDiff);
00434 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
00435 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
00436 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
00437 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
00438 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
00439 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));
00440 
00441 FUNCTOR_TEMPLATE(VAnd);
00442 FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
00443 FUNCTOR_TEMPLATE(VOr);
00444 FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
00445 FUNCTOR_TEMPLATE(VXor);
00446 FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
00447 FUNCTOR_TEMPLATE(VNot);
00448 FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a   ));
00449 #endif
00450 
00451 
00452 template <typename T>
00453 struct Cmp_SIMD
00454 {
00455     explicit Cmp_SIMD(int)
00456     {
00457     }
00458 
00459     int operator () (const T *, const T *, uchar *, int) const
00460     {
00461         return 0;
00462     }
00463 };
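// The Cmp_SIMD specializations below vectorize only CMP_GT, CMP_LE, CMP_EQ
// and CMP_NE; CMP_LT and CMP_GE are expected to be reduced to these by the
// caller swapping the operands. Each functor returns how many elements it
// handled, and the surrounding kernel finishes the rest with scalar code.
// A rough sketch of that calling pattern (the scalar tail is only
// illustrative, shown here for CMP_GT):
//
//   Cmp_SIMD<int> vop(code);
//   int x = vop(src1, src2, dst, width);              // vectorized prefix
//   for( ; x < width; x++ )                           // scalar tail
//       dst[x] = (uchar)(src1[x] > src2[x] ? 255 : 0);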
00464 
00465 #if CV_NEON
00466 
00467 template <>
00468 struct Cmp_SIMD<schar>
00469 {
00470     explicit Cmp_SIMD(int code_) :
00471         code(code_)
00472     {
00473         // CV_Assert(code == CMP_GT || code == CMP_LE ||
00474         //           code == CMP_EQ || code == CMP_NE);
00475 
00476         v_mask = vdupq_n_u8(255);
00477     }
00478 
00479     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
00480     {
00481         int x = 0;
00482 
00483         if (code == CMP_GT)
00484             for ( ; x <= width - 16; x += 16)
00485                 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
00486         else if (code == CMP_LE)
00487             for ( ; x <= width - 16; x += 16)
00488                 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
00489         else if (code == CMP_EQ)
00490             for ( ; x <= width - 16; x += 16)
00491                 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
00492         else if (code == CMP_NE)
00493             for ( ; x <= width - 16; x += 16)
00494                 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
00495 
00496         return x;
00497     }
00498 
00499     int code;
00500     uint8x16_t v_mask;
00501 };
00502 
00503 template <>
00504 struct Cmp_SIMD<ushort>
00505 {
00506     explicit Cmp_SIMD(int code_) :
00507         code(code_)
00508     {
00509         // CV_Assert(code == CMP_GT || code == CMP_LE ||
00510         //           code == CMP_EQ || code == CMP_NE);
00511 
00512         v_mask = vdup_n_u8(255);
00513     }
00514 
00515     int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
00516     {
00517         int x = 0;
00518 
00519         if (code == CMP_GT)
00520             for ( ; x <= width - 8; x += 8)
00521             {
00522                 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
00523                 vst1_u8(dst + x, vmovn_u16(v_dst));
00524             }
00525         else if (code == CMP_LE)
00526             for ( ; x <= width - 8; x += 8)
00527             {
00528                 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
00529                 vst1_u8(dst + x, vmovn_u16(v_dst));
00530             }
00531         else if (code == CMP_EQ)
00532             for ( ; x <= width - 8; x += 8)
00533             {
00534                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
00535                 vst1_u8(dst + x, vmovn_u16(v_dst));
00536             }
00537         else if (code == CMP_NE)
00538             for ( ; x <= width - 8; x += 8)
00539             {
00540                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
00541                 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
00542             }
00543 
00544         return x;
00545     }
00546 
00547     int code;
00548     uint8x8_t v_mask;
00549 };
00550 
00551 template <>
00552 struct Cmp_SIMD<int>
00553 {
00554     explicit Cmp_SIMD(int code_) :
00555         code(code_)
00556     {
00557         // CV_Assert(code == CMP_GT || code == CMP_LE ||
00558         //           code == CMP_EQ || code == CMP_NE);
00559 
00560         v_mask = vdup_n_u8(255);
00561     }
00562 
00563     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
00564     {
00565         int x = 0;
00566 
00567         if (code == CMP_GT)
00568             for ( ; x <= width - 8; x += 8)
00569             {
00570                 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
00571                 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
00572                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
00573             }
00574         else if (code == CMP_LE)
00575             for ( ; x <= width - 8; x += 8)
00576             {
00577                 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
00578                 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
00579                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
00580             }
00581         else if (code == CMP_EQ)
00582             for ( ; x <= width - 8; x += 8)
00583             {
00584                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
00585                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
00586                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
00587             }
00588         else if (code == CMP_NE)
00589             for ( ; x <= width - 8; x += 8)
00590             {
00591                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
00592                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
00593                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
00594                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
00595             }
00596 
00597         return x;
00598     }
00599 
00600     int code;
00601     uint8x8_t v_mask;
00602 };
00603 
00604 template <>
00605 struct Cmp_SIMD<float>
00606 {
00607     explicit Cmp_SIMD(int code_) :
00608         code(code_)
00609     {
00610         // CV_Assert(code == CMP_GT || code == CMP_LE ||
00611         //           code == CMP_EQ || code == CMP_NE);
00612 
00613         v_mask = vdup_n_u8(255);
00614     }
00615 
00616     int operator () (const float * src1, const float * src2, uchar * dst, int width) const
00617     {
00618         int x = 0;
00619 
00620         if (code == CMP_GT)
00621             for ( ; x <= width - 8; x += 8)
00622             {
00623                 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
00624                 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
00625                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
00626             }
00627         else if (code == CMP_LE)
00628             for ( ; x <= width - 8; x += 8)
00629             {
00630                 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
00631                 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
00632                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
00633             }
00634         else if (code == CMP_EQ)
00635             for ( ; x <= width - 8; x += 8)
00636             {
00637                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
00638                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
00639                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
00640             }
00641         else if (code == CMP_NE)
00642             for ( ; x <= width - 8; x += 8)
00643             {
00644                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
00645                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
00646                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
00647                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
00648             }
00649 
00650         return x;
00651     }
00652 
00653     int code;
00654     uint8x8_t v_mask;
00655 };
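// In the 32-bit specializations above, vcgtq/vcleq/vceqq produce full-width
// lane masks (all ones or zero); the vmovn_u32 -> vcombine_u16 -> vmovn_u16
// chain narrows eight such masks into eight 0xFF/0x00 bytes for dst, and
// CMP_NE is formed by XORing the CMP_EQ mask with v_mask.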
00656 
00657 #elif CV_SSE2
00658 
00659 template <>
00660 struct Cmp_SIMD<schar>
00661 {
00662     explicit Cmp_SIMD(int code_) :
00663         code(code_)
00664     {
00665         // CV_Assert(code == CMP_GT || code == CMP_LE ||
00666         //           code == CMP_EQ || code == CMP_NE);
00667 
00668         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
00669 
00670         v_mask = _mm_set1_epi8(-1);
00671     }
00672 
00673     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
00674     {
00675         int x = 0;
00676 
00677         if (!haveSSE)
00678             return x;
00679 
00680         if (code == CMP_GT)
00681             for ( ; x <= width - 16; x += 16)
00682                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
00683                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
00684         else if (code == CMP_LE)
00685             for ( ; x <= width - 16; x += 16)
00686             {
00687                 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
00688                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
00689                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
00690             }
00691         else if (code == CMP_EQ)
00692             for ( ; x <= width - 16; x += 16)
00693                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
00694                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
00695         else if (code == CMP_NE)
00696             for ( ; x <= width - 16; x += 16)
00697             {
00698                 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
00699                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
00700                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
00701             }
00702 
00703         return x;
00704     }
00705 
00706     int code;
00707     __m128i v_mask;
00708     bool haveSSE;
00709 };
00710 
00711 template <>
00712 struct Cmp_SIMD<int>
00713 {
00714     explicit Cmp_SIMD(int code_) :
00715         code(code_)
00716     {
00717         // CV_Assert(code == CMP_GT || code == CMP_LE ||
00718         //           code == CMP_EQ || code == CMP_NE);
00719 
00720         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
00721 
00722         v_mask = _mm_set1_epi32(0xffffffff);
00723     }
00724 
00725     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
00726     {
00727         int x = 0;
00728 
00729         if (!haveSSE)
00730             return x;
00731 
00732         if (code == CMP_GT)
00733             for ( ; x <= width - 8; x += 8)
00734             {
00735                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
00736                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
00737                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
00738                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
00739 
00740                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
00741             }
00742         else if (code == CMP_LE)
00743             for ( ; x <= width - 8; x += 8)
00744             {
00745                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
00746                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
00747                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
00748                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
00749 
00750                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
00751             }
00752         else if (code == CMP_EQ)
00753             for ( ; x <= width - 8; x += 8)
00754             {
00755                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
00756                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
00757                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
00758                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
00759 
00760                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
00761             }
00762         else if (code == CMP_NE)
00763             for ( ; x <= width - 8; x += 8)
00764             {
00765                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
00766                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
00767                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
00768                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
00769 
00770                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
00771             }
00772 
00773         return x;
00774     }
00775 
00776     int code;
00777     __m128i v_mask;
00778     bool haveSSE;
00779 };
00780 
00781 #endif
00782 
00783 
00784 template <typename T, typename WT>
00785 struct Mul_SIMD
00786 {
00787     int operator() (const T *, const T *, T *, int, WT) const
00788     {
00789         return 0;
00790     }
00791 };
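// The Mul_SIMD specializations that follow share one pattern: load eight
// narrow elements, widen them to 32-bit lanes, convert to single-precision
// floats, multiply (and scale when scale != 1), round back to integers
// (cv_vrndq_*_f32 on NEON, _mm_cvtps_epi32 on SSE2) and saturate-narrow to
// the destination type; the float specialization just multiplies and stores
// the vectors directly.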
00792 
00793 #if CV_NEON
00794 
00795 template <>
00796 struct Mul_SIMD<uchar, float>
00797 {
00798     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
00799     {
00800         int x = 0;
00801 
00802         if( scale == 1.0f )
00803             for ( ; x <= width - 8; x += 8)
00804             {
00805                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
00806                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
00807 
00808                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
00809                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
00810                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
00811                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
00812 
00813                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
00814                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
00815                 vst1_u8(dst + x, vqmovn_u16(v_dst));
00816             }
00817         else
00818         {
00819             float32x4_t v_scale = vdupq_n_f32(scale);
00820             for ( ; x <= width - 8; x += 8)
00821             {
00822                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
00823                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
00824 
00825                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
00826                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
00827                 v_dst1 = vmulq_f32(v_dst1, v_scale);
00828                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
00829                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
00830                 v_dst2 = vmulq_f32(v_dst2, v_scale);
00831 
00832                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
00833                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
00834                 vst1_u8(dst + x, vqmovn_u16(v_dst));
00835             }
00836         }
00837 
00838         return x;
00839     }
00840 };
00841 
00842 template <>
00843 struct Mul_SIMD<schar, float>
00844 {
00845     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
00846     {
00847         int x = 0;
00848 
00849         if( scale == 1.0f )
00850             for ( ; x <= width - 8; x += 8)
00851             {
00852                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
00853                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
00854 
00855                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
00856                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
00857                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
00858                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
00859 
00860                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
00861                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
00862                 vst1_s8(dst + x, vqmovn_s16(v_dst));
00863             }
00864         else
00865         {
00866             float32x4_t v_scale = vdupq_n_f32(scale);
00867             for ( ; x <= width - 8; x += 8)
00868             {
00869                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
00870                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
00871 
00872                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
00873                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
00874                 v_dst1 = vmulq_f32(v_dst1, v_scale);
00875                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
00876                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
00877                 v_dst2 = vmulq_f32(v_dst2, v_scale);
00878 
00879                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
00880                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
00881                 vst1_s8(dst + x, vqmovn_s16(v_dst));
00882             }
00883         }
00884 
00885         return x;
00886     }
00887 };
00888 
00889 template <>
00890 struct Mul_SIMD<ushort, float>
00891 {
00892     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
00893     {
00894         int x = 0;
00895 
00896         if( scale == 1.0f )
00897             for ( ; x <= width - 8; x += 8)
00898             {
00899                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
00900 
00901                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
00902                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
00903                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
00904                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
00905 
00906                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
00907                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
00908                 vst1q_u16(dst + x, v_dst);
00909             }
00910         else
00911         {
00912             float32x4_t v_scale = vdupq_n_f32(scale);
00913             for ( ; x <= width - 8; x += 8)
00914             {
00915                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
00916 
00917                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
00918                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
00919                 v_dst1 = vmulq_f32(v_dst1, v_scale);
00920                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
00921                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
00922                 v_dst2 = vmulq_f32(v_dst2, v_scale);
00923 
00924                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
00925                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
00926                 vst1q_u16(dst + x, v_dst);
00927             }
00928         }
00929 
00930         return x;
00931     }
00932 };
00933 
00934 template <>
00935 struct Mul_SIMD<short, float>
00936 {
00937     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
00938     {
00939         int x = 0;
00940 
00941         if( scale == 1.0f )
00942             for ( ; x <= width - 8; x += 8)
00943             {
00944                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
00945 
00946                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
00947                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
00948                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
00949                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
00950 
00951                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
00952                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
00953                 vst1q_s16(dst + x, v_dst);
00954             }
00955         else
00956         {
00957             float32x4_t v_scale = vdupq_n_f32(scale);
00958             for ( ; x <= width - 8; x += 8)
00959             {
00960                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
00961 
00962                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
00963                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
00964                 v_dst1 = vmulq_f32(v_dst1, v_scale);
00965                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
00966                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
00967                 v_dst2 = vmulq_f32(v_dst2, v_scale);
00968 
00969                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
00970                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
00971                 vst1q_s16(dst + x, v_dst);
00972             }
00973         }
00974 
00975         return x;
00976     }
00977 };
00978 
00979 template <>
00980 struct Mul_SIMD<float, float>
00981 {
00982     int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
00983     {
00984         int x = 0;
00985 
00986         if( scale == 1.0f )
00987             for ( ; x <= width - 8; x += 8)
00988             {
00989                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
00990                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
00991                 vst1q_f32(dst + x, v_dst1);
00992                 vst1q_f32(dst + x + 4, v_dst2);
00993             }
00994         else
00995         {
00996             float32x4_t v_scale = vdupq_n_f32(scale);
00997             for ( ; x <= width - 8; x += 8)
00998             {
00999                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
01000                 v_dst1 = vmulq_f32(v_dst1, v_scale);
01001 
01002                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
01003                 v_dst2 = vmulq_f32(v_dst2, v_scale);
01004 
01005                 vst1q_f32(dst + x, v_dst1);
01006                 vst1q_f32(dst + x + 4, v_dst2);
01007             }
01008         }
01009 
01010         return x;
01011     }
01012 };
01013 
01014 #elif CV_SSE2
01015 
01016 #if CV_SSE4_1
01017 
01018 template <>
01019 struct Mul_SIMD<ushort, float>
01020 {
01021     Mul_SIMD()
01022     {
01023         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
01024     }
01025 
01026     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
01027     {
01028         int x = 0;
01029 
01030         if (!haveSSE)
01031             return x;
01032 
01033         __m128i v_zero = _mm_setzero_si128();
01034 
01035         if( scale != 1.0f )
01036         {
01037             __m128 v_scale = _mm_set1_ps(scale);
01038             for ( ; x <= width - 8; x += 8)
01039             {
01040                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
01041                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
01042 
01043                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
01044                                            _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
01045                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
01046 
01047                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
01048                                            _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
01049                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
01050 
01051                 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
01052                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
01053             }
01054         }
01055 
01056         return x;
01057     }
01058 
01059     bool haveSSE;
01060 };
01061 
01062 #endif
01063 
01064 template <>
01065 struct Mul_SIMD<schar, float>
01066 {
01067     Mul_SIMD()
01068     {
01069         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
01070     }
01071 
01072     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
01073     {
01074         int x = 0;
01075 
01076         if (!haveSSE)
01077             return x;
01078 
01079         __m128i v_zero = _mm_setzero_si128();
01080 
01081         if( scale == 1.0f )
01082             for ( ; x <= width - 8; x += 8)
01083             {
01084                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
01085                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
01086 
01087                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
01088                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
01089 
01090                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
01091                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
01092 
01093                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
01094                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
01095 
01096                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
01097                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
01098             }
01099         else
01100         {
01101             __m128 v_scale = _mm_set1_ps(scale);
01102             for ( ; x <= width - 8; x += 8)
01103             {
01104                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
01105                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
01106 
01107                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
01108                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
01109 
01110                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
01111                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
01112                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
01113 
01114                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
01115                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
01116                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
01117 
01118                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
01119                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
01120             }
01121         }
01122 
01123         return x;
01124     }
01125 
01126     bool haveSSE;
01127 };
01128 
01129 template <>
01130 struct Mul_SIMD<short, float>
01131 {
01132     Mul_SIMD()
01133     {
01134         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
01135     }
01136 
01137     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
01138     {
01139         int x = 0;
01140 
01141         if (!haveSSE)
01142             return x;
01143 
01144         __m128i v_zero = _mm_setzero_si128();
01145 
01146         if( scale != 1.0f )
01147         {
01148             __m128 v_scale = _mm_set1_ps(scale);
01149             for ( ; x <= width - 8; x += 8)
01150             {
01151                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
01152                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
01153 
01154                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
01155                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
01156                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
01157 
01158                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
01159                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
01160                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
01161 
01162                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
01163                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
01164             }
01165         }
01166 
01167         return x;
01168     }
01169 
01170     bool haveSSE;
01171 };
01172 
01173 #endif
01174 
01175 template <typename T>
01176 struct Div_SIMD
01177 {
01178     int operator() (const T *, const T *, T *, int, double) const
01179     {
01180         return 0;
01181     }
01182 };
01183 
01184 template <typename T>
01185 struct Recip_SIMD
01186 {
01187     int operator() (const T *, T *, int, double) const
01188     {
01189         return 0;
01190     }
01191 };
01192 
01193 
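// The generic Div_SIMD / Recip_SIMD templates above are deliberate no-ops:
// they return 0 ("no elements processed") so that the caller's scalar loop
// covers the whole row.  The specializations below use OpenCV's universal
// intrinsics (v_uint16x8, v_float32x4, ...) and return the index up to which
// they processed data; the caller is then expected to finish the tail
// scalarly.  Illustrative sketch only -- the caller is not part of this header:
//
//     int x = Div_SIMD<uchar>()(src1, src2, dst, width, scale);
//     for( ; x < width; x++ )
//         dst[x] = src2[x] ? saturate_cast<uchar>(src1[x]*scale/src2[x]) : 0;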
01194 #if CV_SIMD128
01195 
01196 template <>
01197 struct Div_SIMD<uchar>
01198 {
01199     bool haveSIMD;
01200     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01201 
01202     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
01203     {
01204         int x = 0;
01205 
01206         if (!haveSIMD)
01207             return x;
01208 
01209         v_float32x4 v_scale = v_setall_f32((float)scale);
01210         v_uint16x8 v_zero = v_setzero_u16();
01211 
01212         for ( ; x <= width - 8; x += 8)
01213         {
01214             v_uint16x8 v_src1 = v_load_expand(src1 + x);
01215             v_uint16x8 v_src2 = v_load_expand(src2 + x);
01216 
01217             v_uint32x4 t0, t1, t2, t3;
01218             v_expand(v_src1, t0, t1);
01219             v_expand(v_src2, t2, t3);
01220 
01221             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
01222             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
01223 
01224             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
01225             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
01226 
01227             f0 = f0 * v_scale / f2;
01228             f1 = f1 * v_scale / f3;
01229 
01230             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01231             v_uint16x8 res = v_pack_u(i0, i1);
01232 
01233             res = v_select(v_src2 == v_zero, v_zero, res);
01234             v_pack_store(dst + x, res);
01235         }
01236 
01237         return x;
01238     }
01239 };
01240 
01241 
01242 template <>
01243 struct Div_SIMD<schar>
01244 {
01245     bool haveSIMD;
01246     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01247 
01248     int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
01249     {
01250         int x = 0;
01251 
01252         if (!haveSIMD)
01253             return x;
01254 
01255         v_float32x4 v_scale = v_setall_f32((float)scale);
01256         v_int16x8 v_zero = v_setzero_s16();
01257 
01258         for ( ; x <= width - 8; x += 8)
01259         {
01260             v_int16x8 v_src1 = v_load_expand(src1 + x);
01261             v_int16x8 v_src2 = v_load_expand(src2 + x);
01262 
01263             v_int32x4 t0, t1, t2, t3;
01264             v_expand(v_src1, t0, t1);
01265             v_expand(v_src2, t2, t3);
01266 
01267             v_float32x4 f0 = v_cvt_f32(t0);
01268             v_float32x4 f1 = v_cvt_f32(t1);
01269 
01270             v_float32x4 f2 = v_cvt_f32(t2);
01271             v_float32x4 f3 = v_cvt_f32(t3);
01272 
01273             f0 = f0 * v_scale / f2;
01274             f1 = f1 * v_scale / f3;
01275 
01276             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01277             v_int16x8 res = v_pack(i0, i1);
01278 
01279             res = v_select(v_src2 == v_zero, v_zero, res);
01280             v_pack_store(dst + x, res);
01281         }
01282 
01283         return x;
01284     }
01285 };
01286 
01287 
01288 template <>
01289 struct Div_SIMD<ushort>
01290 {
01291     bool haveSIMD;
01292     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01293 
01294     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
01295     {
01296         int x = 0;
01297 
01298         if (!haveSIMD)
01299             return x;
01300 
01301         v_float32x4 v_scale = v_setall_f32((float)scale);
01302         v_uint16x8 v_zero = v_setzero_u16();
01303 
01304         for ( ; x <= width - 8; x += 8)
01305         {
01306             v_uint16x8 v_src1 = v_load(src1 + x);
01307             v_uint16x8 v_src2 = v_load(src2 + x);
01308 
01309             v_uint32x4 t0, t1, t2, t3;
01310             v_expand(v_src1, t0, t1);
01311             v_expand(v_src2, t2, t3);
01312 
01313             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
01314             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
01315 
01316             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
01317             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
01318 
01319             f0 = f0 * v_scale / f2;
01320             f1 = f1 * v_scale / f3;
01321 
01322             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01323             v_uint16x8 res = v_pack_u(i0, i1);
01324 
01325             res = v_select(v_src2 == v_zero, v_zero, res);
01326             v_store(dst + x, res);
01327         }
01328 
01329         return x;
01330     }
01331 };
01332 
01333 template <>
01334 struct Div_SIMD<short>
01335 {
01336     bool haveSIMD;
01337     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01338 
01339     int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
01340     {
01341         int x = 0;
01342 
01343         if (!haveSIMD)
01344             return x;
01345 
01346         v_float32x4 v_scale = v_setall_f32((float)scale);
01347         v_int16x8 v_zero = v_setzero_s16();
01348 
01349         for ( ; x <= width - 8; x += 8)
01350         {
01351             v_int16x8 v_src1 = v_load(src1 + x);
01352             v_int16x8 v_src2 = v_load(src2 + x);
01353 
01354             v_int32x4 t0, t1, t2, t3;
01355             v_expand(v_src1, t0, t1);
01356             v_expand(v_src2, t2, t3);
01357 
01358             v_float32x4 f0 = v_cvt_f32(t0);
01359             v_float32x4 f1 = v_cvt_f32(t1);
01360 
01361             v_float32x4 f2 = v_cvt_f32(t2);
01362             v_float32x4 f3 = v_cvt_f32(t3);
01363 
01364             f0 = f0 * v_scale / f2;
01365             f1 = f1 * v_scale / f3;
01366 
01367             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01368             v_int16x8 res = v_pack(i0, i1);
01369 
01370             res = v_select(v_src2 == v_zero, v_zero, res);
01371             v_store(dst + x, res);
01372         }
01373 
01374         return x;
01375     }
01376 };
01377 
01378 template <>
01379 struct Div_SIMD<int>
01380 {
01381     bool haveSIMD;
01382     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01383 
01384     int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
01385     {
01386         int x = 0;
01387 
01388         if (!haveSIMD)
01389             return x;
01390 
01391         v_float32x4 v_scale = v_setall_f32((float)scale);
01392         v_int32x4 v_zero = v_setzero_s32();
01393 
01394         for ( ; x <= width - 8; x += 8)
01395         {
01396             v_int32x4 t0 = v_load(src1 + x);
01397             v_int32x4 t1 = v_load(src1 + x + 4);
01398             v_int32x4 t2 = v_load(src2 + x);
01399             v_int32x4 t3 = v_load(src2 + x + 4);
01400 
01401             v_float32x4 f0 = v_cvt_f32(t0);
01402             v_float32x4 f1 = v_cvt_f32(t1);
01403             v_float32x4 f2 = v_cvt_f32(t2);
01404             v_float32x4 f3 = v_cvt_f32(t3);
01405 
01406             f0 = f0 * v_scale / f2;
01407             f1 = f1 * v_scale / f3;
01408 
01409             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
01410 
01411             res0 = v_select(t2 == v_zero, v_zero, res0);
01412             res1 = v_select(t3 == v_zero, v_zero, res1);
01413             v_store(dst + x, res0);
01414             v_store(dst + x + 4, res1);
01415         }
01416 
01417         return x;
01418     }
01419 };
01420 
01421 
01422 template <>
01423 struct Div_SIMD<float>
01424 {
01425     bool haveSIMD;
01426     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01427 
01428     int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
01429     {
01430         int x = 0;
01431 
01432         if (!haveSIMD)
01433             return x;
01434 
01435         v_float32x4 v_scale = v_setall_f32((float)scale);
01436         v_float32x4 v_zero = v_setzero_f32();
01437 
01438         for ( ; x <= width - 8; x += 8)
01439         {
01440             v_float32x4 f0 = v_load(src1 + x);
01441             v_float32x4 f1 = v_load(src1 + x + 4);
01442             v_float32x4 f2 = v_load(src2 + x);
01443             v_float32x4 f3 = v_load(src2 + x + 4);
01444 
01445             v_float32x4 res0 = f0 * v_scale / f2;
01446             v_float32x4 res1 = f1 * v_scale / f3;
01447 
01448             res0 = v_select(f2 == v_zero, v_zero, res0);
01449             res1 = v_select(f3 == v_zero, v_zero, res1);
01450 
01451             v_store(dst + x, res0);
01452             v_store(dst + x + 4, res1);
01453         }
01454 
01455         return x;
01456     }
01457 };
01458 
01459 
01460 ///////////////////////// RECIPROCAL //////////////////////
01461 
01462 template <>
01463 struct Recip_SIMD<uchar>
01464 {
01465     bool haveSIMD;
01466     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01467 
01468     int operator() (const uchar * src2, uchar * dst, int width, double scale) const
01469     {
01470         int x = 0;
01471 
01472         if (!haveSIMD)
01473             return x;
01474 
01475         v_float32x4 v_scale = v_setall_f32((float)scale);
01476         v_uint16x8 v_zero = v_setzero_u16();
01477 
01478         for ( ; x <= width - 8; x += 8)
01479         {
01480             v_uint16x8 v_src2 = v_load_expand(src2 + x);
01481 
01482             v_uint32x4 t0, t1;
01483             v_expand(v_src2, t0, t1);
01484 
01485             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
01486             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
01487 
01488             f0 = v_scale / f0;
01489             f1 = v_scale / f1;
01490 
01491             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01492             v_uint16x8 res = v_pack_u(i0, i1);
01493 
01494             res = v_select(v_src2 == v_zero, v_zero, res);
01495             v_pack_store(dst + x, res);
01496         }
01497 
01498         return x;
01499     }
01500 };
01501 
01502 
01503 template <>
01504 struct Recip_SIMD<schar>
01505 {
01506     bool haveSIMD;
01507     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01508 
01509     int operator() (const schar * src2, schar * dst, int width, double scale) const
01510     {
01511         int x = 0;
01512 
01513         if (!haveSIMD)
01514             return x;
01515 
01516         v_float32x4 v_scale = v_setall_f32((float)scale);
01517         v_int16x8 v_zero = v_setzero_s16();
01518 
01519         for ( ; x <= width - 8; x += 8)
01520         {
01521             v_int16x8 v_src2 = v_load_expand(src2 + x);
01522 
01523             v_int32x4 t0, t1;
01524             v_expand(v_src2, t0, t1);
01525 
01526             v_float32x4 f0 = v_cvt_f32(t0);
01527             v_float32x4 f1 = v_cvt_f32(t1);
01528 
01529             f0 = v_scale / f0;
01530             f1 = v_scale / f1;
01531 
01532             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01533             v_int16x8 res = v_pack(i0, i1);
01534 
01535             res = v_select(v_src2 == v_zero, v_zero, res);
01536             v_pack_store(dst + x, res);
01537         }
01538 
01539         return x;
01540     }
01541 };
01542 
01543 
01544 template <>
01545 struct Recip_SIMD<ushort>
01546 {
01547     bool haveSIMD;
01548     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01549 
01550     int operator() (const ushort * src2, ushort * dst, int width, double scale) const
01551     {
01552         int x = 0;
01553 
01554         if (!haveSIMD)
01555             return x;
01556 
01557         v_float32x4 v_scale = v_setall_f32((float)scale);
01558         v_uint16x8 v_zero = v_setzero_u16();
01559 
01560         for ( ; x <= width - 8; x += 8)
01561         {
01562             v_uint16x8 v_src2 = v_load(src2 + x);
01563 
01564             v_uint32x4 t0, t1;
01565             v_expand(v_src2, t0, t1);
01566 
01567             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
01568             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
01569 
01570             f0 = v_scale / f0;
01571             f1 = v_scale / f1;
01572 
01573             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01574             v_uint16x8 res = v_pack_u(i0, i1);
01575 
01576             res = v_select(v_src2 == v_zero, v_zero, res);
01577             v_store(dst + x, res);
01578         }
01579 
01580         return x;
01581     }
01582 };
01583 
01584 template <>
01585 struct Recip_SIMD<short>
01586 {
01587     bool haveSIMD;
01588     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01589 
01590     int operator() (const short * src2, short * dst, int width, double scale) const
01591     {
01592         int x = 0;
01593 
01594         if (!haveSIMD)
01595             return x;
01596 
01597         v_float32x4 v_scale = v_setall_f32((float)scale);
01598         v_int16x8 v_zero = v_setzero_s16();
01599 
01600         for ( ; x <= width - 8; x += 8)
01601         {
01602             v_int16x8 v_src2 = v_load(src2 + x);
01603 
01604             v_int32x4 t0, t1;
01605             v_expand(v_src2, t0, t1);
01606 
01607             v_float32x4 f0 = v_cvt_f32(t0);
01608             v_float32x4 f1 = v_cvt_f32(t1);
01609 
01610             f0 = v_scale / f0;
01611             f1 = v_scale / f1;
01612 
01613             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
01614             v_int16x8 res = v_pack(i0, i1);
01615 
01616             res = v_select(v_src2 == v_zero, v_zero, res);
01617             v_store(dst + x, res);
01618         }
01619 
01620         return x;
01621     }
01622 };
01623 
01624 template <>
01625 struct Recip_SIMD<int>
01626 {
01627     bool haveSIMD;
01628     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01629 
01630     int operator() (const int * src2, int * dst, int width, double scale) const
01631     {
01632         int x = 0;
01633 
01634         if (!haveSIMD)
01635             return x;
01636 
01637         v_float32x4 v_scale = v_setall_f32((float)scale);
01638         v_int32x4 v_zero = v_setzero_s32();
01639 
01640         for ( ; x <= width - 8; x += 8)
01641         {
01642             v_int32x4 t0 = v_load(src2 + x);
01643             v_int32x4 t1 = v_load(src2 + x + 4);
01644 
01645             v_float32x4 f0 = v_cvt_f32(t0);
01646             v_float32x4 f1 = v_cvt_f32(t1);
01647 
01648             f0 = v_scale / f0;
01649             f1 = v_scale / f1;
01650 
01651             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
01652 
01653             res0 = v_select(t0 == v_zero, v_zero, res0);
01654             res1 = v_select(t1 == v_zero, v_zero, res1);
01655             v_store(dst + x, res0);
01656             v_store(dst + x + 4, res1);
01657         }
01658 
01659         return x;
01660     }
01661 };
01662 
01663 
01664 template <>
01665 struct Recip_SIMD<float>
01666 {
01667     bool haveSIMD;
01668     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01669 
01670     int operator() (const float * src2, float * dst, int width, double scale) const
01671     {
01672         int x = 0;
01673 
01674         if (!haveSIMD)
01675             return x;
01676 
01677         v_float32x4 v_scale = v_setall_f32((float)scale);
01678         v_float32x4 v_zero = v_setzero_f32();
01679 
01680         for ( ; x <= width - 8; x += 8)
01681         {
01682             v_float32x4 f0 = v_load(src2 + x);
01683             v_float32x4 f1 = v_load(src2 + x + 4);
01684 
01685             v_float32x4 res0 = v_scale / f0;
01686             v_float32x4 res1 = v_scale / f1;
01687 
01688             res0 = v_select(f0 == v_zero, v_zero, res0);
01689             res1 = v_select(f1 == v_zero, v_zero, res1);
01690 
01691             v_store(dst + x, res0);
01692             v_store(dst + x + 4, res1);
01693         }
01694 
01695         return x;
01696     }
01697 };
01698 
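// Double-precision variants are only compiled when the universal intrinsics
// provide 64-bit float vectors (v_float64x2); each loop iteration handles
// four doubles as two 2-lane vectors.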
01699 #if CV_SIMD128_64F
01700 
01701 template <>
01702 struct Div_SIMD<double>
01703 {
01704     bool haveSIMD;
01705     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01706 
01707     int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
01708     {
01709         int x = 0;
01710 
01711         if (!haveSIMD)
01712             return x;
01713 
01714         v_float64x2 v_scale = v_setall_f64(scale);
01715         v_float64x2 v_zero = v_setzero_f64();
01716 
01717         for ( ; x <= width - 4; x += 4)
01718         {
01719             v_float64x2 f0 = v_load(src1 + x);
01720             v_float64x2 f1 = v_load(src1 + x + 2);
01721             v_float64x2 f2 = v_load(src2 + x);
01722             v_float64x2 f3 = v_load(src2 + x + 2);
01723 
01724             v_float64x2 res0 = f0 * v_scale / f2;
01725             v_float64x2 res1 = f1 * v_scale / f3;
01726 
01727             res0 = v_select(f2 == v_zero, v_zero, res0);
01728             res1 = v_select(f3 == v_zero, v_zero, res1);
01729 
01730             v_store(dst + x, res0);
01731             v_store(dst + x + 2, res1);
01732         }
01733 
01734         return x;
01735     }
01736 };
01737 
01738 template <>
01739 struct Recip_SIMD<double>
01740 {
01741     bool haveSIMD;
01742     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
01743 
01744     int operator() (const double * src2, double * dst, int width, double scale) const
01745     {
01746         int x = 0;
01747 
01748         if (!haveSIMD)
01749             return x;
01750 
01751         v_float64x2 v_scale = v_setall_f64(scale);
01752         v_float64x2 v_zero = v_setzero_f64();
01753 
01754         for ( ; x <= width - 4; x += 4)
01755         {
01756             v_float64x2 f0 = v_load(src2 + x);
01757             v_float64x2 f1 = v_load(src2 + x + 2);
01758 
01759             v_float64x2 res0 = v_scale / f0;
01760             v_float64x2 res1 = v_scale / f1;
01761 
01762             res0 = v_select(f0 == v_zero, v_zero, res0);
01763             res1 = v_select(f1 == v_zero, v_zero, res1);
01764 
01765             v_store(dst + x, res0);
01766             v_store(dst + x + 2, res1);
01767         }
01768 
01769         return x;
01770     }
01771 };
01772 
01773 #endif // CV_SIMD128_64F
01774 
01775 #endif // CV_SIMD128
01776 
01777 
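// AddWeighted: dst[i] = saturate_cast<T>(src1[i]*alpha + src2[i]*beta + gamma).
// As with Mul/Div/Recip above, the unspecialized functor processes nothing and
// returns 0, leaving the whole row to the scalar implementation.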
01778 template <typename T, typename WT>
01779 struct AddWeighted_SIMD
01780 {
01781     int operator() (const T *, const T *, T *, int, WT, WT, WT) const
01782     {
01783         return 0;
01784     }
01785 };
01786 
01787 #if CV_SSE2
01788 
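// SSE2 widening trick used below: _mm_unpacklo_epi8(zero, v) places each byte
// in the high half of a 16-bit lane, and the arithmetic shift right by 8
// (_mm_srai_epi16) then sign-extends it; the same idea with epi16/epi32 and a
// shift by 16 sign-extends 16-bit lanes to 32-bit before conversion to float.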
01789 template <>
01790 struct AddWeighted_SIMD<schar, float>
01791 {
01792     AddWeighted_SIMD()
01793     {
01794         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
01795     }
01796 
01797     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
01798     {
01799         int x = 0;
01800 
01801         if (!haveSSE2)
01802             return x;
01803 
01804         __m128i v_zero = _mm_setzero_si128();
01805         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
01806                v_gamma = _mm_set1_ps(gamma);
01807 
01808         for( ; x <= width - 8; x += 8 )
01809         {
01810             __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
01811             __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
01812 
01813             __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
01814             __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
01815 
01816             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
01817             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
01818                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
01819 
01820             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
01821             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
01822                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
01823 
01824             __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
01825                                               _mm_cvtps_epi32(v_dstf1));
01826 
01827             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
01828         }
01829 
01830         return x;
01831     }
01832 
01833     bool haveSSE2;
01834 };
01835 
01836 template <>
01837 struct AddWeighted_SIMD<short, float>
01838 {
01839     AddWeighted_SIMD()
01840     {
01841         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
01842     }
01843 
01844     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
01845     {
01846         int x = 0;
01847 
01848         if (!haveSSE2)
01849             return x;
01850 
01851         __m128i v_zero = _mm_setzero_si128();
01852         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
01853                v_gamma = _mm_set1_ps(gamma);
01854 
01855         for( ; x <= width - 8; x += 8 )
01856         {
01857             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
01858             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
01859 
01860             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
01861             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
01862                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
01863 
01864             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
01865             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
01866                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
01867 
01868             _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
01869                                                                    _mm_cvtps_epi32(v_dstf1)));
01870         }
01871 
01872         return x;
01873     }
01874 
01875     bool haveSSE2;
01876 };
01877 
01878 #if CV_SSE4_1
01879 
01880 template <>
01881 struct AddWeighted_SIMD<ushort, float>
01882 {
01883     AddWeighted_SIMD()
01884     {
01885         haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
01886     }
01887 
01888     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
01889     {
01890         int x = 0;
01891 
01892         if (!haveSSE4_1)
01893             return x;
01894 
01895         __m128i v_zero = _mm_setzero_si128();
01896         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
01897                v_gamma = _mm_set1_ps(gamma);
01898 
01899         for( ; x <= width - 8; x += 8 )
01900         {
01901             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
01902             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
01903 
01904             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
01905             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
01906                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
01907 
01908             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
01909             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
01910                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
01911 
01912             _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
01913                                                                     _mm_cvtps_epi32(v_dstf1)));
01914         }
01915 
01916         return x;
01917     }
01918 
01919     bool haveSSE4_1;
01920 };
01921 
01922 #endif // CV_SSE4_1
01923 
01924 #elif CV_NEON
01925 
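// NEON path: vmovl_s8 / vmovl_s16 widen to 16- and 32-bit lanes, vcvtq_f32_*
// converts to float, cv_vrndq_s32_f32 / cv_vrndq_u32_f32 (OpenCV's rounding
// conversion helpers) convert back to integers, and vqmovn_* narrows the
// result with saturation.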
01926 template <>
01927 struct AddWeighted_SIMD<schar, float>
01928 {
01929     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
01930     {
01931         int x = 0;
01932 
01933         float32x4_t g = vdupq_n_f32 (gamma);
01934 
01935         for( ; x <= width - 8; x += 8 )
01936         {
01937             int8x8_t in1 = vld1_s8(src1 + x);
01938             int16x8_t in1_16 = vmovl_s8(in1);
01939             float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
01940             float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
01941 
01942             int8x8_t in2 = vld1_s8(src2+x);
01943             int16x8_t in2_16 = vmovl_s8(in2);
01944             float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
01945             float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
01946 
01947             float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
01948             float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
01949             out_f_l = vaddq_f32(out_f_l, g);
01950             out_f_h = vaddq_f32(out_f_h, g);
01951 
01952             int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
01953             int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
01954 
01955             int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
01956             int8x8_t out = vqmovn_s16(out_16);
01957 
01958             vst1_s8(dst + x, out);
01959         }
01960 
01961         return x;
01962     }
01963 };
01964 
01965 template <>
01966 struct AddWeighted_SIMD<ushort, float>
01967 {
01968     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
01969     {
01970         int x = 0;
01971 
01972         float32x4_t g = vdupq_n_f32(gamma);
01973 
01974         for( ; x <= width - 8; x += 8 )
01975         {
01976             uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
01977 
01978             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
01979             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
01980             uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
01981 
01982             v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
01983             v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
01984             uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
01985 
01986             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
01987         }
01988 
01989         return x;
01990     }
01991 };
01992 
01993 template <>
01994 struct AddWeighted_SIMD<short, float>
01995 {
01996     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
01997     {
01998         int x = 0;
01999 
02000         float32x4_t g = vdupq_n_f32(gamma);
02001 
02002         for( ; x <= width - 8; x += 8 )
02003         {
02004             int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
02005 
02006             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
02007             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
02008             int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
02009 
02010             v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
02011             v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
02012             int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
02013 
02014             vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
02015         }
02016 
02017         return x;
02018     }
02019 };
02020 
02021 #endif // CV_SSE2 / CV_NEON
02022 
02023 } // namespace cv
02024 
02025 #endif // __OPENCV_ARITHM_SIMD_HPP__
02026