Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers merge.cpp Source File

merge.cpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 
00046 namespace cv { namespace hal {
00047 
00048 #if CV_NEON
// Primary templates of the NEON merge kernels. They are only declared here;
// the usable specializations are generated by the MERGE*_KERNEL_TEMPLATE macros.
template <class T> struct VMerge2;   // two-channel kernel
template <class T> struct VMerge3;   // three-channel kernel
template <class T> struct VMerge4;   // four-channel kernel
00052 
// Generates the specialization name<data_type> of a two-channel NEON merge kernel.
//   name       - primary template to specialize
//   data_type  - element type of the channels
//   reg_type   - multi-register NEON type exposing val[0..1]
//   load_func  - intrinsic that loads one register from a channel pointer
//   store_func - intrinsic that writes reg_type to dst (expected to be an
//                interleaving vst2-style store, see the instantiations)
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src0, const data_type* src1,             \
                        data_type* dst)                                           \
        {                                                                         \
            /* one register per input channel */                                  \
            reg_type planes;                                                      \
            planes.val[0] = load_func(src0);                                      \
            planes.val[1] = load_func(src1);                                      \
            /* a single store emits both channels to dst */                       \
            store_func(dst, planes);                                              \
        }                                                                         \
    }
00064 
// Generates the specialization name<data_type> of a three-channel NEON merge kernel.
//   name       - primary template to specialize
//   data_type  - element type of the channels
//   reg_type   - multi-register NEON type exposing val[0..2]
//   load_func  - intrinsic that loads one register from a channel pointer
//   store_func - intrinsic that writes reg_type to dst (expected to be an
//                interleaving vst3-style store, see the instantiations)
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src0, const data_type* src1,             \
                        const data_type* src2, data_type* dst)                    \
        {                                                                         \
            /* one register per input channel */                                  \
            reg_type planes;                                                      \
            planes.val[0] = load_func(src0);                                      \
            planes.val[1] = load_func(src1);                                      \
            planes.val[2] = load_func(src2);                                      \
            /* a single store emits all three channels to dst */                  \
            store_func(dst, planes);                                              \
        }                                                                         \
    }
00077 
// Generates the specialization name<data_type> of a four-channel NEON merge kernel.
//   name       - primary template to specialize
//   data_type  - element type of the channels
//   reg_type   - multi-register NEON type exposing val[0..3]
//   load_func  - intrinsic that loads one register from a channel pointer
//   store_func - intrinsic that writes reg_type to dst (expected to be an
//                interleaving vst4-style store, see the instantiations)
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src0, const data_type* src1,             \
                        const data_type* src2, const data_type* src3,             \
                        data_type* dst)                                           \
        {                                                                         \
            /* one register per input channel */                                  \
            reg_type planes;                                                      \
            planes.val[0] = load_func(src0);                                      \
            planes.val[1] = load_func(src1);                                      \
            planes.val[2] = load_func(src2);                                      \
            planes.val[3] = load_func(src3);                                      \
            /* a single store emits all four channels to dst */                   \
            store_func(dst, planes);                                              \
        }                                                                         \
    }
00092 
00093 MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
00094 MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
00095 MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
00096 MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );
00097 
00098 MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
00099 MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
00100 MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
00101 MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );
00102 
00103 MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
00104 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
00105 MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
00106 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
00107 
00108 #elif CV_SSE2
00109 
// Generic two-channel SSE merge kernel: a placeholder for element types that
// have no specialization. It reports support == false and its call operator
// does nothing.
template <typename T>
struct VMerge2
{
    bool support;

    VMerge2()
    {
        support = false;
    }

    void operator()(const T * /*src0*/, const T * /*src1*/, T * /*dst*/) const
    {
    }
};
00118 
// Generic three-channel SSE merge kernel: a placeholder for element types that
// have no specialization. It reports support == false and its call operator
// does nothing.
template <typename T>
struct VMerge3
{
    bool support;

    VMerge3()
    {
        support = false;
    }

    void operator()(const T * /*src0*/, const T * /*src1*/, const T * /*src2*/,
                    T * /*dst*/) const
    {
    }
};
00127 
// Generic four-channel SSE merge kernel: a placeholder for element types that
// have no specialization. It reports support == false and its call operator
// does nothing.
template <typename T>
struct VMerge4
{
    bool support;

    VMerge4()
    {
        support = false;
    }

    void operator()(const T * /*src0*/, const T * /*src1*/, const T * /*src2*/,
                    const T * /*src3*/, T * /*dst*/) const
    {
    }
};
00136 
// Specializes VMerge2<data_type> with an SSE two-channel merge kernel.
//   data_type      - element type of the channels
//   reg_type       - SSE register type used for the loaded vectors
//   cast_type      - pointee type the load/store intrinsics are called with
//   _mm_interleave - helper (defined outside this file) invoked on the loaded
//                    registers; expected to rewrite them in place into output order
//   flavor         - suffix appended to _mm_loadu_ / _mm_storeu_
//   se             - CPU feature id handed to checkHardwareSupport()
// One call reads 2 * ELEMS_IN_VEC elements from each channel and writes
// 4 * ELEMS_IN_VEC elements to dst, using unaligned loads and stores.
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge2<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        /* number of data_type elements in 16 bytes */                                     \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge2() : support(checkHardwareSupport(se))                                          \
    {                                                                                      \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        /* two consecutive vectors from each channel */                                    \
        reg_type v_r0 = _mm_loadu_##flavor((const cast_type *)(src0));                     \
        reg_type v_r1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));      \
        reg_type v_r2 = _mm_loadu_##flavor((const cast_type *)(src1));                     \
        reg_type v_r3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));      \
                                                                                           \
        _mm_interleave(v_r0, v_r1, v_r2, v_r3);                                            \
                                                                                           \
        /* registers are written back to dst in order */                                   \
        _mm_storeu_##flavor((cast_type *)(dst), v_r0);                                     \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_r1);                      \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_r2);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_r3);                  \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}
00169 
// Specializes VMerge3<data_type> with an SSE three-channel merge kernel.
//   data_type      - element type of the channels
//   reg_type       - SSE register type used for the loaded vectors
//   cast_type      - pointee type the load/store intrinsics are called with
//   _mm_interleave - helper (defined outside this file) invoked on the loaded
//                    registers; expected to rewrite them in place into output order
//   flavor         - suffix appended to _mm_loadu_ / _mm_storeu_
//   se             - CPU feature id handed to checkHardwareSupport()
// One call reads 2 * ELEMS_IN_VEC elements from each channel and writes
// 6 * ELEMS_IN_VEC elements to dst, using unaligned loads and stores.
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge3<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        /* number of data_type elements in 16 bytes */                                     \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge3() : support(checkHardwareSupport(se))                                          \
    {                                                                                      \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
                    data_type * dst) const                                                 \
    {                                                                                      \
        /* two consecutive vectors from each channel */                                    \
        reg_type v_r0 = _mm_loadu_##flavor((const cast_type *)(src0));                     \
        reg_type v_r1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));      \
        reg_type v_r2 = _mm_loadu_##flavor((const cast_type *)(src1));                     \
        reg_type v_r3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));      \
        reg_type v_r4 = _mm_loadu_##flavor((const cast_type *)(src2));                     \
        reg_type v_r5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));      \
                                                                                           \
        _mm_interleave(v_r0, v_r1, v_r2,                                                   \
                       v_r3, v_r4, v_r5);                                                  \
                                                                                           \
        /* registers are written back to dst in order */                                   \
        _mm_storeu_##flavor((cast_type *)(dst), v_r0);                                     \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_r1);                      \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_r2);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_r3);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_r4);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_r5);                  \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}
00207 
// Specializes VMerge4<data_type> with an SSE four-channel merge kernel.
//   data_type      - element type of the channels
//   reg_type       - SSE register type used for the loaded vectors
//   cast_type      - pointee type the load/store intrinsics are called with
//   _mm_interleave - helper (defined outside this file) invoked on the loaded
//                    registers; expected to rewrite them in place into output order
//   flavor         - suffix appended to _mm_loadu_ / _mm_storeu_
//   se             - CPU feature id handed to checkHardwareSupport()
// One call reads 2 * ELEMS_IN_VEC elements from each channel and writes
// 8 * ELEMS_IN_VEC elements to dst, using unaligned loads and stores.
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge4<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        /* number of data_type elements in 16 bytes */                                     \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge4() : support(checkHardwareSupport(se))                                          \
    {                                                                                      \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    const data_type * src2, const data_type * src3,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        /* two consecutive vectors from each channel */                                    \
        reg_type v_r0 = _mm_loadu_##flavor((const cast_type *)(src0));                     \
        reg_type v_r1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));      \
        reg_type v_r2 = _mm_loadu_##flavor((const cast_type *)(src1));                     \
        reg_type v_r3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));      \
        reg_type v_r4 = _mm_loadu_##flavor((const cast_type *)(src2));                     \
        reg_type v_r5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));      \
        reg_type v_r6 = _mm_loadu_##flavor((const cast_type *)(src3));                     \
        reg_type v_r7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC));      \
                                                                                           \
        _mm_interleave(v_r0, v_r1, v_r2, v_r3,                                             \
                       v_r4, v_r5, v_r6, v_r7);                                            \
                                                                                           \
        /* registers are written back to dst in order */                                   \
        _mm_storeu_##flavor((cast_type *)(dst), v_r0);                                     \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_r1);                      \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_r2);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_r3);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_r4);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_r5);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_r6);                  \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_r7);                  \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}
00250 
00251 MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
00252 MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
00253 MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
00254 
00255 #if CV_SSE4_1
00256 MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
00257 MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
00258 MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
00259 #endif
00260 
00261 MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
00262 MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
00263 MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
00264 
00265 #endif
00266 
// Interleaves cn planar channels into one packed buffer:
//     dst[i*cn + c] = src[c][i]   for 0 <= i < len, 0 <= c < cn.
//
//   src - array of cn channel pointers; src[c] must address len elements
//   dst - output buffer with room for len*cn elements
//   len - number of elements per channel
//   cn  - number of channels
//
// Channels are handled in groups. The leading group holds cn % 4 channels
// (or 4 when cn is a multiple of 4); every following group holds exactly 4.
// The NEON/SSE kernels are used only for the leading group and only when
// cn is exactly 2, 3 or 4, i.e. when that group is the whole pixel; a scalar
// loop finishes whatever the vector loop did not cover.
//
// The call is a no-op when cn <= 0 or len <= 0.
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
    // Without this guard cn == 0 yields k == 4 (and a negative cn a negative k),
    // and the code below would read src[0..3], which do not exist.
    if( cn <= 0 || len <= 0 )
        return;

    // Number of channels in the leading group.
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        const T* src0 = src[0];
        for( i = j = 0; i < len; i++, j += cn )
            dst[j] = src0[i];
    }
    else if( k == 2 )
    {
        const T *src0 = src[0], *src1 = src[1];
        i = j = 0;
#if CV_NEON
        if(cn == 2)
        {
            // Elements consumed per channel by one kernel call.
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            // "<=" so the last complete block is vectorized too; the kernel
            // touches exactly [i, i + inc_i) of each channel, which stays in range.
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 2)
        {
            // Each kernel call consumes two 16-byte vectors per channel.
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            if (vmerge.support)
                // "<=" so the last complete block is vectorized too.
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, dst + j);
        }
#endif
        // Scalar path: everything when no SIMD ran, otherwise the tail.
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
        }
    }
    else if( k == 3 )
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
        i = j = 0;
#if CV_NEON
        if(cn == 3)
        {
            // Elements consumed per channel by one kernel call.
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            // "<=" so the last complete block is vectorized too.
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 3)
        {
            // Each kernel call consumes two 16-byte vectors per channel.
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            if (vmerge.support)
                // "<=" so the last complete block is vectorized too.
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#endif
        // Scalar path: everything when no SIMD ran, otherwise the tail.
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
            dst[j+2] = src2[i];
        }
    }
    else
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
        i = j = 0;
#if CV_NEON
        if(cn == 4)
        {
            // Elements consumed per channel by one kernel call.
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            // "<=" so the last complete block is vectorized too.
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 4)
        {
            // Each kernel call consumes two 16-byte vectors per channel.
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            if (vmerge.support)
                // "<=" so the last complete block is vectorized too.
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#endif
        // Scalar path: everything when no SIMD ran, otherwise the tail.
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }

    // Remaining channels, four at a time, written at channel offset k inside each pixel.
    for( ; k < cn; k += 4 )
    {
        const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }
}
00386 
00387 
00388 void merge8u(const uchar** src, uchar* dst, int len, int cn )
00389 {
00390     CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
00391     merge_(src, dst, len, cn);
00392 }
00393 
00394 void merge16u(const ushort** src, ushort* dst, int len, int cn )
00395 {
00396     CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
00397     merge_(src, dst, len, cn);
00398 }
00399 
00400 void merge32s(const int** src, int* dst, int len, int cn )
00401 {
00402     CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
00403     merge_(src, dst, len, cn);
00404 }
00405 
00406 void merge64s(const int64** src, int64* dst, int len, int cn )
00407 {
00408     CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
00409     merge_(src, dst, len, cn);
00410 }
00411 
00412 }}
00413