Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do


pyramids.cpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{

template<typename T, int shift> struct FixPtCast
{
    typedef int type1;
    typedef T rtype;
    rtype operator ()(type1 arg) const { return (T)((arg + (1 << (shift-1))) >> shift); }
};

template<typename T, int shift> struct FltCast
{
    typedef T type1;
    typedef T rtype;
    rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
};
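
// Both casts normalize the accumulated kernel sum by 2^shift: FixPtCast adds
// the 2^(shift-1) bias for round-to-nearest before shifting, while FltCast
// multiplies by the reciprocal. The 5-tap kernel [1 4 6 4 1] sums to 16 per
// axis, so pyrDown instantiates these with shift = 8 (1/256); pyrUp's 3-tap
// pass sums to 8 per axis, so it uses shift = 6 (1/64).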

template<typename T1, typename T2> struct PyrDownNoVec
{
    int operator()(T1**, T2*, int, int) const { return 0; }
};

template<typename T1, typename T2> struct PyrUpNoVec
{
    int operator()(T1**, T2**, int, int) const { return 0; }
};

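// The *NoVec stubs report zero pixels processed, so the scalar tail loops in
// pyrDown_/pyrUp_ below handle the entire row; the real vector versions
// return how far they got and leave only the remainder to the scalar code.
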
#if CV_SSE2

struct PyrDownVec_32s8u
{
    int operator()(int** src, uchar* dst, int, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i delta = _mm_set1_epi16(128);

        for( ; x <= width - 16; x += 16 )
        {
            __m128i r0, r1, r2, r3, r4, t0, t1;
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)),
                                 _mm_load_si128((const __m128i*)(row0 + x + 4)));
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)),
                                 _mm_load_si128((const __m128i*)(row1 + x + 4)));
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)),
                                 _mm_load_si128((const __m128i*)(row2 + x + 4)));
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)),
                                 _mm_load_si128((const __m128i*)(row3 + x + 4)));
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)),
                                 _mm_load_si128((const __m128i*)(row4 + x + 4)));
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row0 + x + 12)));
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row1 + x + 12)));
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row2 + x + 12)));
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row3 + x + 12)));
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row4 + x + 12)));
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8);
            t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8);
            _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128();
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z);
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z);
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z);
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z);
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z);
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8);
            *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0));
        }

        return x;
    }
};
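
// Note how the vertical filter above is factored: with r0' = r0 + r4 + 2*r2
// and r1' = r1 + r2 + r3, the value r0' + 4*r1' expands to
// r0 + 4*r1 + 6*r2 + 4*r3 + r4, i.e. the binomial weights [1 4 6 4 1];
// adding 128 before the >> 8 rounds the 1/256 normalization to nearest.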

struct PyrDownVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256);
        for( ; x <= width - 8; x += 8 )
        {
            __m128 r0, r1, r2, r3, r4, t0, t1;
            r0 = _mm_load_ps(row0 + x);
            r1 = _mm_load_ps(row1 + x);
            r2 = _mm_load_ps(row2 + x);
            r3 = _mm_load_ps(row3 + x);
            r4 = _mm_load_ps(row4 + x);
            r0 = _mm_add_ps(r0, r4);
            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
            t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));

            r0 = _mm_load_ps(row0 + x + 4);
            r1 = _mm_load_ps(row1 + x + 4);
            r2 = _mm_load_ps(row2 + x + 4);
            r3 = _mm_load_ps(row3 + x + 4);
            r4 = _mm_load_ps(row4 + x + 4);
            r0 = _mm_add_ps(r0, r4);
            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
            t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));

            t0 = _mm_mul_ps(t0, _scale);
            t1 = _mm_mul_ps(t1, _scale);

            _mm_storeu_ps(dst + x, t0);
            _mm_storeu_ps(dst + x + 4, t1);
        }

        return x;
    }
};

#if CV_SSE4_1

struct PyrDownVec_32s16u
{
    PyrDownVec_32s16u()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator()(int** src, ushort* dst, int, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i v_delta = _mm_set1_epi32(128);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));

            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);

            v_r10 = _mm_slli_epi32(v_r10, 2);
            __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);

            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
            v_r11 = _mm_slli_epi32(v_r11, 2);
            __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
        }

        return x;
    }

    bool haveSSE;
};

#else

typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;

#endif // CV_SSE4_1

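// Unlike PyrDownVec_32s8u, this functor caches the runtime CPU-feature check
// in its constructor, since operator() is invoked once per output row; when
// OpenCV is built without SSE4.1 the typedef above substitutes the NoVec stub.
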
struct PyrDownVec_32s16s
{
    PyrDownVec_32s16s()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(int** src, short* dst, int, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i v_delta = _mm_set1_epi32(128);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));

            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);

            v_r10 = _mm_slli_epi32(v_r10, 2);
            __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);

            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
            v_r11 = _mm_slli_epi32(v_r11, 2);
            __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
        }

        return x;
    }

    bool haveSSE;
};

struct PyrUpVec_32s8u
{
    int operator()(int** src, uchar** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        uchar *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 16; x += 16 )
        {
            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));

            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
            v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
            v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row2 + x + 12)));

            v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));

            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

struct PyrUpVec_32s16s
{
    int operator()(int** src, short** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        short *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            v_2r1 = _mm_slli_epi32(v_r1, 1);
            v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
                                _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
                                _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);

            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

#if CV_SSE4_1

struct PyrUpVec_32s16u
{
    int operator()(int** src, ushort** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE4_1))
            return x;

        ushort *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            v_2r1 = _mm_slli_epi32(v_r1, 1);
            v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
                                 _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
                                 _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);

            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

#else

typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;

#endif // CV_SSE4_1

struct PyrUpVec_32f
{
    int operator()(float** src, float** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
        float *dst0 = dst[0], *dst1 = dst[1];
        __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
               v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_r0 = _mm_loadu_ps(row0 + x);
            __m128 v_r1 = _mm_loadu_ps(row1 + x);
            __m128 v_r2 = _mm_loadu_ps(row2 + x);

            _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
            _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));

            v_r0 = _mm_loadu_ps(row0 + x + 4);
            v_r1 = _mm_loadu_ps(row1 + x + 4);
            v_r2 = _mm_loadu_ps(row2 + x + 4);

            _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
            _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
        }

        return x;
    }
};

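// pyrUp's vertical pass emits two output rows per input row: the even row as
// (r0 + 6*r1 + r2)/64 and the odd row as 4*(r1 + r2)/64, where the 1/64 folds
// the 8x gain of the horizontal pass with the 8x gain of this vertical pass.
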
#elif CV_NEON

struct PyrDownVec_32s8u
{
    int operator()(int** src, uchar* dst, int, int width) const
    {
        int x = 0;
        const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
                           *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
                           *row4 = (unsigned int*)src[4];
        uint16x8_t v_delta = vdupq_n_u16(128);

        for( ; x <= width - 16; x += 16 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
            uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
            uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));

            v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
            v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
            uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));

            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
            v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
            v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));

            v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
            v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
            uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));

            vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
                                          vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
        }

        return x;
    }
};

struct PyrDownVec_32s16u
{
    int operator()(int** src, ushort* dst, int, int width) const
    {
        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        int32x4_t v_delta = vdupq_n_s32(128);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);

            v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
            v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);

            v_r10 = vshlq_n_s32(v_r10, 2);
            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);

            v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
            v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
            v_r11 = vshlq_n_s32(v_r11, 2);
            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);

            vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
        }

        return x;
    }
};

struct PyrDownVec_32s16s
{
    int operator()(int** src, short* dst, int, int width) const
    {
        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        int32x4_t v_delta = vdupq_n_s32(128);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);

            v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
            v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
            v_r10 = vshlq_n_s32(v_r10, 2);
            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);

            v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
            v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
            v_r11 = vshlq_n_s32(v_r11, 2);
            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
        }

        return x;
    }
};

struct PyrDownVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_r0 = vld1q_f32(row0 + x);
            float32x4_t v_r1 = vld1q_f32(row1 + x);
            float32x4_t v_r2 = vld1q_f32(row2 + x);
            float32x4_t v_r3 = vld1q_f32(row3 + x);
            float32x4_t v_r4 = vld1q_f32(row4 + x);

            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
            vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));

            v_r0 = vld1q_f32(row0 + x + 4);
            v_r1 = vld1q_f32(row1 + x + 4);
            v_r2 = vld1q_f32(row2 + x + 4);
            v_r3 = vld1q_f32(row3 + x + 4);
            v_r4 = vld1q_f32(row4 + x + 4);

            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
            vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
        }

        return x;
    }
};

struct PyrUpVec_32s8u
{
    int operator()(int** src, uchar** dst, int, int width) const
    {
        int x = 0;
        uchar *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        uint16x8_t v_delta = vdupq_n_u16(32);

        for( ; x <= width - 16; x += 16 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));

            uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
            uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
            uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);

            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));

            v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
            uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
            uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);

            vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)),
                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6))));
            vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)),
                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6))));
        }

        for( ; x <= width - 8; x += 8 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));

            uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
            uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
            uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);

            vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6)));
            vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6)));
        }

        return x;
    }
};

struct PyrUpVec_32s16u
{
    int operator()(int** src, ushort** dst, int, int width) const
    {
        int x = 0;
        ushort *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        uint32x4_t v_delta = vdupq_n_u32(32);

        for( ; x <= width - 8; x += 8 )
        {
            uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
            uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
            uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
            uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);

            v_r0 = vld1q_u32(row0 + x + 4);
            v_r1 = vld1q_u32(row1 + x + 4);
            v_r2 = vld1q_u32(row2 + x + 4);
            v_2r1 = vshlq_n_u32(v_r1, 1);
            v_4r1 = vshlq_n_u32(v_r1, 2);
            uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
            uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);

            vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)),
                                             vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6))));
            vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)),
                                             vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6))));
        }

        for( ; x <= width - 4; x += 4 )
        {
            uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
            uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);

            uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
            uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);

            vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6)));
            vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6)));
        }

        return x;
    }
};

struct PyrUpVec_32s16s
{
    int operator()(int** src, short** dst, int, int width) const
    {
        int x = 0;
        short *dst0 = dst[0], *dst1 = dst[1];
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
        int32x4_t v_delta = vdupq_n_s32(32);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
            int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
            int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
            int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);

            v_r0 = vld1q_s32(row0 + x + 4);
            v_r1 = vld1q_s32(row1 + x + 4);
            v_r2 = vld1q_s32(row2 + x + 4);
            v_2r1 = vshlq_n_s32(v_r1, 1);
            v_4r1 = vshlq_n_s32(v_r1, 2);
            int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
            int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);

            vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)),
                                             vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6))));
            vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)),
                                             vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6))));
        }

        for( ; x <= width - 4; x += 4 )
        {
            int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
            int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);

            int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
            int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);

            vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6)));
            vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6)));
        }

        return x;
    }
};

struct PyrUpVec_32f
{
    int operator()(float** src, float** dst, int, int width) const
    {
        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
        float *dst0 = dst[0], *dst1 = dst[1];
        float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_r0 = vld1q_f32(row0 + x);
            float32x4_t v_r1 = vld1q_f32(row1 + x);
            float32x4_t v_r2 = vld1q_f32(row2 + x);

            vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
            vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));

            v_r0 = vld1q_f32(row0 + x + 4);
            v_r1 = vld1q_f32(row1 + x + 4);
            v_r2 = vld1q_f32(row2 + x + 4);

            vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
            vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
        }

        return x;
    }
};

#else

typedef PyrDownNoVec<int, uchar> PyrDownVec_32s8u;
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
typedef PyrDownNoVec<float, float> PyrDownVec_32f;

typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
typedef PyrUpNoVec<float, float> PyrUpVec_32f;

#endif

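// pyrDown_ applies the separable 5x5 Gaussian ([1 4 6 4 1]/16 per axis) and
// keeps every second pixel. It maintains a ring buffer of PD_SZ horizontally
// filtered rows so each source row is convolved only once; the vertical pass
// then combines five buffered rows per output row.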
template<class CastOp, class VecOp> void
pyrDown_( const Mat& _src, Mat& _dst, int borderType )
{
    const int PD_SZ = 5;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    CV_Assert( !_src.empty() );
    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize(dsize.width*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
    AutoBuffer<int> _tabM(dsize.width*cn);
    int* tabM = _tabM;
    WT* rows[PD_SZ];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
               std::abs(dsize.width*2 - ssize.width) <= 2 &&
               std::abs(dsize.height*2 - ssize.height) <= 2 );
    int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);

    for( x = 0; x <= PD_SZ+1; x++ )
    {
        int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
        int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
        for( k = 0; k < cn; k++ )
        {
            tabL[x*cn + k] = sx0 + k;
            tabR[x*cn + k] = sx1 + k;
        }
    }

    ssize.width *= cn;
    dsize.width *= cn;
    width0 *= cn;

    for( x = 0; x < dsize.width; x++ )
        tabM[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < dsize.height; y++ )
    {
        T* dst = _dst.ptr<T>(y);
        WT *row0, *row1, *row2, *row3, *row4;

        // fill the ring buffer (horizontal convolution and decimation)
        for( ; sy <= y*2 + 2; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
            int _sy = borderInterpolate(sy, ssize.height, borderType);
            const T* src = _src.ptr<T>(_sy);
            int limit = cn;
            const int* tab = tabL;

            for( x = 0;;)
            {
                for( ; x < limit; x++ )
                {
                    row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
                        src[tab[x]] + src[tab[x+cn*4]];
                }

                if( x == dsize.width )
                    break;

                if( cn == 1 )
                {
                    for( ; x < width0; x++ )
                        row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
                            src[x*2 - 2] + src[x*2 + 2];
                }
                else if( cn == 3 )
                {
                    for( ; x < width0; x += 3 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
                        WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
                        WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
                        row[x] = t0; row[x+1] = t1; row[x+2] = t2;
                    }
                }
                else if( cn == 4 )
                {
                    for( ; x < width0; x += 4 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
                        WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
                        row[x] = t0; row[x+1] = t1;
                        t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
                        t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
                        row[x+2] = t0; row[x+3] = t1;
                    }
                }
                else
                {
                    for( ; x < width0; x++ )
                    {
                        int sx = tabM[x];
                        row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
                            src[sx - cn*2] + src[sx + cn*2];
                    }
                }

                limit = dsize.width;
                tab = tabR - x;
            }
        }

        // do vertical convolution and decimation and write the result to the destination image
        for( k = 0; k < PD_SZ; k++ )
            rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];

        x = vecOp(rows, dst, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
            dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
    }
}
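
// For reference, a minimal sketch of calling the public wrapper defined
// further below (cv::pyrDown, declared in opencv2/imgproc.hpp); the image
// file name here is only a placeholder:
//
//     cv::Mat src = cv::imread("input.png");   // e.g. CV_8UC3
//     cv::Mat half;
//     cv::pyrDown(src, half);   // default dsize = ((cols+1)/2, (rows+1)/2)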


template<class CastOp, class VecOp> void
pyrUp_( const Mat& _src, Mat& _dst, int)
{
    const int PU_SZ = 3;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize((dsize.width+1)*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PU_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    AutoBuffer<int> _dtab(ssize.width*cn);
    int* dtab = _dtab;
    WT* rows[PU_SZ];
    T* dsts[2];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
               std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
    int k, x, sy0 = -PU_SZ/2, sy = sy0;

    ssize.width *= cn;
    dsize.width *= cn;

    for( x = 0; x < ssize.width; x++ )
        dtab[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < ssize.height; y++ )
    {
        T* dst0 = _dst.ptr<T>(y*2);
        T* dst1 = _dst.ptr<T>(std::min(y*2+1, dsize.height-1));
        WT *row0, *row1, *row2;

        // fill the ring buffer (horizontal convolution and upsampling)
        for( ; sy <= y + 1; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
            int _sy = borderInterpolate(sy*2, dsize.height, BORDER_REFLECT_101)/2;
            const T* src = _src.ptr<T>(_sy);

            if( ssize.width == cn )
            {
                for( x = 0; x < cn; x++ )
                    row[x] = row[x + cn] = src[x]*8;
                continue;
            }

            for( x = 0; x < cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x]*6 + src[x + cn]*2;
                WT t1 = (src[x] + src[x + cn])*4;
                row[dx] = t0; row[dx + cn] = t1;
                dx = dtab[ssize.width - cn + x];
                int sx = ssize.width - cn + x;
                t0 = src[sx - cn] + src[sx]*7;
                t1 = src[sx]*8;
                row[dx] = t0; row[dx + cn] = t1;
            }

            for( x = cn; x < ssize.width - cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x-cn] + src[x]*6 + src[x+cn];
                WT t1 = (src[x] + src[x+cn])*4;
                row[dx] = t0;
                row[dx+cn] = t1;
            }
        }

        // do vertical convolution and upsampling and write the result to the destination image
        for( k = 0; k < PU_SZ; k++ )
            rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
        dsts[0] = dst0; dsts[1] = dst1;

        x = vecOp(rows, dsts, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
        {
            T t1 = castOp((row1[x] + row2[x])*4);
            T t0 = castOp(row0[x] + row1[x]*6 + row2[x]);
            dst1[x] = t1; dst0[x] = t0;
        }
    }
}
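
// The matching upsampling sketch (assuming the `half` image from the pyrDown
// note above); cv::pyrUp doubles each dimension by default:
//
//     cv::Mat back;
//     cv::pyrUp(half, back);   // default dsize = (2*cols, 2*rows)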

typedef void (*PyrFunc)(const Mat&, Mat&, int);

#ifdef HAVE_OPENCL

static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (cn > 4 || (depth == CV_64F && !doubleSupport))
        return false;

    Size ssize = _src.size();
    Size dsize = _dsz.area() == 0 ? Size((ssize.width + 1) / 2, (ssize.height + 1) / 2) : _dsz;
    if (dsize.height < 2 || dsize.width < 2)
        return false;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
            std::abs(dsize.width*2 - ssize.width) <= 2 &&
            std::abs(dsize.height*2 - ssize.height) <= 2 );

    UMat src = _src.getUMat();
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 256;
    int kercn = 1;
    if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel())
        kercn = 4;
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                       "BORDER_REFLECT_101" };
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
            ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth),
            cn, kercn, float_depth, borderMap[borderType], local_size
    );
    ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));

    size_t localThreads[2]  = { (size_t)local_size/kercn, 1 };
    size_t globalThreads[2] = { ((size_t)src.cols + (kercn-1))/kercn, ((size_t)dst.rows + 1) / 2 };
    return k.run(2, globalThreads, localThreads, false);
}
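
// Each ocl_* helper returns false when the device, type, or kernel build is
// unsuitable; CV_OCL_RUN then falls through to the CPU implementation below.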

static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);

    if (channels > 4 || borderType != BORDER_DEFAULT)
        return false;

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (depth == CV_64F && !doubleSupport)
        return false;

    Size ssize = _src.size();
    if ((_dsz.area() != 0) && (_dsz != Size(ssize.width * 2, ssize.height * 2)))
        return false;

    UMat src = _src.getUMat();
    Size dsize = Size(ssize.width * 2, ssize.height * 2);
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 16;
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
            ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "",
            ocl::typeToStr(depth), channels, local_size
    );
    size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
    size_t localThreads[2] = { (size_t)local_size, (size_t)local_size };
    ocl::Kernel k;
    if (ocl::Device::getDefault().isIntel() && channels == 1)
    {
        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
    }
    else
        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);

    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
    return k.run(2, globalThreads, localThreads, false);
}

#endif

} // namespace cv

01169 #if defined(HAVE_IPP)
01170 namespace cv
01171 {
01172 static bool ipp_pyrdown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
01173 {
01174 #if IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
01175     Size dsz = _dsz.area() == 0 ? Size((_src.cols() + 1)/2, (_src.rows() + 1)/2) : _dsz;
01176     bool isolated = (borderType & BORDER_ISOLATED) != 0;
01177     int borderTypeNI = borderType & ~BORDER_ISOLATED;
01178 
01179     Mat src = _src.getMat();
01180     _dst.create( dsz, src.type() );
01181     Mat dst = _dst.getMat();
01182     int depth = src.depth();
01183 
01184 
01185     {
01186         bool isolated = (borderType & BORDER_ISOLATED) != 0;
01187         int borderTypeNI = borderType & ~BORDER_ISOLATED;
01188         if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size((src.cols + 1)/2, (src.rows + 1)/2))
01189         {
01190             typedef IppStatus (CV_STDCALL * ippiPyrDown)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
01191             int type = src.type();
01192             CV_SUPPRESS_DEPRECATED_START
01193             ippiPyrDown pyrDownFunc = type == CV_8UC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C1R :
01194                                       type == CV_8UC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C3R :
01195                                       type == CV_32FC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C1R :
01196                                       type == CV_32FC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C3R : 0;
01197             CV_SUPPRESS_DEPRECATED_END
01198 
01199             if (pyrDownFunc)
01200             {
01201                 int bufferSize;
01202                 IppiSize srcRoi = { src.cols, src.rows };
01203                 IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
01204                 CV_SUPPRESS_DEPRECATED_START
01205                 IppStatus ok = ippiPyrDownGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
01206                 CV_SUPPRESS_DEPRECATED_END
01207                 if (ok >= 0)
01208                 {
01209                     Ipp8u* buffer = ippsMalloc_8u(bufferSize);
01210                     ok = pyrDownFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
01211                     ippsFree(buffer);
01212 
01213                     if (ok >= 0)
01214                     {
01215                         CV_IMPL_ADD(CV_IMPL_IPP);
01216                         return true;
01217                     }
01218                 }
01219             }
01220         }
01221     }
01222 #else
01223     CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(_dsz); CV_UNUSED(borderType);
01224 #endif
01225     return false;
01226 }
01227 }
01228 #endif
01229 
01230 void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
01231 {
01232     CV_Assert(borderType != BORDER_CONSTANT);
01233 
01234 #ifdef HAVE_OPENCL
01235     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
01236                ocl_pyrDown(_src, _dst, _dsz, borderType))
01237 #endif
01238 
01239     Mat src = _src.getMat();
01240     Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
01241     _dst.create( dsz, src.type() );
01242     Mat dst = _dst.getMat();
01243     int depth = src.depth();
01244 
01245 #ifdef HAVE_TEGRA_OPTIMIZATION
01246     if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrDown(src, dst))
01247         return;
01248 #endif
01249 
01250 #ifdef HAVE_IPP
01251     bool isolated = (borderType & BORDER_ISOLATED) != 0;
01252     int borderTypeNI = borderType & ~BORDER_ISOLATED;
01253 #endif
01254     CV_IPP_RUN(borderTypeNI == BORDER_DEFAULT && (!_src.isSubmatrix() || isolated) && dsz == Size((_src.cols() + 1)/2, (_src.rows() + 1)/2),
01255         ipp_pyrdown( _src,  _dst,  _dsz,  borderType));
01256 
01257 
01258     PyrFunc func = 0;
01259     if( depth == CV_8U )
01260         func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
01261     else if( depth == CV_16S )
01262         func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
01263     else if( depth == CV_16U )
01264         func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
01265     else if( depth == CV_32F )
01266         func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
01267     else if( depth == CV_64F )
01268         func = pyrDown_<FltCast<double, 8>, PyrDownNoVec<double, double> >;
01269     else
01270         CV_Error( CV_StsUnsupportedFormat, "" );
01271 
01272     func( src, dst, borderType );
01273 }
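/* Editorial sketch, not part of the original file: typical use of the public
 * entry point above. pyrDown smooths with the 5x5 Gaussian kernel and keeps
 * every second row and column; an empty dsz yields ((cols+1)/2) x ((rows+1)/2).
 * On the CV_8U path, FixPtCast<uchar, 8> divides the accumulated sum by
 * 2^8 = 256, exactly the weight of the separable (1 4 6 4 1) kernel applied
 * along both axes (16 * 16 = 256). The file name below is hypothetical.
 *
 *     cv::Mat img = cv::imread("input.png");
 *     cv::Mat half;
 *     cv::pyrDown(img, half);   // e.g. 641x481 -> 321x241; BORDER_CONSTANT is rejected
 */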
01274 
01275 
01276 #if defined(HAVE_IPP)
01277 namespace cv
01278 {
01279 static bool ipp_pyrup( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
01280 {
01281 #if IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
01282     Size sz = _src.dims() <= 2 ? _src.size() : Size();
01283     Size dsz = _dsz.area() == 0 ? Size(_src.cols()*2, _src.rows()*2) : _dsz;
01284 
01285     Mat src = _src.getMat();
01286     _dst.create( dsz, src.type() );
01287     Mat dst = _dst.getMat();
01288     int depth = src.depth();
01289 
01290     {
01291         bool isolated = (borderType & BORDER_ISOLATED) != 0;
01292         int borderTypeNI = borderType & ~BORDER_ISOLATED;
01293         if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size(src.cols*2, src.rows*2))
01294         {
01295             typedef IppStatus (CV_STDCALL * ippiPyrUp)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
01296             int type = src.type();
01297             CV_SUPPRESS_DEPRECATED_START
01298             ippiPyrUp pyrUpFunc = type == CV_8UC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C1R :
01299                                   type == CV_8UC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C3R :
01300                                   type == CV_32FC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C1R :
01301                                   type == CV_32FC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C3R : 0;
01302             CV_SUPPRESS_DEPRECATED_END
01303 
01304             if (pyrUpFunc)
01305             {
01306                 int bufferSize;
01307                 IppiSize srcRoi = { src.cols, src.rows };
01308                 IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
01309                 CV_SUPPRESS_DEPRECATED_START
01310                 IppStatus ok = ippiPyrUpGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
01311                 CV_SUPPRESS_DEPRECATED_END
01312                 if (ok >= 0)
01313                 {
01314                     Ipp8u* buffer = ippsMalloc_8u(bufferSize);
01315                     ok = pyrUpFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
01316                     ippsFree(buffer);
01317 
01318                     if (ok >= 0)
01319                     {
01320                         CV_IMPL_ADD(CV_IMPL_IPP);
01321                         return true;
01322                     }
01323                 }
01324             }
01325         }
01326     }
01327 #else
01328     CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(_dsz); CV_UNUSED(borderType);
01329 #endif
01330     return false;
01331 }
01332 }
01333 #endif
01334 
01335 void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
01336 {
01337     CV_Assert(borderType == BORDER_DEFAULT);
01338 
01339 #ifdef HAVE_OPENCL
01340     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
01341                ocl_pyrUp(_src, _dst, _dsz, borderType))
01342 #endif
01343 
01344 
01345     Mat src = _src.getMat();
01346     Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
01347     _dst.create( dsz, src.type() );
01348     Mat dst = _dst.getMat();
01349     int depth = src.depth();
01350 
01351 #ifdef HAVE_TEGRA_OPTIMIZATION
01352     if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrUp(src, dst))
01353         return;
01354 #endif
01355 
01356 #ifdef HAVE_IPP
01357     bool isolated = (borderType & BORDER_ISOLATED) != 0;
01358     int borderTypeNI = borderType & ~BORDER_ISOLATED;
01359 #endif
01360     CV_IPP_RUN(borderTypeNI == BORDER_DEFAULT && (!_src.isSubmatrix() || isolated) && dsz == Size(_src.cols()*2, _src.rows()*2),
01361         ipp_pyrup( _src,  _dst,  _dsz,  borderType));
01362 
01363 
01364     PyrFunc func = 0;
01365     if( depth == CV_8U )
01366         func = pyrUp_<FixPtCast<uchar, 6>, PyrUpVec_32s8u >;
01367     else if( depth == CV_16S )
01368         func = pyrUp_<FixPtCast<short, 6>, PyrUpVec_32s16s >;
01369     else if( depth == CV_16U )
01370         func = pyrUp_<FixPtCast<ushort, 6>, PyrUpVec_32s16u >;
01371     else if( depth == CV_32F )
01372         func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
01373     else if( depth == CV_64F )
01374         func = pyrUp_<FltCast<double, 6>, PyrUpNoVec<double, double> >;
01375     else
01376         CV_Error( CV_StsUnsupportedFormat, "" );
01377 
01378     func( src, dst, borderType );
01379 }
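/* Editorial sketch, not part of the original file. pyrUp doubles the
 * resolution but is not an exact inverse of pyrDown, and the residual of a
 * down/up round trip is the classic Laplacian pyramid layer. On the CV_8U
 * path, FixPtCast<uchar, 6> divides by 2^6 = 64: after zero insertion each
 * pass of the 5-tap kernel contributes weights summing to 8, and 8 * 8 = 64.
 *
 *     cv::Mat down, up, lap;
 *     cv::pyrDown(img, down);
 *     cv::pyrUp(down, up, img.size());                    // pass the size for odd dimensions
 *     cv::subtract(img, up, lap, cv::noArray(), CV_16S);  // signed residual layer
 */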
01380 
01381 
01382 #ifdef HAVE_IPP
01383 namespace cv
01384 {
01385 static bool ipp_buildpyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
01386 {
01387 #if IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
01388     Mat src = _src.getMat();
01389     _dst.create( maxlevel + 1, 1, 0 );
01390     _dst.getMatRef(0) = src;
01391 
01392     int i=1;
01393 
01394     {
01395         bool isolated = (borderType & BORDER_ISOLATED) != 0;
01396         int borderTypeNI = borderType & ~BORDER_ISOLATED;
01397         if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated))
01398         {
01399             typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownInitAlloc)(void** ppState, IppiSize srcRoi, Ipp32f rate, void* pKernel, int kerSize, int mode);
01400             typedef IppStatus (CV_STDCALL * ippiPyramidLayerDown)(void* pSrc, int srcStep, IppiSize srcRoiSize, void* pDst, int dstStep, IppiSize dstRoiSize, void* pState);
01401             typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownFree)(void* pState);
01402 
01403             int type = src.type();
01404             int depth = src.depth();
01405             ippiPyramidLayerDownInitAlloc pyrInitAllocFunc = 0;
01406             ippiPyramidLayerDown pyrDownFunc = 0;
01407             ippiPyramidLayerDownFree pyrFreeFunc = 0;
01408 
01409             if (type == CV_8UC1)
01410             {
01411                 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C1R;
01412                 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C1R;
01413                 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C1R;
01414             }
01415             else if (type == CV_8UC3)
01416             {
01417                 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C3R;
01418                 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C3R;
01419                 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C3R;
01420             }
01421             else if (type == CV_32FC1)
01422             {
01423                 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C1R;
01424                 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C1R;
01425                 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C1R;
01426             }
01427             else if (type == CV_32FC3)
01428             {
01429                 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C3R;
01430                 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C3R;
01431                 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C3R;
01432             }
01433 
01434             if (pyrInitAllocFunc && pyrDownFunc && pyrFreeFunc)
01435             {
01436                 float rate = 2.f;
01437                 IppiSize srcRoi = { src.cols, src.rows };
01438                 IppiPyramid *gPyr;
01439                 IppStatus ok = ippiPyramidInitAlloc(&gPyr, maxlevel + 1, srcRoi, rate);
01440 
01441                 Ipp16s iKernel[5] = { 1, 4, 6, 4, 1 };
01442                 Ipp32f fKernel[5] = { 1.f, 4.f, 6.f, 4.f, 1.f };
01443                 void* kernel = depth >= CV_32F ? (void*) fKernel : (void*) iKernel;
01444 
01445                 if (ok >= 0) ok = pyrInitAllocFunc((void**) &(gPyr->pState), srcRoi, rate, kernel, 5, IPPI_INTER_LINEAR);
01446                 if (ok >= 0)
01447                 {
01448                     gPyr->pImage[0] = src.data;
01449                     gPyr->pStep[0] = (int) src.step;
01450                     gPyr->pRoi[0] = srcRoi;
01451                     for( ; i <= maxlevel; i++ )
01452                     {
01453                         IppiSize dstRoi;
01454                         ok = ippiGetPyramidDownROI(gPyr->pRoi[i-1], &dstRoi, rate);
01455                         Mat& dst = _dst.getMatRef(i);
01456                         dst.create(Size(dstRoi.width, dstRoi.height), type);
01457                         gPyr->pImage[i] = dst.data;
01458                         gPyr->pStep[i] = (int) dst.step;
01459                         gPyr->pRoi[i] = dstRoi;
01460 
01461                         if (ok >= 0) ok = pyrDownFunc(gPyr->pImage[i-1], gPyr->pStep[i-1], gPyr->pRoi[i-1],
01462                                                       gPyr->pImage[i], gPyr->pStep[i], gPyr->pRoi[i], gPyr->pState);
01463 
01464                         if (ok < 0)
01465                         {
01466                             pyrFreeFunc(gPyr->pState);
01467                             return false;
01468                         }
01469                         else
01470                         {
01471                             CV_IMPL_ADD(CV_IMPL_IPP);
01472                         }
01473                     }
01474                     pyrFreeFunc(gPyr->pState);
01475                 }
01476                 else
01477                 {
01478                     ippiPyramidFree(gPyr);
01479                     return false;
01480                 }
01481                 ippiPyramidFree(gPyr);
01482                 return true;
01483             }
01484         }
01485         return false;
01486     }
01487 #else
01488     CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(maxlevel); CV_UNUSED(borderType);
01489 #endif
01490     return false;
01491 }
01492 }
01493 #endif
01494 
01495 void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
01496 {
01497     CV_Assert(borderType != BORDER_CONSTANT);
01498 
01499     if (_src.dims() <= 2 && _dst.isUMatVector())
01500     {
01501         UMat  src = _src.getUMat();
01502         _dst.create( maxlevel + 1, 1, 0 );
01503         _dst.getUMatRef(0) = src;
01504         for( int i = 1; i <= maxlevel; i++ )
01505             pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
01506         return;
01507     }
01508 
01509     Mat src = _src.getMat();
01510     _dst.create( maxlevel + 1, 1, 0 );
01511     _dst.getMatRef(0) = src;
01512 
01513     int i=1;
01514 
01515     CV_IPP_RUN(((IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK) && ((borderType & ~BORDER_ISOLATED) == BORDER_DEFAULT && (!_src.isSubmatrix() || ((borderType & BORDER_ISOLATED) != 0)))),
01516         ipp_buildpyramid( _src,  _dst,  maxlevel,  borderType));
01517 
01518     for( ; i <= maxlevel; i++ )
01519         pyrDown( _dst.getMatRef(i-1), _dst.getMatRef(i), Size(), borderType );
01520 }
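/* Editorial sketch, not part of the original file. buildPyramid is repeated
 * pyrDown: level 0 shares the input data and each further level halves both
 * dimensions, rounding up, so maxlevel = 4 yields five images.
 *
 *     std::vector<cv::Mat> pyr;
 *     cv::buildPyramid(img, pyr, 4);
 *     // for a 640x480 img: pyr[0] 640x480, pyr[1] 320x240, ..., pyr[4] 40x30
 */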
01521 
01522 CV_IMPL void cvPyrDown( const void* srcarr, void* dstarr, int _filter )
01523 {
01524     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
01525 
01526     CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
01527     cv::pyrDown( src, dst, dst.size() );
01528 }
01529 
01530 CV_IMPL void cvPyrUp( const void* srcarr, void* dstarr, int _filter )
01531 {
01532     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
01533 
01534     CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
01535     cv::pyrUp( src, dst, dst.size() );
01536 }
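/* Editorial sketch, not part of the original file, for the legacy C wrappers
 * above. Unlike the C++ API they do not allocate the destination: the caller
 * must create dst at the target size beforehand, and only CV_GAUSSIAN_5x5 is
 * accepted as the filter.
 *
 *     CvMat* half = cvCreateMat((src->rows + 1)/2, (src->cols + 1)/2,
 *                               CV_MAT_TYPE(src->type));
 *     cvPyrDown(src, half, CV_GAUSSIAN_5x5);
 *     cvReleaseMat(&half);
 */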
01537 
01538 
01539 CV_IMPL void
01540 cvReleasePyramid( CvMat*** _pyramid, int extra_layers )
01541 {
01542     if( !_pyramid )
01543         CV_Error( CV_StsNullPtr, "" );
01544 
01545     if( *_pyramid )
01546         for( int i = 0; i <= extra_layers; i++ )
01547             cvReleaseMat( &(*_pyramid)[i] );
01548 
01549     cvFree( _pyramid );
01550 }
01551 
01552 
01553 CV_IMPL CvMat**
01554 cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
01555                  const CvSize* layer_sizes, CvArr* bufarr,
01556                  int calc, int filter )
01557 {
01558     const float eps = 0.1f;
01559     uchar* ptr = 0;
01560 
01561     CvMat stub, *src = cvGetMat( srcarr, &stub );
01562 
01563     if( extra_layers < 0 )
01564         CV_Error( CV_StsOutOfRange, "The number of extra layers must be non negative" );
01565 
01566     int i, layer_step, elem_size = CV_ELEM_SIZE(src->type);
01567     CvSize layer_size, size = cvGetMatSize(src);
01568 
01569     if( bufarr )
01570     {
01571         CvMat bstub, *buf;
01572         int bufsize = 0;
01573 
01574         buf = cvGetMat( bufarr, &bstub );
01575         bufsize = buf->rows*buf->cols*CV_ELEM_SIZE(buf->type);
01576         layer_size = size;
01577         for( i = 1; i <= extra_layers; i++ )
01578         {
01579             if( !layer_sizes )
01580             {
01581                 layer_size.width = cvRound(layer_size.width*rate+eps);
01582                 layer_size.height = cvRound(layer_size.height*rate+eps);
01583             }
01584             else
01585                 layer_size = layer_sizes[i-1];
01586             layer_step = layer_size.width*elem_size;
01587             bufsize -= layer_step*layer_size.height;
01588         }
01589 
01590         if( bufsize < 0 )
01591             CV_Error( CV_StsOutOfRange, "The buffer is too small to fit the pyramid" );
01592         ptr = buf->data.ptr;
01593     }
01594 
01595     CvMat** pyramid = (CvMat**)cvAlloc( (extra_layers+1)*sizeof(pyramid[0]) );
01596     memset( pyramid, 0, (extra_layers+1)*sizeof(pyramid[0]) );
01597 
01598     pyramid[0] = cvCreateMatHeader( size.height, size.width, src->type );
01599     cvSetData( pyramid[0], src->data.ptr, src->step );
01600     layer_size = size;
01601 
01602     for( i = 1; i <= extra_layers; i++ )
01603     {
01604         if( !layer_sizes )
01605         {
01606             layer_size.width = cvRound(layer_size.width*rate + eps);
01607             layer_size.height = cvRound(layer_size.height*rate + eps);
01608         }
01609         else
01610             layer_size = layer_sizes[i-1];
01611 
01612         if( bufarr )
01613         {
01614             pyramid[i] = cvCreateMatHeader( layer_size.height, layer_size.width, src->type );
01615             layer_step = layer_size.width*elem_size;
01616             cvSetData( pyramid[i], ptr, layer_step );
01617             ptr += layer_step*layer_size.height;
01618         }
01619         else
01620             pyramid[i] = cvCreateMat( layer_size.height, layer_size.width, src->type );
01621 
01622         if( calc )
01623             cvPyrDown( pyramid[i-1], pyramid[i], filter );
01624             //cvResize( pyramid[i-1], pyramid[i], CV_INTER_LINEAR );
01625     }
01626 
01627     return pyramid;
01628 }
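/* Editorial sketch, not part of the original file. cvCreatePyramid allocates
 * extra_layers + 1 matrices; element 0 aliases the source data, rate is the
 * per-level scale factor (0.5 halves every level), and calc != 0 fills the
 * layers via cvPyrDown. Release it with cvReleasePyramid, defined above,
 * using the same layer count.
 *
 *     CvMat** pyr = cvCreatePyramid(src, 3, 0.5, 0, 0, 1, CV_GAUSSIAN_5x5);
 *     // use pyr[0] .. pyr[3]
 *     cvReleasePyramid(&pyr, 3);
 */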
01629 
01630 /* End of file. */
01631