Opencv 3.1 project on GR-PEACH board

Fork of gr-peach-opencv-project by the do

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers demosaicing.cpp Source File

demosaicing.cpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 
00046 #include <limits>
00047 
00048 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
00049 
00050 namespace cv
00051 {
00052 
00053 
00054 //////////////////////////// Bayer Pattern -> RGB conversion /////////////////////////////
00055 
00056 template<typename T>
00057 class SIMDBayerStubInterpolator_
00058 {
00059 public:
00060     int bayer2Gray(const T*, int, T*, int, int, int, int) const
00061     {
00062         return 0;
00063     }
00064 
00065     int bayer2RGB(const T*, int, T*, int, int) const
00066     {
00067         return 0;
00068     }
00069 
00070     int bayer2RGBA(const T*, int, T*, int, int) const
00071     {
00072         return 0;
00073     }
00074 
00075     int bayer2RGB_EA(const T*, int, T*, int, int) const
00076     {
00077         return 0;
00078     }
00079 };
00080 
00081 #if CV_SSE2
00082 class SIMDBayerInterpolator_8u
00083 {
00084 public:
00085     SIMDBayerInterpolator_8u()
00086     {
00087         use_simd = checkHardwareSupport(CV_CPU_SSE2);
00088     }
00089 
00090     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
00091                    int width, int bcoeff, int gcoeff, int rcoeff) const
00092     {
00093         if( !use_simd )
00094             return 0;
00095 
00096         __m128i _b2y = _mm_set1_epi16((short)(rcoeff*2));
00097         __m128i _g2y = _mm_set1_epi16((short)(gcoeff*2));
00098         __m128i _r2y = _mm_set1_epi16((short)(bcoeff*2));
00099         const uchar* bayer_end = bayer + width;
00100 
00101         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
00102         {
00103             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
00104             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
00105             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
00106 
00107             __m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7),
00108                                        _mm_srli_epi16(_mm_slli_epi16(r2, 8), 7));
00109             __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
00110             b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1);
00111 
00112             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7));
00113             __m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7);
00114             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
00115             g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2);
00116 
00117             r0 = _mm_srli_epi16(r1, 8);
00118             r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2);
00119             r0 = _mm_slli_epi16(r0, 3);
00120 
00121             g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y));
00122             g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y));
00123             g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y));
00124             g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y));
00125             g0 = _mm_srli_epi16(g0, 2);
00126             g1 = _mm_srli_epi16(g1, 2);
00127             g0 = _mm_packus_epi16(g0, g0);
00128             g1 = _mm_packus_epi16(g1, g1);
00129             g0 = _mm_unpacklo_epi8(g0, g1);
00130             _mm_storeu_si128((__m128i*)dst, g0);
00131         }
00132 
00133         return (int)(bayer - (bayer_end - width));
00134     }
00135 
00136     int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
00137     {
00138         if( !use_simd )
00139             return 0;
00140         /*
00141          B G B G | B G B G | B G B G | B G B G
00142          G R G R | G R G R | G R G R | G R G R
00143          B G B G | B G B G | B G B G | B G B G
00144          */
00145 
00146         __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
00147         __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
00148         __m128i masklo = _mm_set1_epi16(0x00ff);
00149         const uchar* bayer_end = bayer + width;
00150 
00151         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
00152         {
00153             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
00154             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
00155             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
00156 
00157             __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
00158             __m128i nextb1 = _mm_srli_si128(b1, 2);
00159             __m128i b0 = _mm_add_epi16(b1, nextb1);
00160             b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
00161             b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
00162             // b0 b2 ... b14 b1 b3 ... b15
00163             b0 = _mm_packus_epi16(b0, b1);
00164 
00165             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
00166             __m128i g1 = _mm_and_si128(r1, masklo);
00167             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
00168             g1 = _mm_srli_si128(g1, 2);
00169             g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
00170             // g0 g2 ... g14 g1 g3 ... g15
00171             g0 = _mm_packus_epi16(g0, g1);
00172 
00173             r0 = _mm_srli_epi16(r1, 8);
00174             r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
00175             r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
00176             // r0 r2 ... r14 r1 r3 ... r15
00177             r0 = _mm_packus_epi16(r0, r1);
00178 
00179             b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
00180             b0 = _mm_xor_si128(b0, b1);
00181             r0 = _mm_xor_si128(r0, b1);
00182 
00183             // b1 g1 b3 g3 b5 g5...
00184             b1 = _mm_unpackhi_epi8(b0, g0);
00185             // b0 g0 b2 g2 b4 g4 ....
00186             b0 = _mm_unpacklo_epi8(b0, g0);
00187 
00188             // r1 0 r3 0 r5 0 ...
00189             r1 = _mm_unpackhi_epi8(r0, z);
00190             // r0 0 r2 0 r4 0 ...
00191             r0 = _mm_unpacklo_epi8(r0, z);
00192 
00193             // 0 b0 g0 r0 0 b2 g2 r2 ...
00194             g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
00195             // 0 b8 g8 r8 0 b10 g10 r10 ...
00196             g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
00197 
00198             // b1 g1 r1 0 b3 g3 r3 0 ...
00199             r0 = _mm_unpacklo_epi16(b1, r1);
00200             // b9 g9 r9 0 b11 g11 r11 0 ...
00201             r1 = _mm_unpackhi_epi16(b1, r1);
00202 
00203             // 0 b0 g0 r0 b1 g1 r1 0 ...
00204             b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
00205             // 0 b4 g4 r4 b5 g5 r5 0 ...
00206             b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
00207 
00208             _mm_storel_epi64((__m128i*)(dst-1+0), b0);
00209             _mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8));
00210             _mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
00211             _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
00212 
00213             // 0 b8 g8 r8 b9 g9 r9 0 ...
00214             g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
00215             // 0 b12 g12 r12 b13 g13 r13 0 ...
00216             g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
00217 
00218             _mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
00219             _mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8));
00220 
00221             _mm_storel_epi64((__m128i*)(dst-1+6*6), g1);
00222         }
00223 
00224         return (int)(bayer - (bayer_end - width));
00225     }
00226 
00227     int bayer2RGBA(const uchar*, int, uchar*, int, int) const
00228     {
00229         return 0;
00230     }
00231 
00232     int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
00233     {
00234         if (!use_simd)
00235             return 0;
00236 
00237         const uchar* bayer_end = bayer + width;
00238         __m128i masklow = _mm_set1_epi16(0x00ff);
00239         __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
00240         __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128();
00241         __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0);
00242 
00243         for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
00244         {
00245             /*
00246              B G B G | B G B G | B G B G | B G B G
00247              G R G R | G R G R | G R G R | G R G R
00248              B G B G | B G B G | B G B G | B G B G
00249              */
00250 
00251             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
00252             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
00253             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
00254 
00255             __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow));
00256             __m128i nextb1 = _mm_srli_si128(b1, 2);
00257             __m128i b0 = _mm_add_epi16(b1, nextb1);
00258             b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
00259             b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
00260             // b0 b2 ... b14 b1 b3 ... b15
00261             b0 = _mm_packus_epi16(b0, b1);
00262 
00263             // vertical sum
00264             __m128i r0g = _mm_srli_epi16(r0, 8);
00265             __m128i r2g = _mm_srli_epi16(r2, 8);
00266             __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1);
00267             // gorizontal sum
00268             __m128i g1 = _mm_and_si128(masklow, r1);
00269             __m128i nextg1 = _mm_srli_si128(g1, 2);
00270             __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1);
00271 
00272             // gradients
00273             __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g));
00274             __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1));
00275             __m128i gmask = _mm_cmpgt_epi16(gradg, gradv);
00276 
00277             __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full)));
00278             // g0 g2 ... g14 g1 g3 ...
00279             g0 = _mm_packus_epi16(g0, nextg1);
00280 
00281             r0 = _mm_srli_epi16(r1, 8);
00282             r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
00283             r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
00284             // r0 r2 ... r14 r1 r3 ... r15
00285             r0 = _mm_packus_epi16(r0, r1);
00286 
00287             b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
00288             b0 = _mm_xor_si128(b0, b1);
00289             r0 = _mm_xor_si128(r0, b1);
00290 
00291             // b1 g1 b3 g3 b5 g5...
00292             b1 = _mm_unpackhi_epi8(b0, g0);
00293             // b0 g0 b2 g2 b4 g4 ....
00294             b0 = _mm_unpacklo_epi8(b0, g0);
00295 
00296             // r1 0 r3 0 r5 0 ...
00297             r1 = _mm_unpackhi_epi8(r0, z);
00298             // r0 0 r2 0 r4 0 ...
00299             r0 = _mm_unpacklo_epi8(r0, z);
00300 
00301             // 0 b0 g0 r0 0 b2 g2 r2 ...
00302             g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
00303             // 0 b8 g8 r8 0 b10 g10 r10 ...
00304             g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
00305 
00306             // b1 g1 r1 0 b3 g3 r3 0 ...
00307             r0 = _mm_unpacklo_epi16(b1, r1);
00308             // b9 g9 r9 0 b11 g11 r11 0 ...
00309             r1 = _mm_unpackhi_epi16(b1, r1);
00310 
00311             // 0 b0 g0 r0 b1 g1 r1 0 ...
00312             b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
00313             // 0 b4 g4 r4 b5 g5 r5 0 ...
00314             b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
00315 
00316             _mm_storel_epi64((__m128i*)(dst+0), b0);
00317             _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8));
00318             _mm_storel_epi64((__m128i*)(dst+6*2), b1);
00319             _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8));
00320 
00321             // 0 b8 g8 r8 b9 g9 r9 0 ...
00322             g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
00323             // 0 b12 g12 r12 b13 g13 r13 0 ...
00324             g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
00325 
00326             _mm_storel_epi64((__m128i*)(dst+6*4), g0);
00327             _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8));
00328 
00329             _mm_storel_epi64((__m128i*)(dst+6*6), g1);
00330         }
00331 
00332         return int(bayer - (bayer_end - width));
00333     }
00334 
00335     bool use_simd;
00336 };
00337 #elif CV_NEON
00338 class SIMDBayerInterpolator_8u
00339 {
00340 public:
00341     SIMDBayerInterpolator_8u()
00342     {
00343     }
00344 
00345     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
00346                    int width, int bcoeff, int gcoeff, int rcoeff) const
00347     {
00348         /*
00349          B G B G | B G B G | B G B G | B G B G
00350          G R G R | G R G R | G R G R | G R G R
00351          B G B G | B G B G | B G B G | B G B G
00352          */
00353 
00354         uint16x8_t masklo = vdupq_n_u16(255);
00355         const uchar* bayer_end = bayer + width;
00356 
00357         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
00358         {
00359             uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
00360             uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
00361             uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
00362 
00363             uint16x8_t b1_ = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
00364             uint16x8_t b1 = vextq_u16(b1_, b1_, 1);
00365             uint16x8_t b0 = vaddq_u16(b1_, b1);
00366             // b0 = b0 b2 b4 ...
00367             // b1 = b1 b3 b5 ...
00368 
00369             uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
00370             uint16x8_t g1 = vandq_u16(r1, masklo);
00371             g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
00372             uint16x8_t rot = vextq_u16(g1, g1, 1);
00373             g1 = vshlq_n_u16(rot, 2);
00374             // g0 = b0 b2 b4 ...
00375             // g1 = b1 b3 b5 ...
00376 
00377             r0 = vshrq_n_u16(r1, 8);
00378             r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
00379             r0 = vshlq_n_u16(r0, 2);
00380             // r0 = r0 r2 r4 ...
00381             // r1 = r1 r3 r5 ...
00382 
00383             b0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b0), (short)(rcoeff*2)));
00384             b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*4)));
00385 
00386             g0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g0), (short)(gcoeff*2)));
00387             g1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g1), (short)(gcoeff*2)));
00388 
00389             r0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r0), (short)(bcoeff*2)));
00390             r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*4)));
00391 
00392             g0 = vaddq_u16(vaddq_u16(g0, b0), r0);
00393             g1 = vaddq_u16(vaddq_u16(g1, b1), r1);
00394 
00395             uint8x8x2_t p = vzip_u8(vrshrn_n_u16(g0, 2), vrshrn_n_u16(g1, 2));
00396             vst1_u8(dst, p.val[0]);
00397             vst1_u8(dst + 8, p.val[1]);
00398         }
00399 
00400         return (int)(bayer - (bayer_end - width));
00401     }
00402 
00403     int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
00404     {
00405         /*
00406          B G B G | B G B G | B G B G | B G B G
00407          G R G R | G R G R | G R G R | G R G R
00408          B G B G | B G B G | B G B G | B G B G
00409          */
00410         uint16x8_t masklo = vdupq_n_u16(255);
00411         uint8x16x3_t pix;
00412         const uchar* bayer_end = bayer + width;
00413 
00414         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
00415         {
00416             uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
00417             uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
00418             uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
00419 
00420             uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
00421             uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
00422             uint16x8_t b0 = vaddq_u16(b1, nextb1);
00423             // b0 b1 b2 ...
00424             uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
00425             pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
00426 
00427             uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
00428             uint16x8_t g1 = vandq_u16(r1, masklo);
00429             g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
00430             g1 = vextq_u16(g1, g1, 1);
00431             // g0 g1 g2 ...
00432             uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
00433             pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
00434 
00435             r0 = vshrq_n_u16(r1, 8);
00436             r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
00437             // r0 r1 r2 ...
00438             uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
00439             pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
00440 
00441             vst3q_u8(dst-1, pix);
00442         }
00443 
00444         return (int)(bayer - (bayer_end - width));
00445     }
00446 
00447     int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
00448     {
00449         /*
00450          B G B G | B G B G | B G B G | B G B G
00451          G R G R | G R G R | G R G R | G R G R
00452          B G B G | B G B G | B G B G | B G B G
00453          */
00454         uint16x8_t masklo = vdupq_n_u16(255);
00455         uint8x16x4_t pix;
00456         const uchar* bayer_end = bayer + width;
00457         pix.val[3] = vdupq_n_u8(255);
00458 
00459         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
00460         {
00461             uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
00462             uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
00463             uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
00464 
00465             uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
00466             uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
00467             uint16x8_t b0 = vaddq_u16(b1, nextb1);
00468             // b0 b1 b2 ...
00469             uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
00470             pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
00471 
00472             uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
00473             uint16x8_t g1 = vandq_u16(r1, masklo);
00474             g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
00475             g1 = vextq_u16(g1, g1, 1);
00476             // g0 g1 g2 ...
00477             uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
00478             pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
00479 
00480             r0 = vshrq_n_u16(r1, 8);
00481             r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
00482             // r0 r1 r2 ...
00483             uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
00484             pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
00485 
00486             vst4q_u8(dst-1, pix);
00487         }
00488 
00489         return (int)(bayer - (bayer_end - width));
00490     }
00491 
00492     int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
00493     {
00494         return 0;
00495     }
00496 };
00497 #else
00498 typedef SIMDBayerStubInterpolator_<uchar> SIMDBayerInterpolator_8u;
00499 #endif
00500 
00501 
00502 template<typename T, class SIMDInterpolator>
00503 class Bayer2Gray_Invoker :
00504     public ParallelLoopBody
00505 {
00506 public:
00507     Bayer2Gray_Invoker(const Mat& _srcmat, Mat& _dstmat, int _start_with_green, bool _brow,
00508         const Size& _size, int _bcoeff, int _rcoeff) :
00509         ParallelLoopBody(), srcmat(_srcmat), dstmat(_dstmat), Start_with_green(_start_with_green),
00510         Brow(_brow), size(_size), Bcoeff(_bcoeff), Rcoeff(_rcoeff)
00511     {
00512     }
00513 
00514     virtual void operator ()(const Range& range) const
00515     {
00516         SIMDInterpolator vecOp;
00517         const int G2Y = 9617;
00518         const int SHIFT = 14;
00519 
00520         const T* bayer0 = srcmat.ptr<T>();
00521         int bayer_step = (int)(srcmat.step/sizeof(T));
00522         T* dst0 = (T*)dstmat.data;
00523         int dst_step = (int)(dstmat.step/sizeof(T));
00524         int bcoeff = Bcoeff, rcoeff = Rcoeff;
00525         int start_with_green = Start_with_green;
00526         bool brow = Brow;
00527 
00528         dst0 += dst_step + 1;
00529 
00530         if (range.start % 2)
00531         {
00532             brow = !brow;
00533             std::swap(bcoeff, rcoeff);
00534             start_with_green = !start_with_green;
00535         }
00536 
00537         bayer0 += range.start * bayer_step;
00538         dst0 += range.start * dst_step;
00539 
00540         for(int i = range.start ; i < range.end; ++i, bayer0 += bayer_step, dst0 += dst_step )
00541         {
00542             unsigned t0, t1, t2;
00543             const T* bayer = bayer0;
00544             T* dst = dst0;
00545             const T* bayer_end = bayer + size.width;
00546 
00547             if( size.width <= 0 )
00548             {
00549                 dst[-1] = dst[size.width] = 0;
00550                 continue;
00551             }
00552 
00553             if( start_with_green )
00554             {
00555                 t0 = (bayer[1] + bayer[bayer_step*2+1])*rcoeff;
00556                 t1 = (bayer[bayer_step] + bayer[bayer_step+2])*bcoeff;
00557                 t2 = bayer[bayer_step+1]*(2*G2Y);
00558 
00559                 dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+1);
00560                 bayer++;
00561                 dst++;
00562             }
00563 
00564             int delta = vecOp.bayer2Gray(bayer, bayer_step, dst, size.width, bcoeff, G2Y, rcoeff);
00565             bayer += delta;
00566             dst += delta;
00567 
00568             for( ; bayer <= bayer_end - 2; bayer += 2, dst += 2 )
00569             {
00570                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2])*rcoeff;
00571                 t1 = (bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1])*G2Y;
00572                 t2 = bayer[bayer_step+1]*(4*bcoeff);
00573                 dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+2);
00574 
00575                 t0 = (bayer[2] + bayer[bayer_step*2+2])*rcoeff;
00576                 t1 = (bayer[bayer_step+1] + bayer[bayer_step+3])*bcoeff;
00577                 t2 = bayer[bayer_step+2]*(2*G2Y);
00578                 dst[1] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+1);
00579             }
00580 
00581             if( bayer < bayer_end )
00582             {
00583                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2])*rcoeff;
00584                 t1 = (bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1])*G2Y;
00585                 t2 = bayer[bayer_step+1]*(4*bcoeff);
00586                 dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+2);
00587                 bayer++;
00588                 dst++;
00589             }
00590 
00591             dst0[-1] = dst0[0];
00592             dst0[size.width] = dst0[size.width-1];
00593 
00594             brow = !brow;
00595             std::swap(bcoeff, rcoeff);
00596             start_with_green = !start_with_green;
00597         }
00598     }
00599 
00600 private:
00601     Mat srcmat;
00602     Mat dstmat;
00603     int Start_with_green;
00604     bool Brow;
00605     Size size;
00606     int Bcoeff, Rcoeff;
00607 };
00608 
00609 template<typename T, typename SIMDInterpolator>
00610 static void Bayer2Gray_( const Mat& srcmat, Mat& dstmat, int code )
00611 {
00612     const int R2Y = 4899;
00613     const int B2Y = 1868;
00614 
00615     Size size = srcmat.size();
00616     int bcoeff = B2Y, rcoeff = R2Y;
00617     int start_with_green = code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY;
00618     bool brow = true;
00619 
00620     if( code != CV_BayerBG2GRAY && code != CV_BayerGB2GRAY )
00621     {
00622         brow = false;
00623         std::swap(bcoeff, rcoeff);
00624     }
00625     size.height -= 2;
00626     size.width -= 2;
00627 
00628     if (size.height > 0)
00629     {
00630         Range range(0, size.height);
00631         Bayer2Gray_Invoker<T, SIMDInterpolator> invoker(srcmat, dstmat,
00632             start_with_green, brow, size, bcoeff, rcoeff);
00633         parallel_for_(range, invoker, dstmat.total()/static_cast<double>(1<<16));
00634     }
00635 
00636     size = dstmat.size();
00637     T* dst0 = dstmat.ptr<T>();
00638     int dst_step = (int)(dstmat.step/sizeof(T));
00639     if( size.height > 2 )
00640         for( int i = 0; i < size.width; i++ )
00641         {
00642             dst0[i] = dst0[i + dst_step];
00643             dst0[i + (size.height-1)*dst_step] = dst0[i + (size.height-2)*dst_step];
00644         }
00645     else
00646         for( int i = 0; i < size.width; i++ )
00647             dst0[i] = dst0[i + (size.height-1)*dst_step] = 0;
00648 }
00649 
00650 template <typename T>
00651 struct Alpha
00652 {
00653     static T value() { return std::numeric_limits<T>::max(); }
00654 };
00655 
00656 template <>
00657 struct Alpha<float>
00658 {
00659     static float value() { return 1.0f; }
00660 };
00661 
00662 template <typename T, typename SIMDInterpolator>
00663 class Bayer2RGB_Invoker :
00664     public ParallelLoopBody
00665 {
00666 public:
00667     Bayer2RGB_Invoker(const Mat& _srcmat, Mat& _dstmat, int _start_with_green, int _blue, const Size& _size) :
00668         ParallelLoopBody(),
00669         srcmat(_srcmat), dstmat(_dstmat), Start_with_green(_start_with_green), Blue(_blue), size(_size)
00670     {
00671     }
00672 
00673     virtual void operator() (const Range& range) const
00674     {
00675         SIMDInterpolator vecOp;
00676         T alpha = Alpha<T>::value();
00677         int dcn = dstmat.channels();
00678         int dcn2 = dcn << 1;
00679 
00680         int bayer_step = (int)(srcmat.step/sizeof(T));
00681         const T* bayer0 = srcmat.ptr<T>() + bayer_step * range.start;
00682 
00683         int dst_step = (int)(dstmat.step/sizeof(T));
00684         T* dst0 = reinterpret_cast<T*>(dstmat.data) + (range.start + 1) * dst_step + dcn + 1;
00685 
00686         int blue = Blue, start_with_green = Start_with_green;
00687         if (range.start % 2)
00688         {
00689             blue = -blue;
00690             start_with_green = !start_with_green;
00691         }
00692 
00693         for (int i = range.start; i < range.end; bayer0 += bayer_step, dst0 += dst_step, ++i )
00694         {
00695             int t0, t1;
00696             const T* bayer = bayer0;
00697             T* dst = dst0;
00698             const T* bayer_end = bayer + size.width;
00699 
00700             // in case of when size.width <= 2
00701             if( size.width <= 0 )
00702             {
00703                 if (dcn == 3)
00704                 {
00705                     dst[-4] = dst[-3] = dst[-2] = dst[size.width*dcn-1] =
00706                     dst[size.width*dcn] = dst[size.width*dcn+1] = 0;
00707                 }
00708                 else
00709                 {
00710                     dst[-5] = dst[-4] = dst[-3] = dst[size.width*dcn-1] =
00711                     dst[size.width*dcn] = dst[size.width*dcn+1] = 0;
00712                     dst[-2] = dst[size.width*dcn+2] = alpha;
00713                 }
00714                 continue;
00715             }
00716 
00717             if( start_with_green )
00718             {
00719                 t0 = (bayer[1] + bayer[bayer_step*2+1] + 1) >> 1;
00720                 t1 = (bayer[bayer_step] + bayer[bayer_step+2] + 1) >> 1;
00721 
00722                 dst[-blue] = (T)t0;
00723                 dst[0] = bayer[bayer_step+1];
00724                 dst[blue] = (T)t1;
00725                 if (dcn == 4)
00726                     dst[2] = alpha; // alpha channel
00727 
00728                 bayer++;
00729                 dst += dcn;
00730             }
00731 
00732             // simd optimization only for dcn == 3
00733             int delta = dcn == 4 ?
00734                 vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
00735                 vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
00736             bayer += delta;
00737             dst += delta*dcn;
00738 
00739             if (dcn == 3) // Bayer to BGR
00740             {
00741                 if( blue > 0 )
00742                 {
00743                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
00744                     {
00745                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
00746                               bayer[bayer_step*2+2] + 2) >> 2;
00747                         t1 = (bayer[1] + bayer[bayer_step] +
00748                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
00749                         dst[-1] = (T)t0;
00750                         dst[0] = (T)t1;
00751                         dst[1] = bayer[bayer_step+1];
00752 
00753                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
00754                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
00755                         dst[2] = (T)t0;
00756                         dst[3] = bayer[bayer_step+2];
00757                         dst[4] = (T)t1;
00758                     }
00759                 }
00760                 else
00761                 {
00762                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
00763                     {
00764                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
00765                               bayer[bayer_step*2+2] + 2) >> 2;
00766                         t1 = (bayer[1] + bayer[bayer_step] +
00767                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
00768                         dst[1] = (T)t0;
00769                         dst[0] = (T)t1;
00770                         dst[-1] = bayer[bayer_step+1];
00771 
00772                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
00773                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
00774                         dst[4] = (T)t0;
00775                         dst[3] = bayer[bayer_step+2];
00776                         dst[2] = (T)t1;
00777                     }
00778                 }
00779             }
00780             else // Bayer to BGRA
00781             {
00782                 // if current row does not contain Blue pixels
00783                 if( blue > 0 )
00784                 {
00785                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
00786                     {
00787                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
00788                               bayer[bayer_step*2+2] + 2) >> 2;
00789                         t1 = (bayer[1] + bayer[bayer_step] +
00790                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
00791                         dst[-1] = (T)t0;
00792                         dst[0] = (T)t1;
00793                         dst[1] = bayer[bayer_step+1];
00794                         dst[2] = alpha; // alpha channel
00795 
00796                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
00797                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
00798                         dst[3] = (T)t0;
00799                         dst[4] = bayer[bayer_step+2];
00800                         dst[5] = (T)t1;
00801                         dst[6] = alpha; // alpha channel
00802                     }
00803                 }
00804                 else // if current row contains Blue pixels
00805                 {
00806                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
00807                     {
00808                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
00809                               bayer[bayer_step*2+2] + 2) >> 2;
00810                         t1 = (bayer[1] + bayer[bayer_step] +
00811                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
00812                         dst[-1] = bayer[bayer_step+1];
00813                         dst[0] = (T)t1;
00814                         dst[1] = (T)t0;
00815                         dst[2] = alpha; // alpha channel
00816 
00817                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
00818                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
00819                         dst[3] = (T)t1;
00820                         dst[4] = bayer[bayer_step+2];
00821                         dst[5] = (T)t0;
00822                         dst[6] = alpha; // alpha channel
00823                     }
00824                 }
00825             }
00826 
00827             // if skip one pixel at the end of row
00828             if( bayer < bayer_end )
00829             {
00830                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
00831                       bayer[bayer_step*2+2] + 2) >> 2;
00832                 t1 = (bayer[1] + bayer[bayer_step] +
00833                       bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
00834                 dst[-blue] = (T)t0;
00835                 dst[0] = (T)t1;
00836                 dst[blue] = bayer[bayer_step+1];
00837                 if (dcn == 4)
00838                     dst[2] = alpha; // alpha channel
00839                 bayer++;
00840                 dst += dcn;
00841             }
00842 
00843             // fill the last and the first pixels of row accordingly
00844             if (dcn == 3)
00845             {
00846                 dst0[-4] = dst0[-1];
00847                 dst0[-3] = dst0[0];
00848                 dst0[-2] = dst0[1];
00849                 dst0[size.width*dcn-1] = dst0[size.width*dcn-4];
00850                 dst0[size.width*dcn] = dst0[size.width*dcn-3];
00851                 dst0[size.width*dcn+1] = dst0[size.width*dcn-2];
00852             }
00853             else
00854             {
00855                 dst0[-5] = dst0[-1];
00856                 dst0[-4] = dst0[0];
00857                 dst0[-3] = dst0[1];
00858                 dst0[-2] = dst0[2]; // alpha channel
00859                 dst0[size.width*dcn-1] = dst0[size.width*dcn-5];
00860                 dst0[size.width*dcn] = dst0[size.width*dcn-4];
00861                 dst0[size.width*dcn+1] = dst0[size.width*dcn-3];
00862                 dst0[size.width*dcn+2] = dst0[size.width*dcn-2]; // alpha channel
00863             }
00864 
00865             blue = -blue;
00866             start_with_green = !start_with_green;
00867         }
00868     }
00869 
00870 private:
00871     Mat srcmat;
00872     Mat dstmat;
00873     int Start_with_green, Blue;
00874     Size size;
00875 };
00876 
00877 template<typename T, class SIMDInterpolator>
00878 static void Bayer2RGB_( const Mat& srcmat, Mat& dstmat, int code )
00879 {
00880     int dst_step = (int)(dstmat.step/sizeof(T));
00881     Size size = srcmat.size();
00882     int blue = code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ? -1 : 1;
00883     int start_with_green = code == CV_BayerGB2BGR || code == CV_BayerGR2BGR;
00884 
00885     int dcn = dstmat.channels();
00886     size.height -= 2;
00887     size.width -= 2;
00888 
00889     if (size.height > 0)
00890     {
00891         Range range(0, size.height);
00892         Bayer2RGB_Invoker<T, SIMDInterpolator> invoker(srcmat, dstmat, start_with_green, blue, size);
00893         parallel_for_(range, invoker, dstmat.total()/static_cast<double>(1<<16));
00894     }
00895 
00896     // filling the first and the last rows
00897     size = dstmat.size();
00898     T* dst0 = dstmat.ptr<T>();
00899     if( size.height > 2 )
00900         for( int i = 0; i < size.width*dcn; i++ )
00901         {
00902             dst0[i] = dst0[i + dst_step];
00903             dst0[i + (size.height-1)*dst_step] = dst0[i + (size.height-2)*dst_step];
00904         }
00905     else
00906         for( int i = 0; i < size.width*dcn; i++ )
00907             dst0[i] = dst0[i + (size.height-1)*dst_step] = 0;
00908 }
00909 
00910 
00911 /////////////////// Demosaicing using Variable Number of Gradients ///////////////////////
00912 
00913 static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
00914 {
00915     const uchar* bayer = srcmat.ptr();
00916     int bstep = (int)srcmat.step;
00917     uchar* dst = dstmat.ptr();
00918     int dststep = (int)dstmat.step;
00919     Size size = srcmat.size();
00920 
00921     int blueIdx = code == CV_BayerBG2BGR_VNG || code == CV_BayerGB2BGR_VNG ? 0 : 2;
00922     bool greenCell0 = code != CV_BayerBG2BGR_VNG && code != CV_BayerRG2BGR_VNG;
00923 
00924     // for too small images use the simple interpolation algorithm
00925     if( MIN(size.width, size.height) < 8 )
00926     {
00927         Bayer2RGB_<uchar, SIMDBayerInterpolator_8u>( srcmat, dstmat, code );
00928         return;
00929     }
00930 
00931     const int brows = 3, bcn = 7;
00932     int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;
00933     int i, bufstep = N7*bcn;
00934     cv::AutoBuffer<ushort> _buf(bufstep*brows);
00935     ushort* buf = (ushort*)_buf;
00936 
00937     bayer += bstep*2;
00938 
00939 #if CV_SSE2
00940     bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2);
00941     #define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
00942 #endif
00943 
00944     for( int y = 2; y < size.height - 4; y++ )
00945     {
00946         uchar* dstrow = dst + dststep*y + 6;
00947         const uchar* srow;
00948 
00949         for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ )
00950         {
00951             ushort* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
00952             srow = bayer + (y+dy)*bstep + 1;
00953 
00954             for( i = 0; i < bcn; i++ )
00955                 brow[N*i-1] = brow[(N-2) + N*i] = 0;
00956 
00957             i = 1;
00958 
00959 #if CV_SSE2
00960             if( haveSSE )
00961             {
00962                 __m128i z = _mm_setzero_si128();
00963                 for( ; i <= N-9; i += 8, srow += 8, brow += 8 )
00964                 {
00965                     __m128i s1, s2, s3, s4, s6, s7, s8, s9;
00966 
00967                     s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z);
00968                     s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z);
00969                     s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z);
00970 
00971                     s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z);
00972                     s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z);
00973 
00974                     s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z);
00975                     s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z);
00976                     s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z);
00977 
00978                     __m128i b0, b1, b2, b3, b4, b5, b6;
00979 
00980                     b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1),
00981                                         _mm_adds_epu16(_mm_absdiff_epu16(s1, s7),
00982                                                        _mm_absdiff_epu16(s3, s9)));
00983                     b1 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s4,s6),1),
00984                                         _mm_adds_epu16(_mm_absdiff_epu16(s1, s3),
00985                                                        _mm_absdiff_epu16(s7, s9)));
00986                     b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1);
00987                     b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1);
00988 
00989                     _mm_storeu_si128((__m128i*)brow, b0);
00990                     _mm_storeu_si128((__m128i*)(brow + N), b1);
00991                     _mm_storeu_si128((__m128i*)(brow + N2), b2);
00992                     _mm_storeu_si128((__m128i*)(brow + N3), b3);
00993 
00994                     b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4),
00995                                                           _mm_absdiff_epu16(s6, s8)));
00996                     b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6),
00997                                                           _mm_absdiff_epu16(s4, s8)));
00998                     b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8));
00999                     b6 = _mm_srli_epi16(b6, 1);
01000 
01001                     _mm_storeu_si128((__m128i*)(brow + N4), b4);
01002                     _mm_storeu_si128((__m128i*)(brow + N5), b5);
01003                     _mm_storeu_si128((__m128i*)(brow + N6), b6);
01004                 }
01005             }
01006 #endif
01007 
01008             for( ; i < N-1; i++, srow++, brow++ )
01009             {
01010                 brow[0] = (ushort)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
01011                                    std::abs(srow[-bstep] - srow[+bstep])*2 +
01012                                    std::abs(srow[1-bstep] - srow[1+bstep]));
01013                 brow[N] = (ushort)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
01014                                    std::abs(srow[-1] - srow[1])*2 +
01015                                    std::abs(srow[-1+bstep] - srow[1+bstep]));
01016                 brow[N2] = (ushort)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
01017                 brow[N3] = (ushort)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
01018                 brow[N4] = (ushort)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
01019                                     std::abs(srow[+bstep] - srow[1]));
01020                 brow[N5] = (ushort)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
01021                                     std::abs(srow[+bstep] - srow[-1]));
01022                 brow[N6] = (ushort)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
01023             }
01024         }
01025 
01026         const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
01027         const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
01028         const ushort* brow2 = buf + (y % brows)*bufstep + 2;
01029         static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f };
01030         srow = bayer + y*bstep + 2;
01031         bool greenCell = greenCell0;
01032 
01033         i = 2;
01034 #if CV_SSE2
01035         int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
01036 #else
01037         int limit = N - 2;
01038 #endif
01039 
01040         do
01041         {
01042             for( ; i < limit; i++, srow++, brow0++, brow1++, brow2++, dstrow += 3 )
01043             {
01044                 int gradN = brow0[0] + brow1[0];
01045                 int gradS = brow1[0] + brow2[0];
01046                 int gradW = brow1[N-1] + brow1[N];
01047                 int gradE = brow1[N] + brow1[N+1];
01048                 int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
01049                 int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
01050                 int R, G, B;
01051 
01052                 if( !greenCell )
01053                 {
01054                     int gradNE = brow0[N4+1] + brow1[N4];
01055                     int gradSW = brow1[N4] + brow2[N4-1];
01056                     int gradNW = brow0[N5-1] + brow1[N5];
01057                     int gradSE = brow1[N5] + brow2[N5+1];
01058 
01059                     minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
01060                     maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
01061                     int T = minGrad + MAX(maxGrad/2, 1);
01062 
01063                     int Rs = 0, Gs = 0, Bs = 0, ng = 0;
01064                     if( gradN < T )
01065                     {
01066                         Rs += srow[-bstep*2] + srow[0];
01067                         Gs += srow[-bstep]*2;
01068                         Bs += srow[-bstep-1] + srow[-bstep+1];
01069                         ng++;
01070                     }
01071                     if( gradS < T )
01072                     {
01073                         Rs += srow[bstep*2] + srow[0];
01074                         Gs += srow[bstep]*2;
01075                         Bs += srow[bstep-1] + srow[bstep+1];
01076                         ng++;
01077                     }
01078                     if( gradW < T )
01079                     {
01080                         Rs += srow[-2] + srow[0];
01081                         Gs += srow[-1]*2;
01082                         Bs += srow[-bstep-1] + srow[bstep-1];
01083                         ng++;
01084                     }
01085                     if( gradE < T )
01086                     {
01087                         Rs += srow[2] + srow[0];
01088                         Gs += srow[1]*2;
01089                         Bs += srow[-bstep+1] + srow[bstep+1];
01090                         ng++;
01091                     }
01092                     if( gradNE < T )
01093                     {
01094                         Rs += srow[-bstep*2+2] + srow[0];
01095                         Gs += brow0[N6+1];
01096                         Bs += srow[-bstep+1]*2;
01097                         ng++;
01098                     }
01099                     if( gradSW < T )
01100                     {
01101                         Rs += srow[bstep*2-2] + srow[0];
01102                         Gs += brow2[N6-1];
01103                         Bs += srow[bstep-1]*2;
01104                         ng++;
01105                     }
01106                     if( gradNW < T )
01107                     {
01108                         Rs += srow[-bstep*2-2] + srow[0];
01109                         Gs += brow0[N6-1];
01110                         Bs += srow[-bstep+1]*2;
01111                         ng++;
01112                     }
01113                     if( gradSE < T )
01114                     {
01115                         Rs += srow[bstep*2+2] + srow[0];
01116                         Gs += brow2[N6+1];
01117                         Bs += srow[-bstep+1]*2;
01118                         ng++;
01119                     }
01120                     R = srow[0];
01121                     G = R + cvRound((Gs - Rs)*scale[ng]);
01122                     B = R + cvRound((Bs - Rs)*scale[ng]);
01123                 }
01124                 else
01125                 {
01126                     int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
01127                     int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
01128                     int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
01129                     int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
01130 
01131                     minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
01132                     maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
01133                     int T = minGrad + MAX(maxGrad/2, 1);
01134 
01135                     int Rs = 0, Gs = 0, Bs = 0, ng = 0;
01136                     if( gradN < T )
01137                     {
01138                         Rs += srow[-bstep*2-1] + srow[-bstep*2+1];
01139                         Gs += srow[-bstep*2] + srow[0];
01140                         Bs += srow[-bstep]*2;
01141                         ng++;
01142                     }
01143                     if( gradS < T )
01144                     {
01145                         Rs += srow[bstep*2-1] + srow[bstep*2+1];
01146                         Gs += srow[bstep*2] + srow[0];
01147                         Bs += srow[bstep]*2;
01148                         ng++;
01149                     }
01150                     if( gradW < T )
01151                     {
01152                         Rs += srow[-1]*2;
01153                         Gs += srow[-2] + srow[0];
01154                         Bs += srow[-bstep-2]+srow[bstep-2];
01155                         ng++;
01156                     }
01157                     if( gradE < T )
01158                     {
01159                         Rs += srow[1]*2;
01160                         Gs += srow[2] + srow[0];
01161                         Bs += srow[-bstep+2]+srow[bstep+2];
01162                         ng++;
01163                     }
01164                     if( gradNE < T )
01165                     {
01166                         Rs += srow[-bstep*2+1] + srow[1];
01167                         Gs += srow[-bstep+1]*2;
01168                         Bs += srow[-bstep] + srow[-bstep+2];
01169                         ng++;
01170                     }
01171                     if( gradSW < T )
01172                     {
01173                         Rs += srow[bstep*2-1] + srow[-1];
01174                         Gs += srow[bstep-1]*2;
01175                         Bs += srow[bstep] + srow[bstep-2];
01176                         ng++;
01177                     }
01178                     if( gradNW < T )
01179                     {
01180                         Rs += srow[-bstep*2-1] + srow[-1];
01181                         Gs += srow[-bstep-1]*2;
01182                         Bs += srow[-bstep-2]+srow[-bstep];
01183                         ng++;
01184                     }
01185                     if( gradSE < T )
01186                     {
01187                         Rs += srow[bstep*2+1] + srow[1];
01188                         Gs += srow[bstep+1]*2;
01189                         Bs += srow[bstep+2]+srow[bstep];
01190                         ng++;
01191                     }
01192                     G = srow[0];
01193                     R = G + cvRound((Rs - Gs)*scale[ng]);
01194                     B = G + cvRound((Bs - Gs)*scale[ng]);
01195                 }
01196                 dstrow[blueIdx] = cv::saturate_cast<uchar>(B);
01197                 dstrow[1] = cv::saturate_cast<uchar>(G);
01198                 dstrow[blueIdx^2] = cv::saturate_cast<uchar>(R);
01199                 greenCell = !greenCell;
01200             }
01201 
01202 #if CV_SSE2
01203             if( !haveSSE )
01204                 break;
01205 
01206             __m128i emask    = _mm_set1_epi32(0x0000ffff),
01207                     omask    = _mm_set1_epi32(0xffff0000),
01208                     z        = _mm_setzero_si128(),
01209                     one      = _mm_set1_epi16(1);
01210             __m128 _0_5      = _mm_set1_ps(0.5f);
01211 
01212             #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
01213             #define _mm_cvtloepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
01214             #define _mm_cvthiepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
01215             #define _mm_loadl_u8_s16(ptr, offset) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)((ptr) + (offset))), z) //load 8 uchars to 8 shorts
01216 
01217             // process 8 pixels at once
01218             for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
01219             {
01220                 //int gradN = brow0[0] + brow1[0];
01221                 __m128i gradN = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow0), _mm_loadu_si128((__m128i*)brow1));
01222 
01223                 //int gradS = brow1[0] + brow2[0];
01224                 __m128i gradS = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow1), _mm_loadu_si128((__m128i*)brow2));
01225 
01226                 //int gradW = brow1[N-1] + brow1[N];
01227                 __m128i gradW = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N-1)), _mm_loadu_si128((__m128i*)(brow1+N)));
01228 
01229                 //int gradE = brow1[N+1] + brow1[N];
01230                 __m128i gradE = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N+1)), _mm_loadu_si128((__m128i*)(brow1+N)));
01231 
01232                 //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
01233                 //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
01234                 __m128i minGrad = _mm_min_epi16(_mm_min_epi16(gradN, gradS), _mm_min_epi16(gradW, gradE));
01235                 __m128i maxGrad = _mm_max_epi16(_mm_max_epi16(gradN, gradS), _mm_max_epi16(gradW, gradE));
01236 
01237                 __m128i grad0, grad1;
01238 
01239                 //int gradNE = brow0[N4+1] + brow1[N4];
01240                 //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
01241                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
01242                 grad1 = _mm_adds_epi16( _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N2)), _mm_loadu_si128((__m128i*)(brow0+N2+1))),
01243                                         _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2+1))));
01244                 __m128i gradNE = _mm_merge_epi16(grad0, grad1);
01245 
01246                 //int gradSW = brow1[N4] + brow2[N4-1];
01247                 //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
01248                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
01249                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N2)), _mm_loadu_si128((__m128i*)(brow2+N2-1))),
01250                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2-1))));
01251                 __m128i gradSW = _mm_merge_epi16(grad0, grad1);
01252 
01253                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
01254                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
01255 
01256                 //int gradNW = brow0[N5-1] + brow1[N5];
01257                 //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
01258                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
01259                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N3)), _mm_loadu_si128((__m128i*)(brow0+N3-1))),
01260                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3-1))));
01261                 __m128i gradNW = _mm_merge_epi16(grad0, grad1);
01262 
01263                 //int gradSE = brow1[N5] + brow2[N5+1];
01264                 //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
01265                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
01266                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N3)), _mm_loadu_si128((__m128i*)(brow2+N3+1))),
01267                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3+1))));
01268                 __m128i gradSE = _mm_merge_epi16(grad0, grad1);
01269 
01270                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
01271                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
01272 
01273                 //int T = minGrad + maxGrad/2;
01274                 __m128i T = _mm_adds_epi16(_mm_max_epi16(_mm_srli_epi16(maxGrad, 1), one), minGrad);
01275 
01276                 __m128i RGs = z, GRs = z, Bs = z, ng = z;
01277 
01278                 __m128i x0  = _mm_loadl_u8_s16(srow, +0          );
01279                 __m128i x1  = _mm_loadl_u8_s16(srow, -1 - bstep  );
01280                 __m128i x2  = _mm_loadl_u8_s16(srow, -1 - bstep*2);
01281                 __m128i x3  = _mm_loadl_u8_s16(srow,    - bstep  );
01282                 __m128i x4  = _mm_loadl_u8_s16(srow, +1 - bstep*2);
01283                 __m128i x5  = _mm_loadl_u8_s16(srow, +1 - bstep  );
01284                 __m128i x6  = _mm_loadl_u8_s16(srow, +2 - bstep  );
01285                 __m128i x7  = _mm_loadl_u8_s16(srow, +1          );
01286                 __m128i x8  = _mm_loadl_u8_s16(srow, +2 + bstep  );
01287                 __m128i x9  = _mm_loadl_u8_s16(srow, +1 + bstep  );
01288                 __m128i x10 = _mm_loadl_u8_s16(srow, +1 + bstep*2);
01289                 __m128i x11 = _mm_loadl_u8_s16(srow,    + bstep  );
01290                 __m128i x12 = _mm_loadl_u8_s16(srow, -1 + bstep*2);
01291                 __m128i x13 = _mm_loadl_u8_s16(srow, -1 + bstep  );
01292                 __m128i x14 = _mm_loadl_u8_s16(srow, -2 + bstep  );
01293                 __m128i x15 = _mm_loadl_u8_s16(srow, -1          );
01294                 __m128i x16 = _mm_loadl_u8_s16(srow, -2 - bstep  );
01295 
01296                 __m128i t0, t1, mask;
01297 
01298                 // gradN ***********************************************
01299                 mask = _mm_cmpgt_epi16(T, gradN); // mask = T>gradN
01300                 ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradN)
01301 
01302                 t0 = _mm_slli_epi16(x3, 1);                                 // srow[-bstep]*2
01303                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2), x0);  // srow[-bstep*2] + srow[0]
01304 
01305                 // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
01306                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
01307                 // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
01308                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x2,x4)), mask));
01309                 // Bs  += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
01310                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x5), t0), mask));
01311 
01312                 // gradNE **********************************************
01313                 mask = _mm_cmpgt_epi16(T, gradNE); // mask = T>gradNE
01314                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradNE)
01315 
01316                 t0 = _mm_slli_epi16(x5, 1);                                    // srow[-bstep+1]*2
01317                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2+2), x0);   // srow[-bstep*2+2] + srow[0]
01318 
01319                 // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
01320                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
01321                 // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
01322                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), _mm_adds_epi16(x4,x7)), mask));
01323                 // Bs  += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])}  * (T>gradNE)
01324                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x3,x6)), mask));
01325 
01326                 // gradE ***********************************************
01327                 mask = _mm_cmpgt_epi16(T, gradE);  // mask = T>gradE
01328                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradE)
01329 
01330                 t0 = _mm_slli_epi16(x7, 1);                         // srow[1]*2
01331                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, 2), x0); // srow[2] + srow[0]
01332 
01333                 // RGs += (srow[2] + srow[0]) * (T>gradE)
01334                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
01335                 // GRs += (srow[1]*2) * (T>gradE)
01336                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
01337                 // Bs  += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
01338                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x5,x9), _mm_adds_epi16(x6,x8)), mask));
01339 
01340                 // gradSE **********************************************
01341                 mask = _mm_cmpgt_epi16(T, gradSE);  // mask = T>gradSE
01342                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradSE)
01343 
01344                 t0 = _mm_slli_epi16(x9, 1);                                 // srow[bstep+1]*2
01345                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2+2), x0); // srow[bstep*2+2] + srow[0]
01346 
01347                 // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
01348                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
01349                 // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
01350                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), _mm_adds_epi16(x7,x10)), mask));
01351                 // Bs  += {srow[-bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
01352                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1), _mm_adds_epi16(x8,x11)), mask));
01353 
01354                 // gradS ***********************************************
01355                 mask = _mm_cmpgt_epi16(T, gradS);  // mask = T>gradS
01356                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradS)
01357 
01358                 t0 = _mm_slli_epi16(x11, 1);                             // srow[bstep]*2
01359                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,bstep*2), x0); // srow[bstep*2]+srow[0]
01360 
01361                 // RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
01362                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
01363                 // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
01364                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x10,x12)), mask));
01365                 // Bs  += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
01366                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x9,x13), t0), mask));
01367 
01368                 // gradSW **********************************************
01369                 mask = _mm_cmpgt_epi16(T, gradSW);  // mask = T>gradSW
01370                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradSW)
01371 
01372                 t0 = _mm_slli_epi16(x13, 1);                                // srow[bstep-1]*2
01373                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2-2), x0); // srow[bstep*2-2]+srow[0]
01374 
01375                 // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
01376                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
01377                 // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
01378                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), _mm_adds_epi16(x12,x15)), mask));
01379                 // Bs  += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
01380                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x11,x14)), mask));
01381 
01382                 // gradW ***********************************************
01383                 mask = _mm_cmpgt_epi16(T, gradW);  // mask = T>gradW
01384                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradW)
01385 
01386                 t0 = _mm_slli_epi16(x15, 1);                         // srow[-1]*2
01387                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -2), x0); // srow[-2]+srow[0]
01388 
01389                 // RGs += (srow[-2]+srow[0]) * (T>gradW)
01390                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
01391                 // GRs += (srow[-1]*2) * (T>gradW)
01392                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
01393                 // Bs  += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
01394                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x13), _mm_adds_epi16(x14,x16)), mask));
01395 
01396                 // gradNW **********************************************
01397                 mask = _mm_cmpgt_epi16(T, gradNW);  // mask = T>gradNW
01398                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradNW)
01399 
01400                 t0 = _mm_slli_epi16(x1, 1);                                 // srow[-bstep-1]*2
01401                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,-bstep*2-2), x0); // srow[-bstep*2-2]+srow[0]
01402 
01403                 // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
01404                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
01405                 // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
01406                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), _mm_adds_epi16(x2,x15)), mask));
01407                 // Bs  += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
01408                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1),_mm_adds_epi16(x3,x16)), mask));
01409 
01410                 __m128 ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
01411                 __m128 ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
01412 
01413                 // now interpolate r, g & b
01414                 t0 = _mm_subs_epi16(GRs, RGs);
01415                 t1 = _mm_subs_epi16(Bs, RGs);
01416 
01417                 t0 = _mm_add_epi16(x0, _mm_packs_epi32(
01418                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)),
01419                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1))));
01420 
01421                 t1 = _mm_add_epi16(x0, _mm_packs_epi32(
01422                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)),
01423                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1))));
01424 
01425                 x1 = _mm_merge_epi16(x0, t0);
01426                 x2 = _mm_merge_epi16(t0, x0);
01427 
01428                 uchar R[8], G[8], B[8];
01429 
01430                 _mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z));
01431                 _mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z));
01432                 _mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z));
01433 
01434                 for( int j = 0; j < 8; j++, dstrow += 3 )
01435                 {
01436                     dstrow[0] = B[j]; dstrow[1] = G[j]; dstrow[2] = R[j];
01437                 }
01438             }
01439 #endif
01440 
01441             limit = N - 2;
01442         }
01443         while( i < N - 2 );
01444 
01445         for( i = 0; i < 6; i++ )
01446         {
01447             dst[dststep*y + 5 - i] = dst[dststep*y + 8 - i];
01448             dst[dststep*y + (N - 2)*3 + i] = dst[dststep*y + (N - 3)*3 + i];
01449         }
01450 
01451         greenCell0 = !greenCell0;
01452         blueIdx ^= 2;
01453     }
01454 
01455     for( i = 0; i < size.width*3; i++ )
01456     {
01457         dst[i] = dst[i + dststep] = dst[i + dststep*2];
01458         dst[i + dststep*(size.height-4)] =
01459         dst[i + dststep*(size.height-3)] =
01460         dst[i + dststep*(size.height-2)] =
01461         dst[i + dststep*(size.height-1)] = dst[i + dststep*(size.height-5)];
01462     }
01463 }
01464 
01465 //////////////////////////////// Edge-Aware Demosaicing //////////////////////////////////
01466 
01467 template <typename T, typename SIMDInterpolator>
01468 class Bayer2RGB_EdgeAware_T_Invoker :
01469     public cv::ParallelLoopBody
01470 {
01471 public:
01472     Bayer2RGB_EdgeAware_T_Invoker(const Mat& _src, Mat& _dst, const Size& _size,
01473         int _blue, int _start_with_green) :
01474         ParallelLoopBody(),
01475         src(_src), dst(_dst), size(_size), Blue(_blue), Start_with_green(_start_with_green)
01476     {
01477     }
01478 
01479     virtual void operator()(const Range& range) const
01480     {
01481         int dcn = dst.channels();
01482         int dcn2 = dcn<<1;
01483         int start_with_green = Start_with_green, blue = Blue;
01484         int sstep = int(src.step / src.elemSize1()), dstep = int(dst.step / dst.elemSize1());
01485         SIMDInterpolator vecOp;
01486 
01487         const T* S = src.ptr<T>(range.start + 1) + 1;
01488         T* D = reinterpret_cast<T*>(dst.data + (range.start + 1) * dst.step) + dcn;
01489 
01490         if (range.start % 2)
01491         {
01492             start_with_green ^= 1;
01493             blue ^= 1;
01494         }
01495 
01496         // to BGR
01497         for (int y = range.start; y < range.end; ++y)
01498         {
01499             int x = 1;
01500             if (start_with_green)
01501             {
01502                 D[blue<<1] = (S[-sstep] + S[sstep]) >> 1;
01503                 D[1] = S[0];
01504                 D[2-(blue<<1)] = (S[-1] + S[1]) >> 1;
01505                 D += dcn;
01506                 ++S;
01507                 ++x;
01508             }
01509 
01510             int delta = vecOp.bayer2RGB_EA(S - sstep - 1, sstep, D, size.width, blue);
01511             x += delta;
01512             S += delta;
01513             D += dcn * delta;
01514 
01515             if (blue)
01516                 for (; x < size.width; x += 2, S += 2, D += dcn2)
01517                 {
01518                     D[0] = S[0];
01519                     D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
01520                     D[2] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1]) >> 2;
01521 
01522                     D[3] = (S[0] + S[2] + 1) >> 1;
01523                     D[4] = S[1];
01524                     D[5] = (S[-sstep+1] + S[sstep+1] + 1) >> 1;
01525                 }
01526             else
01527                 for (; x < size.width; x += 2, S += 2, D += dcn2)
01528                 {
01529                     D[0] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1] + 2) >> 2;
01530                     D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
01531                     D[2] = S[0];
01532 
01533                     D[3] = (S[-sstep+1] + S[sstep+1] + 1) >> 1;
01534                     D[4] = S[1];
01535                     D[5] = (S[0] + S[2] + 1) >> 1;
01536                 }
01537 
01538             if (x <= size.width)
01539             {
01540                 D[blue<<1] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1] + 2) >> 2;
01541                 D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
01542                 D[2-(blue<<1)] = S[0];
01543                 D += dcn;
01544                 ++S;
01545             }
01546 
01547             for (int i = 0; i < dcn; ++i)
01548             {
01549                 D[i] = D[-dcn + i];
01550                 D[-dstep+dcn+i] = D[-dstep+(dcn<<1)+i];
01551             }
01552 
01553             start_with_green ^= 1;
01554             blue ^= 1;
01555             S += 2;
01556             D += dcn2;
01557         }
01558     }
01559 
01560 private:
01561     Mat src;
01562     Mat dst;
01563     Size size;
01564     int Blue, Start_with_green;
01565 };
01566 
01567 template <typename T, typename SIMDInterpolator>
01568 static void Bayer2RGB_EdgeAware_T(const Mat& src, Mat& dst, int code)
01569 {
01570     Size size = src.size();
01571 
01572     // for small sizes
01573     if (size.width <= 2 || size.height <= 2)
01574     {
01575         dst = Scalar::all(0);
01576         return;
01577     }
01578 
01579     size.width -= 2;
01580     size.height -= 2;
01581 
01582     int start_with_green = code == CV_BayerGB2BGR_EA || code == CV_BayerGR2BGR_EA ? 1 : 0;
01583     int blue = code == CV_BayerGB2BGR_EA || code == CV_BayerBG2BGR_EA ? 1 : 0;
01584 
01585     if (size.height > 0)
01586     {
01587         Bayer2RGB_EdgeAware_T_Invoker<T, SIMDInterpolator> invoker(src, dst, size, blue, start_with_green);
01588         Range range(0, size.height);
01589         parallel_for_(range, invoker, dst.total()/static_cast<double>(1<<16));
01590     }
01591     size = dst.size();
01592     size.width *= dst.channels();
01593     size_t dstep = dst.step / dst.elemSize1();
01594     T* firstRow = dst.ptr<T>();
01595     T* lastRow = dst.ptr<T>() + (size.height-1) * dstep;
01596 
01597     if (size.height > 2)
01598     {
01599         for (int x = 0; x < size.width; ++x)
01600         {
01601             firstRow[x] = (firstRow+dstep)[x];
01602             lastRow[x] = (lastRow-dstep)[x];
01603         }
01604     }
01605     else
01606         for (int x = 0; x < size.width; ++x)
01607             firstRow[x] = lastRow[x] = 0;
01608 }
01609 
01610 } // end namespace cv
01611 
01612 //////////////////////////////////////////////////////////////////////////////////////////
01613 //                           The main Demosaicing function                              //
01614 //////////////////////////////////////////////////////////////////////////////////////////
01615 
01616 void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn)
01617 {
01618     Mat src = _src.getMat(), dst;
01619     Size sz = src.size();
01620     int scn = src.channels(), depth = src.depth();
01621 
01622     CV_Assert(depth == CV_8U || depth == CV_16U);
01623     CV_Assert(!src.empty());
01624 
01625     switch (code)
01626     {
01627     case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
01628         if (dcn <= 0)
01629             dcn = 1;
01630         CV_Assert( scn == 1 && dcn == 1 );
01631 
01632         _dst.create(sz, CV_MAKETYPE(depth, dcn));
01633         dst = _dst.getMat();
01634 
01635         if( depth == CV_8U )
01636             Bayer2Gray_<uchar, SIMDBayerInterpolator_8u>(src, dst, code);
01637         else if( depth == CV_16U )
01638             Bayer2Gray_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
01639         else
01640             CV_Error(CV_StsUnsupportedFormat, "Bayer->Gray demosaicing only supports 8u and 16u types");
01641         break;
01642 
01643     case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
01644     case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
01645         {
01646             if (dcn <= 0)
01647                 dcn = 3;
01648             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4) );
01649 
01650             _dst.create(sz, CV_MAKE_TYPE(depth, dcn));
01651             Mat dst_ = _dst.getMat();
01652 
01653             if( code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ||
01654                 code == CV_BayerRG2BGR || code == CV_BayerGR2BGR )
01655             {
01656                 if( depth == CV_8U )
01657                     Bayer2RGB_<uchar, SIMDBayerInterpolator_8u>(src, dst_, code);
01658                 else if( depth == CV_16U )
01659                     Bayer2RGB_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst_, code);
01660                 else
01661                     CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB demosaicing only supports 8u and 16u types");
01662             }
01663             else
01664             {
01665                 CV_Assert( depth == CV_8U );
01666                 Bayer2RGB_VNG_8u(src, dst_, code);
01667             }
01668         }
01669         break;
01670 
01671     case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
01672         if (dcn <= 0)
01673             dcn = 3;
01674 
01675         CV_Assert(scn == 1 && dcn == 3);
01676         _dst.create(sz, CV_MAKETYPE(depth, dcn));
01677         dst = _dst.getMat();
01678 
01679         if (depth == CV_8U)
01680             Bayer2RGB_EdgeAware_T<uchar, SIMDBayerInterpolator_8u>(src, dst, code);
01681         else if (depth == CV_16U)
01682             Bayer2RGB_EdgeAware_T<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
01683         else
01684             CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB Edge-Aware demosaicing only currently supports 8u and 16u types");
01685 
01686         break;
01687 
01688     default:
01689         CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" );
01690     }
01691 }
01692