simd_functions.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
#define __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__

#include "common.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
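    // Each function below treats a 32-bit word as a short SIMD vector: the
    // "2" variants operate on two independent unsigned 16-bit halfwords, the
    // "4" variants on four independent unsigned 8-bit bytes. On sm_30+ the
    // PTX video instructions (vadd2, vset2, vmax4, ...) do the per-lane work
    // directly; their trailing %3 operand is the third source operand of the
    // video-instruction format (used by its merge/accumulate forms) and is
    // simply passed in as zero here. On older architectures the same results
    // are emulated with ordinary 32-bit arithmetic and the bit tricks
    // annotated in each function.
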
    // 2: operations on two unsigned 16-bit halfword lanes

    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }

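    // How the pre-Fermi vadd2 fallback works: for any x, y the value
    // (x ^ y) ^ (x + y) has a 1 exactly at the bit positions that received a
    // carry-in, so bit 16 of it is the carry that leaked from the low lane
    // into the high lane; subtracting that bit back out yields two
    // independent 16-bit additions. Example: a = 0x0001ffff, b = 0x00000001
    // gives r = a + b = 0x00020000 and s = 0x00010000, so r - s =
    // 0x00010000, i.e. the low lane 0xffff + 0x0001 wraps to 0x0000 and the
    // high lane stays 0x0001. Note this fallback wraps per lane, whereas the
    // .sat asm paths above clamp each lane to 0xffff.
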
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // difference bits
        r = a - b;          // actual difference
        s = s ^ r;          // determine borrow-ins for each bit position
        s = s & 0x00010000; // borrow into high word (= borrow out of low word)
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }

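    // The vsub2 fallback mirrors vadd2: (a ^ b) ^ (a - b) marks every bit
    // position that received a borrow, so bit 16 is the borrow the low lane
    // took from the high lane; adding it back restores the high lane.
    // Example: a = 0x00010000, b = 0x00000001 gives r = 0x0000ffff and
    // s = 0x00010000, so r + s = 0x0001ffff: the low lane 0x0000 - 0x0001
    // wraps to 0xffff and the high lane is left at 0x0001.
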
    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

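    // HAKMEM #23 in both directions: a + b = 2*(a & b) + (a ^ b) gives the
    // rounded-down average (a + b) >> 1 = (a & b) + ((a ^ b) >> 1), used by
    // vavg2/vavg4, while a + b = 2*(a | b) - (a ^ b) gives the rounded-up
    // average (a + b + 1) >> 1 = (a | b) - ((a ^ b) >> 1), used by
    // vavrg2/vavrg4 below. Computing the halves before combining means the
    // sum never overflows a lane, and masking (a ^ b) with 0xfffefffe (or
    // 0xfefefefe) keeps the shifted bits from crossing lane boundaries.
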
    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

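    // The vseteq2 fallback is Mycroft's null-byte trick adapted to 16-bit
    // lanes: after r = a ^ b, a lane is zero exactly when a and b were
    // equal. Setting the msbs (c = r | 0x80008000) and subtracting 1 per
    // lane (c - 0x00010001) clears a lane's msb only if its low 15 bits were
    // zero (the forced msbs also stop borrows from crossing lanes); ANDing
    // with the extracted msb complement (r ^ c) then leaves the msb set only
    // for lanes where r was exactly 0x0000, and the shift by 15 turns each
    // surviving msb into a per-lane 0 or 1.
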
    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

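    // Bool-to-mask conversion, as used by all the vcmp* wrappers: a per-lane
    // bool b in {0, 1} becomes the mask b * 0xffff, computed for both lanes
    // at once as (r << 16) - r, i.e. a single multiply by 0xffff in which
    // bits shifted or borrowed past the top lane are discarded by the 32-bit
    // wrap. The #else branch derives the same masks from the msb flags
    // directly: msb - (msb >> 15) fills bits 0..14, and OR-ing the msb back
    // in fills bit 15.
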
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

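    // The vset{ge,gt,le,lt}2 fallbacks reduce comparison to an average: with
    // b inverted, each lane of vavrg2(a, ~b) is (a + ~b + 1) / 2 =
    // (a - b + 0x10000) / 2, whose msb is exactly the carry-out of a - b,
    // i.e. the a >= b flag. The halving matters because it keeps that carry
    // bit inside the lane (bit 15) instead of letting it spill into the next
    // lane. The strict variants use vavg2, i.e. (a + ~b) / 2, whose msb is
    // set only when a > b, and the le/lt variants invert a instead of b.
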
    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

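    // In the vmax2/vmin2 fallbacks the high halfwords are compared without
    // shifting them down: once the low halfwords are masked off, the
    // ordering of the masked 32-bit values matches the ordering of the high
    // lanes, so 32-bit ::max/::min pick the right halfword in place.
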
    // 4: operations on four unsigned 8-bit byte lanes

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }

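    // The vadd4 fallback prevents carries from crossing lanes instead of
    // extracting them afterwards: with the msbs cleared, the 7-bit byte sums
    // cannot overflow out of their lanes, and each lane's true msb is then
    // reconstructed as the XOR of the two operand msbs and the carry that
    // arrived in the msb position. As with vadd2, the result wraps per lane
    // rather than saturating like the .sat asm paths.
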
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          //  to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          //  from
        r = t | r;          //   msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          //  from
        r = s | r;          //   msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          // difference bits, used to toggle between a and b
        s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
        r = s - r;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

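    // Select-by-XOR, as used in the vabsdiff4 fallback: with a full-byte
    // mask m, ((a ^ b) & m) ^ b picks a where m = 0xff and b where m = 0x00,
    // so one AND and one XOR yield max(a, b) per byte; XOR-ing that result
    // with a ^ b again flips each byte to the other operand, giving
    // min(a, b) without a second comparison.
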
    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a when a >= b
        s = b & ~s;         // select b when a < b
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned maximum
    }

    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if b >= a
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned minimum
    }
}}}

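// Usage sketch (illustrative only, not part of the original header; the
// names other than vadd4 are hypothetical). A kernel that adds two packed
// 8-bit grayscale images four pixels at a time might look like:
//
//     __global__ void addBytes(const unsigned int* a, const unsigned int* b,
//                              unsigned int* out, int n4)
//     {
//         int i = blockIdx.x * blockDim.x + threadIdx.x;
//         if (i < n4)
//             out[i] = cv::cuda::device::vadd4(a[i], b[i]); // 4 pixels/word
//     }
//
// Each thread handles one 32-bit word, i.e. four pixels, with n4 the image
// size in words. Remember that on pre-Fermi parts the vadd4 fallback wraps
// per byte instead of saturating.
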
//! @endcond

#endif // __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__