simd_functions.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
#define __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__

#include "common.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    // 2

    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a - b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // borrow to high word
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

    // 4

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }

    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          // from
        r = t | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          // to mask
    #else
        asm ("not.b32 %0,%0;" : "+r"(b));
        r = vavrg4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          // from
        r = s | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          //
        s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
        r = s - r;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned maximum
    }

    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if a >= b
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;
    }
}}}

//! @endcond

#endif // __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
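The intrinsics above treat one 32-bit register as either two packed 16-bit halfwords (the v*2 family) or four packed 8-bit bytes (the v*4 family), so a caller typically reinterprets 8-bit or 16-bit image rows as unsigned int words. The sketch below is a hypothetical usage example, not part of this header: the kernel name, the word-aligned buffer layout, and the include path are assumptions made only for illustration.

// Illustrative kernel (assumed, not from simd_functions.hpp): byte-wise
// absolute difference of two 8-bit buffers, four pixels per thread.
#include "opencv2/core/cuda/simd_functions.hpp" // include path may differ in this fork

__global__ void absdiff8u(const unsigned int* a, const unsigned int* b,
                          unsigned int* dst, int numWords)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numWords)
        dst[i] = cv::cuda::device::vabsdiff4(a[i], b[i]); // 4 x |a - b|, one byte each
}

When compiled for __CUDA_ARCH__ >= 300 the call maps to the single vabsdiff4 PTX video instruction; on older targets the header falls back to per-byte vabsdiff instructions or to the plain bit-twiddling emulation shown in the listing.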
