Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of gr-peach-opencv-project-sd-card.
scan.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CUDA_SCAN_HPP__
#define __OPENCV_CUDA_SCAN_HPP__

// NOTE(review): the original (scraped) text had a trailing space inside each
// quoted include path ("...common.hpp "), which breaks include resolution on
// case/byte-exact filesystems; the stray spaces are removed here.
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    //! Selects whether a scan produces inclusive or exclusive prefix results.
    enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };

    //! Warp-level scan over a shared-memory segment using a doubling scheme:
    //! each lane combines in the elements 1, 2, 4, 8 and 16 slots below it,
    //! with `lane >= step` guards so lanes never read below their warp's
    //! 32-element segment.
    //!   T - element type
    //!   F - binary functor combining two elements
    //! Relies on `volatile` plus (pre-Volta style) implicit warp-synchronous
    //! execution for intra-warp ordering; no explicit barriers are used.
    template <ScanKind Kind, typename T, typename F> struct WarpScan
    {
        __device__ __forceinline__ WarpScan() {}
        __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

        //! Scans this warp's segment of `ptr` in place and returns the calling
        //! lane's result. For EXCLUSIVE scans lane 0 returns 0 and every other
        //! lane returns the element one slot below its inclusive result.
        __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
        {
            const unsigned int lane = idx & 31;
            F op;

            if (lane >=  1) ptr[idx] = op(ptr[idx -  1], ptr[idx]);
            if (lane >=  2) ptr[idx] = op(ptr[idx -  2], ptr[idx]);
            if (lane >=  4) ptr[idx] = op(ptr[idx -  4], ptr[idx]);
            if (lane >=  8) ptr[idx] = op(ptr[idx -  8], ptr[idx]);
            if (lane >= 16) ptr[idx] = op(ptr[idx - 16], ptr[idx]);

            if (Kind == INCLUSIVE)
                return ptr[idx];
            else
                return (lane > 0) ? ptr[idx - 1] : 0;
        }

        //! Identity layout: the thread id is used directly as the shared-memory slot.
        __device__ __forceinline__ unsigned int index(const unsigned int tid)
        {
            return tid;
        }

        //! No padding is required by the guarded variant, so init is a no-op.
        __device__ __forceinline__ void init(volatile T* ptr) {}

        static const int warp_offset = 0;

        //! Scan type BlockScan uses for its second pass over per-warp totals.
        typedef WarpScan<INCLUSIVE, T, F> merge;
    };

    //! Warp scan without the per-step lane guards. Correctness depends on the
    //! padded layout produced by index()/init(): each warp's 32 data slots sit
    //! above 16 pad slots, so the unguarded `idx - step` reads land in padding
    //! rather than in a neighbouring warp's data.
    template <ScanKind Kind, typename T, typename F> struct WarpScanNoComp
    {
        __device__ __forceinline__ WarpScanNoComp() {}
        __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

        //! Same contract as WarpScan::operator(), but with no comparisons in
        //! the combine steps. Note the lane is derived from threadIdx.x here
        //! (idx is a padded slot index, not a lane number).
        __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
        {
            const unsigned int lane = threadIdx.x & 31;
            F op;

            ptr[idx] = op(ptr[idx -  1], ptr[idx]);
            ptr[idx] = op(ptr[idx -  2], ptr[idx]);
            ptr[idx] = op(ptr[idx -  4], ptr[idx]);
            ptr[idx] = op(ptr[idx -  8], ptr[idx]);
            ptr[idx] = op(ptr[idx - 16], ptr[idx]);

            if (Kind == INCLUSIVE)
                return ptr[idx];
            else
                return (lane > 0) ? ptr[idx - 1] : 0;
        }

        //! Maps a thread id into the padded layout: each warp owns
        //! warp_smem_stride slots, with its data starting 16 slots in
        //! (the +1 in the stride presumably avoids bank conflicts — see note
        //! on warp_smem_stride below).
        __device__ __forceinline__ unsigned int index(const unsigned int tid)
        {
            return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
        }

        //! Each calling thread zeroes ptr[threadIdx.x]; BlockScan invokes this
        //! from warp 0 only, clearing the first 32 slots of the buffer.
        __device__ __forceinline__ void init(volatile T* ptr)
        {
            ptr[threadIdx.x] = 0;
        }

        // 32 data + 16 pad slots per warp; the extra +1 looks like
        // bank-conflict padding — TODO(review): confirm.
        static const int warp_smem_stride = 32 + 16 + 1;
        static const int warp_offset = 16;
        static const int warp_log  = 5;
        static const int warp_mask = 31;

        //! Scan type BlockScan uses for its second pass over per-warp totals.
        typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
    };

    //! Block-wide scan composed from two warp-scan passes:
    //!   1) each warp scans its own segment of `ptr`,
    //!   2) warp 0 scans the published per-warp totals,
    //!   3) every thread folds its warp's prefix into its own value.
    //!   Sc - warp-scan policy (WarpScan or WarpScanNoComp)
    //! `ptr` must be a shared-memory buffer laid out per Sc::index().
    template <ScanKind Kind, typename T, typename Sc, typename F> struct BlockScan
    {
        __device__ __forceinline__ BlockScan() {}
        __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

        __device__ __forceinline__ T operator()(volatile T* ptr)
        {
            const unsigned int tid  = threadIdx.x;
            const unsigned int lane = tid & warp_mask;
            const unsigned int warp = tid >> warp_log;

            Sc scan;
            typename Sc::merge merge_scan;
            const unsigned int idx = scan.index(tid);

            // Pass 1: per-warp scan of each warp's segment.
            T val = scan(ptr, idx);
            __syncthreads();

            // Reinitialise the scratch area used for the totals pass.
            if (warp == 0)
                scan.init(ptr);
            __syncthreads();

            // The last lane of each warp publishes that warp's total.
            if (lane == 31)
                ptr[scan.warp_offset + warp] = (Kind == INCLUSIVE) ? val : ptr[idx];
            __syncthreads();

            // Pass 2: warp 0 scans the per-warp totals in place.
            if (warp == 0)
                merge_scan(ptr, idx);
            __syncthreads();

            // Fold the preceding warps' prefix into this thread's value.
            if (warp > 0)
                val = ptr[scan.warp_offset + warp - 1] + val;
            __syncthreads();

            ptr[idx] = val;
            __syncthreads();

            return val;
        }

        static const int warp_log  = 5;
        static const int warp_mask = 31;
    };

    //! Inclusive prefix sum across one warp.
    //! On SM30+ the scan runs entirely in registers via warp shuffles
    //! (s_Data and tid are unused on that path); otherwise it falls back to a
    //! zero-padded shared-memory scan, for which s_Data must provide
    //! 2 * OPENCV_CUDA_WARP_SIZE slots per warp in the layout computed below.
    template <typename T>
    __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
    {
    #if __CUDA_ARCH__ >= 300
        const unsigned int laneId = cv::cuda::device::Warp::laneId();

        // scan on shuffle functions
        #pragma unroll
        for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
        {
            const T n = cv::cuda::device::shfl_up(idata, i);
            if (laneId >= i)
                idata += n;
        }

        return idata;
    #else
        // `pos` places OPENCV_CUDA_WARP_SIZE zero slots below each warp's
        // data segment, so the unguarded subtractive reads below hit zeros.
        unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
        s_Data[pos] = 0;
        pos += OPENCV_CUDA_WARP_SIZE;
        s_Data[pos] = idata;

        s_Data[pos] += s_Data[pos -  1];
        s_Data[pos] += s_Data[pos -  2];
        s_Data[pos] += s_Data[pos -  4];
        s_Data[pos] += s_Data[pos -  8];
        s_Data[pos] += s_Data[pos - 16];

        return s_Data[pos];
    #endif
    }

    //! Exclusive warp scan, derived by subtracting the thread's own input
    //! from its inclusive scan result.
    template <typename T>
    __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
    {
        return warpScanInclusive(idata, s_Data, tid) - idata;
    }

    //! Block-wide inclusive scan over tiNumScanThreads threads. When more than
    //! one warp participates, per-warp inclusive scans are combined with an
    //! exclusive scan over the warp totals; otherwise it reduces to a single
    //! warp scan.
    template <int tiNumScanThreads, typename T>
    __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
    {
        if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
        {
            // Bottom-level inclusive warp scan.
            T warpResult = warpScanInclusive(idata, s_Data, tid);

            // Save the top element of each warp for the totals scan; sync first
            // because s_Data is about to be overwritten.
            __syncthreads();
            if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
            {
                s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
            }

            // Wait for all warp totals to be published.
            __syncthreads();

            if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE))
            {
                // Grab this warp's total and replace it with its exclusive scan.
                T val = s_Data[tid];
                s_Data[tid] = warpScanExclusive(val, s_Data, tid);
            }

            // Wait for the exclusive totals before combining.
            __syncthreads();

            return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
        }
        else
        {
            return warpScanInclusive(idata, s_Data, tid);
        }
    }
}}}

//! @endcond

#endif // __OPENCV_CUDA_SCAN_HPP__
Generated on Tue Jul 12 2022 14:47:34 by Doxygen 1.7.2
