Important changes to repositories hosted on mbed.com

Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026. To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial. You can also export all of your personal repositories from the account settings page.
Fork of gr-peach-opencv-project-sd-card
reduce.hpp
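This page lists OpenCV's CUDA block-reduction detail header (namespace cv::cuda::device::reduce_detail) as vendored in this project. Everything between the @cond IGNORED / @endcond markers is implementation detail behind the public reduce() wrapper in the parent reduce.hpp: compile-time loops over thrust::tuple elements, shared-memory and warp-shuffle merge primitives, and a compile-time dispatcher that picks a reduction strategy from the block size N.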
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CUDA_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDA_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    namespace reduce_detail
    {
        // GetType strips the pointer/reference qualifier from a tuple element
        // type so merge()/mergeShfl() can declare a plain value register.
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

        // For<I, N>: compile-time loop over the elements of a thrust::tuple;
        // the For<N, N> specialization below terminates the recursion.
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };

        // Scalar and tuple overloads of the shared-memory load/store and merge
        // steps; the tuple overloads reduce several values per thread at once.
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }

        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }
        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                              unsigned int tid,
                                              unsigned int delta,
                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta,
                                                  unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }

        // Generic<N>: shared-memory tree reduction for block sizes up to 2048.
        // The final steps (tid < 16) run without barriers, relying on
        // warp-synchronous execution within a single warp.
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };

        // Unroll<I>: fully unrolled halving loop (delta = I, I/2, ..., 1); the
        // I == 0 specialization ends the recursion.
        template <unsigned int I, typename Pointer, typename Reference, class Op>
        struct Unroll
        {
            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
            {
                mergeShfl(val, I, N, op);
                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            }
            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                merge(smem, val, tid, I, op);
                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            }
        };
        template <typename Pointer, typename Reference, class Op>
        struct Unroll<0, Pointer, Reference, Op>
        {
            static __device__ void loopShfl(Reference, Op, unsigned int)
            {
            }
            static __device__ void loop(Pointer, Reference, unsigned int, Op)
            {
            }
        };

        // WarpOptimized<N>: the whole reduction fits in one warp; uses
        // shfl_down on compute capability >= 3.0, shared memory otherwise.
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if __CUDA_ARCH__ >= 300
                (void) smem;
                (void) tid;

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };

        // GenericOptimized32<N>: two-stage reduction for power-of-two N <= 1024.
        // Each warp reduces its own values, lane 0 of every warp writes the
        // partial result to shared memory, and the first warp then reduces the
        // M = N / 32 partials.
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };

        // Compile-time if/else and power-of-two test (pre-C++11 equivalents of
        // std::conditional and a constexpr check).
        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        // Dispatcher<N>::reductor selects the implementation for a block of N
        // threads at compile time.
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}

//! @endcond

#endif // __OPENCV_CUDA_REDUCE_DETAIL_HPP__
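For orientation, here is a minimal sketch of how this dispatcher is typically driven: a block-wide sum where every thread contributes one float and thread 0 ends up with the total. It invokes the detail types the same way OpenCV's public reduce() wrapper does, with explicit volatile float* / float& template arguments so that val is passed by reference rather than deduced by value. The kernel name, include path, and launch configuration are illustrative assumptions, not part of this header.

#include <thrust/functional.h>

#include "reduce.hpp" // this detail header; adjust the include path to your tree

// Hypothetical example kernel (not OpenCV API): each thread contributes one
// float; after the reduction, thread 0 holds the block-wide sum.
template <unsigned int BLOCK_SIZE> // must match blockDim.x
__global__ void blockSumKernel(const float* in, float* out)
{
    // One shared-memory slot per thread, as the shared-memory paths expect.
    __shared__ float smem[BLOCK_SIZE];

    const unsigned int tid = threadIdx.x;
    float val = in[blockIdx.x * BLOCK_SIZE + tid];

    // With BLOCK_SIZE = 256, Dispatcher<256>::reductor resolves to
    // GenericOptimized32<256> at compile time (a power of two <= 1024). The
    // explicit template arguments force val to be passed as float&, mirroring
    // how the public wrapper calls into reduce_detail.
    cv::cuda::device::reduce_detail::Dispatcher<BLOCK_SIZE>::reductor
        ::template reduce<volatile float*, float&, const thrust::plus<float>&>(
            smem, val, tid, thrust::plus<float>());

    if (tid == 0)
        out[blockIdx.x] = val; // thread 0 now holds the block's sum
}

// Launch example: blockSumKernel<256><<<numBlocks, 256>>>(d_in, d_out);

Note that thrust::plus is marked __host__ __device__, so it is usable as the reduction operator here; any functor with a device-callable operator() of matching signature would work the same way.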