Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers reduce.hpp Source File

reduce.hpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Third party copyrights are property of their respective owners.
00016 //
00017 // Redistribution and use in source and binary forms, with or without modification,
00018 // are permitted provided that the following conditions are met:
00019 //
00020 //   * Redistribution's of source code must retain the above copyright notice,
00021 //     this list of conditions and the following disclaimer.
00022 //
00023 //   * Redistribution's in binary form must reproduce the above copyright notice,
00024 //     this list of conditions and the following disclaimer in the documentation
00025 //     and/or other materials provided with the distribution.
00026 //
00027 //   * The name of the copyright holders may not be used to endorse or promote products
00028 //     derived from this software without specific prior written permission.
00029 //
00030 // This software is provided by the copyright holders and contributors "as is" and
00031 // any express or implied warranties, including, but not limited to, the implied
00032 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00033 // In no event shall the Intel Corporation or contributors be liable for any direct,
00034 // indirect, incidental, special, exemplary, or consequential damages
00035 // (including, but not limited to, procurement of substitute goods or services;
00036 // loss of use, data, or profits; or business interruption) however caused
00037 // and on any theory of liability, whether in contract, strict liability,
00038 // or tort (including negligence or otherwise) arising in any way out of
00039 // the use of this software, even if advised of the possibility of such damage.
00040 //
00041 //M*/
00042 
00043 #ifndef __OPENCV_CUDA_REDUCE_DETAIL_HPP__
00044 #define __OPENCV_CUDA_REDUCE_DETAIL_HPP__
00045 
00046 #include <thrust/tuple.h>
00047 #include "../warp.hpp"
00048 #include "../warp_shuffle.hpp"
00049 
00050 //! @cond IGNORED
00051 
00052 namespace cv { namespace cuda { namespace device
00053 {
00054     namespace reduce_detail
00055     {
00056         template <typename T> struct GetType;
00057         template <typename T> struct GetType<T*>
00058         {
00059             typedef T type;
00060         };
00061         template <typename T> struct GetType<volatile T*>
00062         {
00063             typedef T type;
00064         };
00065         template <typename T> struct GetType<T&>
00066         {
00067             typedef T type;
00068         };
00069 
        // Compile-time loop over the elements [I, N) of a thrust::tuple:
        // applies the per-element operation to element I, then recurses on
        // I + 1.  The For<N, N> specialisation terminates the recursion.
        template <unsigned int I, unsigned int N>
        struct For
        {
            // Store each thread's value for tuple element I into that
            // element's shared-memory array, then recurse.
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            // Inverse of loadToSmem: read element I back from shared memory.
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            // One shared-memory reduction step for element I: combine this
            // thread's value with the partner value `delta` slots away using
            // the I-th operation, writing the result back to both val and smem.
            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            // Shuffle-based reduction step for element I: fetch the partner
            // lane's value via shfl_down instead of shared memory.
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
00105         template <unsigned int N>
00106         struct For<N, N>
00107         {
00108             template <class PointerTuple, class ValTuple>
00109             static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
00110             {
00111             }
00112             template <class PointerTuple, class ValTuple>
00113             static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
00114             {
00115             }
00116 
00117             template <class PointerTuple, class ValTuple, class OpTuple>
00118             static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
00119             {
00120             }
00121             template <class ValTuple, class OpTuple>
00122             static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
00123             {
00124             }
00125         };
00126 
00127         template <typename T>
00128         __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
00129         {
00130             smem[tid] = val;
00131         }
00132         template <typename T>
00133         __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
00134         {
00135             val = smem[tid];
00136         }
00137         template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
00138                   typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
00139         __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
00140                                                        const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
00141                                                        unsigned int tid)
00142         {
00143             For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
00144         }
00145         template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
00146                   typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
00147         __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
00148                                                          const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
00149                                                          unsigned int tid)
00150         {
00151             For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
00152         }
00153 
00154         template <typename T, class Op>
00155         __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
00156         {
00157             T reg = smem[tid + delta];
00158             smem[tid] = val = op(val, reg);
00159         }
00160         template <typename T, class Op>
00161         __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
00162         {
00163             T reg = shfl_down(val, delta, width);
00164             val = op(val, reg);
00165         }
00166         template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
00167                   typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
00168                   class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
00169         __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
00170                                               const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
00171                                               unsigned int tid,
00172                                               unsigned int delta,
00173                                               const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
00174         {
00175             For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
00176         }
00177         template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
00178                   class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
00179         __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
00180                                                   unsigned int delta,
00181                                                   unsigned int width,
00182                                                   const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
00183         {
00184             For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
00185         }
00186 
        // General-purpose block-wide tree reduction for a thread count of N.
        // Every thread contributes `val`; the stages with N-dependent guards
        // are compile-time dead code for smaller N.  After the final stage,
        // thread 0's smem slot (and its `val`) holds the reduced result.
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                // Publish every thread's value to shared memory first.  A
                // barrier is only needed when more than one warp participates.
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                // Tree reduction: each stage folds the upper half of the
                // active slots onto the lower half, halving the stride.
                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                // From here on at most one warp (32 threads) is active, so no
                // further __syncthreads() calls are issued.
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                // Final intra-warp stages over volatile shared memory with no
                // barriers — this is the classic warp-synchronous pattern.
                // NOTE(review): it relies on implicit lockstep execution of a
                // warp, which predates Volta's independent thread scheduling;
                // confirm behaviour on newer architectures.
                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };
00247 
00248         template <unsigned int I, typename Pointer, typename Reference, class Op>
00249         struct Unroll
00250         {
00251             static __device__ void loopShfl(Reference val, Op op, unsigned int N)
00252             {
00253                 mergeShfl(val, I, N, op);
00254                 Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
00255             }
00256             static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
00257             {
00258                 merge(smem, val, tid, I, op);
00259                 Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
00260             }
00261         };
00262         template <typename Pointer, typename Reference, class Op>
00263         struct Unroll<0, Pointer, Reference, Op>
00264         {
00265             static __device__ void loopShfl(Reference, Op, unsigned int)
00266             {
00267             }
00268             static __device__ void loop(Pointer, Reference, unsigned int, Op)
00269             {
00270             }
00271         };
00272 
        // Reduction strategy used when all N participating threads fit in a
        // single warp (Dispatcher selects this only for power-of-two N <= 32),
        // so no __syncthreads() is required anywhere.
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if __CUDA_ARCH__ >= 300
                // SM30+: reduce entirely in registers via warp shuffles;
                // shared memory and tid are unused on this path.
                (void) smem;
                (void) tid;

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                // Pre-SM30: shared-memory halving with no barriers — only the
                // first N / 2 lanes write.  NOTE(review): relies on implicit
                // warp-synchronous execution (pre-Volta assumption).
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };
00291 
        // Two-phase reduction for multi-warp blocks (Dispatcher selects this
        // for power-of-two N in (32, 1024]): each warp first reduces its own
        // 32 values, warp leaders deposit M = N / 32 partial results into
        // shared memory, and the first warp then reduces those partials.
        template <unsigned int N> struct GenericOptimized32
        {
            // Number of warps in the block == number of per-warp partials.
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                // Phase 1 (SM30+): warp-local shuffle reduction; lane 0 ends
                // up holding its warp's total and stores it at slot tid / 32.
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                // Phase 1 (pre-SM30): warp-local shared-memory halving driven
                // by the first 16 lanes of each warp, with no intra-warp
                // barrier.  NOTE(review): relies on implicit warp-synchronous
                // execution (pre-Volta assumption).
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                // Each warp leader stores its warp's partial at slot tid / 32.
                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                // Phase 2: every thread reloads, but only slots [0, M) hold
                // meaningful partials; the first warp reduces them.
                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    // Shuffle over a group of M lanes; lane 0 gets the result.
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };
00332 
00333         template <bool val, class T1, class T2> struct StaticIf;
00334         template <class T1, class T2> struct StaticIf<true, T1, T2>
00335         {
00336             typedef T1 type;
00337         };
00338         template <class T1, class T2> struct StaticIf<false, T1, T2>
00339         {
00340             typedef T2 type;
00341         };
00342 
00343         template <unsigned int N> struct IsPowerOf2
00344         {
00345             enum { value = ((N != 0) && !(N & (N - 1))) };
00346         };
00347 
00348         template <unsigned int N> struct Dispatcher
00349         {
00350             typedef typename StaticIf<
00351                 (N <= 32) && IsPowerOf2<N>::value,
00352                 WarpOptimized<N>,
00353                 typename StaticIf<
00354                     (N <= 1024) && IsPowerOf2<N>::value,
00355                     GenericOptimized32<N>,
00356                     Generic<N>
00357                 >::type
00358             >::type reductor;
00359         };
00360     }
00361 }}}
00362 
00363 //! @endcond
00364 
00365 #endif // __OPENCV_CUDA_REDUCE_DETAIL_HPP__
00366