Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by the do

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers scan.hpp Source File

scan.hpp

Go to the documentation of this file.
00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Third party copyrights are property of their respective owners.
00016 //
00017 // Redistribution and use in source and binary forms, with or without modification,
00018 // are permitted provided that the following conditions are met:
00019 //
00020 //   * Redistribution's of source code must retain the above copyright notice,
00021 //     this list of conditions and the following disclaimer.
00022 //
00023 //   * Redistribution's in binary form must reproduce the above copyright notice,
00024 //     this list of conditions and the following disclaimer in the documentation
00025 //     and/or other materials provided with the distribution.
00026 //
00027 //   * The name of the copyright holders may not be used to endorse or promote products
00028 //     derived from this software without specific prior written permission.
00029 //
00030 // This software is provided by the copyright holders and contributors "as is" and
00031 // any express or implied warranties, including, but not limited to, the implied
00032 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00033 // In no event shall the Intel Corporation or contributors be liable for any direct,
00034 // indirect, incidental, special, exemplary, or consequential damages
00035 // (including, but not limited to, procurement of substitute goods or services;
00036 // loss of use, data, or profits; or business interruption) however caused
00037 // and on any theory of liability, whether in contract, strict liability,
00038 // or tort (including negligence or otherwise) arising in any way out of
00039 // the use of this software, even if advised of the possibility of such damage.
00040 //
00041 //M*/
00042 
00043 #ifndef __OPENCV_CUDA_SCAN_HPP__
00044 #define __OPENCV_CUDA_SCAN_HPP__
00045 
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
00050 
00051 /** @file
00052  * @deprecated Use @ref cudev instead.
00053  */
00054 
00055 //! @cond IGNORED
00056 
00057 namespace cv { namespace cuda { namespace device
00058 {
00059     enum ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 };
00060 
00061     template <ScanKind Kind, typename T, typename F> struct WarpScan
00062     {
00063         __device__ __forceinline__ WarpScan() {}
00064         __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
00065 
00066         __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
00067         {
00068             const unsigned int lane = idx & 31;
00069             F op;
00070 
00071             if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
00072             if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
00073             if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
00074             if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
00075             if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
00076 
00077             if( Kind == INCLUSIVE )
00078                 return ptr [idx];
00079             else
00080                 return (lane > 0) ? ptr [idx - 1] : 0;
00081         }
00082 
00083         __device__ __forceinline__ unsigned int index(const unsigned int tid)
00084         {
00085             return tid;
00086         }
00087 
00088         __device__ __forceinline__ void init(volatile T *ptr){}
00089 
00090         static const int warp_offset      = 0;
00091 
00092         typedef WarpScan<INCLUSIVE, T, F>  merge;
00093     };
00094 
00095     template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
00096     {
00097         __device__ __forceinline__ WarpScanNoComp() {}
00098         __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
00099 
00100         __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
00101         {
00102             const unsigned int lane = threadIdx.x & 31;
00103             F op;
00104 
00105             ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
00106             ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
00107             ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
00108             ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
00109             ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
00110 
00111             if( Kind == INCLUSIVE )
00112                 return ptr [idx];
00113             else
00114                 return (lane > 0) ? ptr [idx - 1] : 0;
00115         }
00116 
00117         __device__ __forceinline__ unsigned int index(const unsigned int tid)
00118         {
00119             return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
00120         }
00121 
00122         __device__ __forceinline__ void init(volatile T *ptr)
00123         {
00124             ptr[threadIdx.x] = 0;
00125         }
00126 
00127         static const int warp_smem_stride = 32 + 16 + 1;
00128         static const int warp_offset      = 16;
00129         static const int warp_log         = 5;
00130         static const int warp_mask        = 31;
00131 
00132         typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
00133     };
00134 
    //! Block-wide scan built from two passes of the per-warp scanner Sc:
    //! each warp scans its own segment, lane 31 of every warp publishes its
    //! segment total, warp 0 scans those totals, and each thread then folds
    //! the preceding warps' combined total into its own partial result.
    template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
    {
        __device__ __forceinline__ BlockScan() {}
        __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

        //! Scans the block's elements staged in shared memory at ptr.
        //! Must be reached by every thread of the block (contains __syncthreads()).
        __device__ __forceinline__ T operator()(volatile T *ptr)
        {
            const unsigned int tid  = threadIdx.x;
            const unsigned int lane = tid & warp_mask;
            const unsigned int warp = tid >> warp_log;

            Sc scan;
            typename Sc::merge merge_scan;              // inclusive variant for the warp-totals pass
            const unsigned int idx = scan.index(tid);   // Sc-specific shared-memory slot for this thread

            // Pass 1: independent scan inside each warp.
            T val = scan(ptr, idx);
            __syncthreads ();

            // Re-initialise Sc's scratch area (no-op for WarpScan; zero-fill for WarpScanNoComp).
            if( warp == 0)
                scan.init(ptr);
            __syncthreads ();

            // Lane 31 of each warp publishes its warp's total at scan.warp_offset + warp.
            // For an exclusive Kind, val holds the exclusive result, so the total is
            // re-read from ptr[idx] (which the warp scan left holding the inclusive value).
            if( lane == 31 )
                ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
            __syncthreads ();

            // Pass 2: warp 0 scans the per-warp totals in place.
            if( warp == 0 )
                merge_scan(ptr, idx);
            __syncthreads();

            // Fold the combined total of all preceding warps into this thread's partial.
            // NOTE(review): this uses '+' rather than the functor F, so BlockScan is only
            // correct when F is (equivalent to) addition — confirm before reusing it with
            // a different combine operator.
            if ( warp > 0)
                val = ptr [scan.warp_offset + warp - 1] + val;
            __syncthreads ();

            ptr[idx] = val;
            __syncthreads ();

            return val ;
        }

        static const int warp_log  = 5;   // log2(warp size)
        static const int warp_mask = 31;  // warp size - 1
    };
00178 
00179     template <typename T>
00180     __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
00181     {
00182     #if __CUDA_ARCH__ >= 300
00183         const unsigned int laneId = cv::cuda::device::Warp::laneId();
00184 
00185         // scan on shuffl functions
00186         #pragma unroll
00187         for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
00188         {
00189             const T n = cv::cuda::device::shfl_up(idata, i);
00190             if (laneId >= i)
00191                   idata += n;
00192         }
00193 
00194         return idata;
00195     #else
00196         unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
00197         s_Data[pos] = 0;
00198         pos += OPENCV_CUDA_WARP_SIZE;
00199         s_Data[pos] = idata;
00200 
00201         s_Data[pos] += s_Data[pos - 1];
00202         s_Data[pos] += s_Data[pos - 2];
00203         s_Data[pos] += s_Data[pos - 4];
00204         s_Data[pos] += s_Data[pos - 8];
00205         s_Data[pos] += s_Data[pos - 16];
00206 
00207         return s_Data[pos];
00208     #endif
00209     }
00210 
00211     template <typename T>
00212     __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
00213     {
00214         return warpScanInclusive(idata, s_Data, tid) - idata;
00215     }
00216 
    //! Inclusive prefix sum across a block of tiNumScanThreads threads.
    //! Two-level strategy: per-warp inclusive scans, then an exclusive scan of
    //! the warp totals, which each thread adds back onto its warp-local result.
    //! Assumes tiNumScanThreads / warp-size itself fits in one warp (i.e. at
    //! most warp-size^2 threads) — TODO confirm at call sites.
    //! Must be reached by every thread of the block (contains __syncthreads()).
    template <int tiNumScanThreads, typename T>
    __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
    {
        // tiNumScanThreads is a template constant, so the untaken branch is dead code.
        if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
        {
            //Bottom-level inclusive warp scan
            T warpResult = warpScanInclusive(idata, s_Data, tid);

            //Save top elements of each warp for exclusive warp scan
            //sync to wait for warp scans to complete (because s_Data is being overwritten)
            __syncthreads();
            // The last lane of each warp holds that warp's total; stash it at s_Data[warp].
            if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
            {
                s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
            }

            //wait for warp scans to complete
            __syncthreads();

            // One thread per warp converts the warp totals into exclusive offsets.
            if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
            {
                //grab top warp elements
                T val = s_Data[tid];
                //calculate exclusive scan and write back to shared memory
                s_Data[tid] = warpScanExclusive(val, s_Data, tid);
            }

            //return updated warp scans with exclusive scan results
            __syncthreads();

            // warp-local inclusive result + sum of all preceding warps.
            return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
        }
        else
        {
            // Block fits in a single warp: one warp scan suffices.
            return warpScanInclusive(idata, s_Data, tid);
        }
    }
00254 }}}
00255 
00256 //! @endcond
00257 
00258 #endif // __OPENCV_CUDA_SCAN_HPP__
00259