Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by thedo

Committer:
thedo
Date:
Fri Jul 21 01:26:54 2017 +0000
Revision:
167:2ee3e82cb6f5
Parent:
166:240bc5a0f42a
gr-peach-opencv-project-sd-card

Who changed what in which revision?

User | Revision | Line number | New contents of line
thedo 166:240bc5a0f42a 1 /*M///////////////////////////////////////////////////////////////////////////////////////
thedo 166:240bc5a0f42a 2 //
thedo 166:240bc5a0f42a 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
thedo 166:240bc5a0f42a 4 //
thedo 166:240bc5a0f42a 5 // By downloading, copying, installing or using the software you agree to this license.
thedo 166:240bc5a0f42a 6 // If you do not agree to this license, do not download, install,
thedo 166:240bc5a0f42a 7 // copy or use the software.
thedo 166:240bc5a0f42a 8 //
thedo 166:240bc5a0f42a 9 //
thedo 166:240bc5a0f42a 10 // License Agreement
thedo 166:240bc5a0f42a 11 // For Open Source Computer Vision Library
thedo 166:240bc5a0f42a 12 //
thedo 166:240bc5a0f42a 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
thedo 166:240bc5a0f42a 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
thedo 166:240bc5a0f42a 15 // Third party copyrights are property of their respective owners.
thedo 166:240bc5a0f42a 16 //
thedo 166:240bc5a0f42a 17 // Redistribution and use in source and binary forms, with or without modification,
thedo 166:240bc5a0f42a 18 // are permitted provided that the following conditions are met:
thedo 166:240bc5a0f42a 19 //
thedo 166:240bc5a0f42a 20 // * Redistribution's of source code must retain the above copyright notice,
thedo 166:240bc5a0f42a 21 // this list of conditions and the following disclaimer.
thedo 166:240bc5a0f42a 22 //
thedo 166:240bc5a0f42a 23 // * Redistribution's in binary form must reproduce the above copyright notice,
thedo 166:240bc5a0f42a 24 // this list of conditions and the following disclaimer in the documentation
thedo 166:240bc5a0f42a 25 // and/or other materials provided with the distribution.
thedo 166:240bc5a0f42a 26 //
thedo 166:240bc5a0f42a 27 // * The name of the copyright holders may not be used to endorse or promote products
thedo 166:240bc5a0f42a 28 // derived from this software without specific prior written permission.
thedo 166:240bc5a0f42a 29 //
thedo 166:240bc5a0f42a 30 // This software is provided by the copyright holders and contributors "as is" and
thedo 166:240bc5a0f42a 31 // any express or implied warranties, including, but not limited to, the implied
thedo 166:240bc5a0f42a 32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
thedo 166:240bc5a0f42a 33 // In no event shall the Intel Corporation or contributors be liable for any direct,
thedo 166:240bc5a0f42a 34 // indirect, incidental, special, exemplary, or consequential damages
thedo 166:240bc5a0f42a 35 // (including, but not limited to, procurement of substitute goods or services;
thedo 166:240bc5a0f42a 36 // loss of use, data, or profits; or business interruption) however caused
thedo 166:240bc5a0f42a 37 // and on any theory of liability, whether in contract, strict liability,
thedo 166:240bc5a0f42a 38 // or tort (including negligence or otherwise) arising in any way out of
thedo 166:240bc5a0f42a 39 // the use of this software, even if advised of the possibility of such damage.
thedo 166:240bc5a0f42a 40 //
thedo 166:240bc5a0f42a 41 //M*/
thedo 166:240bc5a0f42a 42
thedo 166:240bc5a0f42a 43 #ifndef __OPENCV_CUDA_REDUCE_DETAIL_HPP__
thedo 166:240bc5a0f42a 44 #define __OPENCV_CUDA_REDUCE_DETAIL_HPP__
thedo 166:240bc5a0f42a 45
thedo 166:240bc5a0f42a 46 #include <thrust/tuple.h>
thedo 166:240bc5a0f42a 47 #include "../warp.hpp"
thedo 166:240bc5a0f42a 48 #include "../warp_shuffle.hpp"
thedo 166:240bc5a0f42a 49
thedo 166:240bc5a0f42a 50 //! @cond IGNORED
thedo 166:240bc5a0f42a 51
thedo 166:240bc5a0f42a 52 namespace cv { namespace cuda { namespace device
thedo 166:240bc5a0f42a 53 {
thedo 166:240bc5a0f42a 54 namespace reduce_detail
thedo 166:240bc5a0f42a 55 {
// Strips the pointer / reference / volatile qualifier from a tuple
// element type so the underlying value type can be named
// (e.g. volatile int* -> int, double& -> double).
template <typename T> struct GetType;

template <typename T> struct GetType<T*>          { typedef T type; };
template <typename T> struct GetType<volatile T*> { typedef T type; };
template <typename T> struct GetType<T&>          { typedef T type; };
thedo 166:240bc5a0f42a 69
thedo 166:240bc5a0f42a 70 template <unsigned int I, unsigned int N>
thedo 166:240bc5a0f42a 71 struct For
thedo 166:240bc5a0f42a 72 {
thedo 166:240bc5a0f42a 73 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 74 static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
thedo 166:240bc5a0f42a 75 {
thedo 166:240bc5a0f42a 76 thrust::get<I>(smem)[tid] = thrust::get<I>(val);
thedo 166:240bc5a0f42a 77
thedo 166:240bc5a0f42a 78 For<I + 1, N>::loadToSmem(smem, val, tid);
thedo 166:240bc5a0f42a 79 }
thedo 166:240bc5a0f42a 80 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 81 static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
thedo 166:240bc5a0f42a 82 {
thedo 166:240bc5a0f42a 83 thrust::get<I>(val) = thrust::get<I>(smem)[tid];
thedo 166:240bc5a0f42a 84
thedo 166:240bc5a0f42a 85 For<I + 1, N>::loadFromSmem(smem, val, tid);
thedo 166:240bc5a0f42a 86 }
thedo 166:240bc5a0f42a 87
thedo 166:240bc5a0f42a 88 template <class PointerTuple, class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 89 static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
thedo 166:240bc5a0f42a 90 {
thedo 166:240bc5a0f42a 91 typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
thedo 166:240bc5a0f42a 92 thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
thedo 166:240bc5a0f42a 93
thedo 166:240bc5a0f42a 94 For<I + 1, N>::merge(smem, val, tid, delta, op);
thedo 166:240bc5a0f42a 95 }
thedo 166:240bc5a0f42a 96 template <class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 97 static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
thedo 166:240bc5a0f42a 98 {
thedo 166:240bc5a0f42a 99 typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
thedo 166:240bc5a0f42a 100 thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
thedo 166:240bc5a0f42a 101
thedo 166:240bc5a0f42a 102 For<I + 1, N>::mergeShfl(val, delta, width, op);
thedo 166:240bc5a0f42a 103 }
thedo 166:240bc5a0f42a 104 };
thedo 166:240bc5a0f42a 105 template <unsigned int N>
thedo 166:240bc5a0f42a 106 struct For<N, N>
thedo 166:240bc5a0f42a 107 {
thedo 166:240bc5a0f42a 108 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 109 static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
thedo 166:240bc5a0f42a 110 {
thedo 166:240bc5a0f42a 111 }
thedo 166:240bc5a0f42a 112 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 113 static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
thedo 166:240bc5a0f42a 114 {
thedo 166:240bc5a0f42a 115 }
thedo 166:240bc5a0f42a 116
thedo 166:240bc5a0f42a 117 template <class PointerTuple, class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 118 static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
thedo 166:240bc5a0f42a 119 {
thedo 166:240bc5a0f42a 120 }
thedo 166:240bc5a0f42a 121 template <class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 122 static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
thedo 166:240bc5a0f42a 123 {
thedo 166:240bc5a0f42a 124 }
thedo 166:240bc5a0f42a 125 };
thedo 166:240bc5a0f42a 126
// Store this thread's value into its shared-memory slot.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
    smem[tid] = val;
}
// Read this thread's value back out of its shared-memory slot.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
    val = smem[tid];
}
thedo 166:240bc5a0f42a 137 template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
thedo 166:240bc5a0f42a 138 typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
thedo 166:240bc5a0f42a 139 __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
thedo 166:240bc5a0f42a 140 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 141 unsigned int tid)
thedo 166:240bc5a0f42a 142 {
thedo 166:240bc5a0f42a 143 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
thedo 166:240bc5a0f42a 144 }
thedo 166:240bc5a0f42a 145 template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
thedo 166:240bc5a0f42a 146 typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
thedo 166:240bc5a0f42a 147 __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
thedo 166:240bc5a0f42a 148 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 149 unsigned int tid)
thedo 166:240bc5a0f42a 150 {
thedo 166:240bc5a0f42a 151 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
thedo 166:240bc5a0f42a 152 }
thedo 166:240bc5a0f42a 153
// Combine this thread's value with its partner's value at tid + delta and
// publish the result back to shared memory so later passes observe it.
template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
    T reg = smem[tid + delta];
    smem[tid] = val = op(val, reg);
}
// Register-only variant: fetch the partner value via a warp shuffle
// (shfl_down) instead of shared memory, then combine.
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
    T reg = shfl_down(val, delta, width);
    val = op(val, reg);
}
thedo 166:240bc5a0f42a 166 template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
thedo 166:240bc5a0f42a 167 typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
thedo 166:240bc5a0f42a 168 class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
thedo 166:240bc5a0f42a 169 __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
thedo 166:240bc5a0f42a 170 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 171 unsigned int tid,
thedo 166:240bc5a0f42a 172 unsigned int delta,
thedo 166:240bc5a0f42a 173 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
thedo 166:240bc5a0f42a 174 {
thedo 166:240bc5a0f42a 175 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
thedo 166:240bc5a0f42a 176 }
thedo 166:240bc5a0f42a 177 template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
thedo 166:240bc5a0f42a 178 class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
thedo 166:240bc5a0f42a 179 __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 180 unsigned int delta,
thedo 166:240bc5a0f42a 181 unsigned int width,
thedo 166:240bc5a0f42a 182 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
thedo 166:240bc5a0f42a 183 {
thedo 166:240bc5a0f42a 184 For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
thedo 166:240bc5a0f42a 185 }
thedo 166:240bc5a0f42a 186
// Shared-memory tree reduction for a block of N threads. Each pass halves
// the number of active threads; barriers separate the passes while more
// than one warp is active. The result for the block ends up in the
// smem/val slot of tid 0.
template <unsigned int N> struct Generic
{
    // smem: pointer (or tuple of pointers) to at least N shared-memory slots.
    // val : this thread's input value(s); updated in place with partials.
    // tid : thread index within the block.
    // op  : binary functor (or tuple of functors) combining two values.
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        loadToSmem(smem, val, tid);
        // Only multi-warp blocks need a barrier before the first pass.
        if (N >= 32)
            __syncthreads();

        if (N >= 2048)
        {
            if (tid < 1024)
                merge(smem, val, tid, 1024, op);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(smem, val, tid, 512, op);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(smem, val, tid, 256, op);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(smem, val, tid, 128, op);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(smem, val, tid, 64, op);

            __syncthreads();
        }
        if (N >= 64)
        {
            if (tid < 32)
                merge(smem, val, tid, 32, op);
        }

        // Final passes run without barriers: only threads of one warp are
        // active. NOTE(review): this warp-synchronous idiom relies on
        // implicit lock-step warp execution (pre-Volta behavior) and on
        // callers passing volatile shared-memory pointers — confirm for
        // newer architectures.
        if (tid < 16)
        {
            merge(smem, val, tid, 16, op);
            merge(smem, val, tid, 8, op);
            merge(smem, val, tid, 4, op);
            merge(smem, val, tid, 2, op);
            merge(smem, val, tid, 1, op);
        }
    }
};
thedo 166:240bc5a0f42a 247
// Compile-time unrolled halving loop: performs merges with offsets
// I, I/2, ..., 1, terminating at the Unroll<0, ...> specialization.
template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
    // Shuffle-based step (arch >= 300); N is the shuffle width.
    static __device__ void loopShfl(Reference val, Op op, unsigned int N)
    {
        mergeShfl(val, I, N, op);
        Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
    }
    // Shared-memory step for older architectures.
    static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        merge(smem, val, tid, I, op);
        Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
    }
};
// Recursion terminator for the unrolled loop: offset 0 means done.
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
    static __device__ void loopShfl(Reference, Op, unsigned int)
    {
    }
    static __device__ void loop(Pointer, Reference, unsigned int, Op)
    {
    }
};
thedo 166:240bc5a0f42a 272
// Reduction across a single warp (intended for N <= 32, N a power of two).
template <unsigned int N> struct WarpOptimized
{
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
#if __CUDA_ARCH__ >= 300
        // Kepler and newer: register-only reduction via warp shuffles;
        // shared memory and tid are unused on this path.
        (void) smem;
        (void) tid;

        Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
#else
        // Older architectures: warp-synchronous shared-memory reduction —
        // no __syncthreads() needed within a single warp.
        loadToSmem(smem, val, tid);

        if (tid < N / 2)
            Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
    }
};
thedo 166:240bc5a0f42a 291
// Two-stage reduction for block sizes N that are multiples of the warp
// size: (1) each warp reduces its own 32 values, (2) the first warp
// reduces the M = N / 32 per-warp partial results.
template <unsigned int N> struct GenericOptimized32
{
    enum { M = N / 32 };  // number of warps, i.e. per-warp partial results

    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        const unsigned int laneId = Warp::laneId();

#if __CUDA_ARCH__ >= 300
        // Stage 1 (Kepler+): shuffle-based intra-warp reduction; lane 0
        // publishes the warp's partial result into smem[warp index].
        Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

        if (laneId == 0)
            loadToSmem(smem, val, tid / 32);
#else
        // Stage 1 (pre-Kepler): warp-synchronous shared-memory reduction.
        loadToSmem(smem, val, tid);

        if (laneId < 16)
            Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

        __syncthreads();

        if (laneId == 0)
            loadToSmem(smem, val, tid / 32);
#endif

        // Make all per-warp partials visible before stage 2.
        __syncthreads();

        loadFromSmem(smem, val, tid);

        // Stage 2: the first warp reduces the M partial results.
        if (tid < 32)
        {
#if __CUDA_ARCH__ >= 300
            Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
            Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
        }
    }
};
thedo 166:240bc5a0f42a 332
// Compile-time type selector: type is T1 when the flag is true,
// T2 otherwise (a C++98-friendly std::conditional).
template <bool val, class T1, class T2> struct StaticIf;

template <class T1, class T2> struct StaticIf<true, T1, T2>  { typedef T1 type; };
template <class T1, class T2> struct StaticIf<false, T1, T2> { typedef T2 type; };
thedo 166:240bc5a0f42a 342
// Compile-time test for "N is a non-zero power of two".
template <unsigned int N> struct IsPowerOf2
{
    enum { value = (N != 0) && ((N & (N - 1)) == 0) };
};
thedo 166:240bc5a0f42a 347
thedo 166:240bc5a0f42a 348 template <unsigned int N> struct Dispatcher
thedo 166:240bc5a0f42a 349 {
thedo 166:240bc5a0f42a 350 typedef typename StaticIf<
thedo 166:240bc5a0f42a 351 (N <= 32) && IsPowerOf2<N>::value,
thedo 166:240bc5a0f42a 352 WarpOptimized<N>,
thedo 166:240bc5a0f42a 353 typename StaticIf<
thedo 166:240bc5a0f42a 354 (N <= 1024) && IsPowerOf2<N>::value,
thedo 166:240bc5a0f42a 355 GenericOptimized32<N>,
thedo 166:240bc5a0f42a 356 Generic<N>
thedo 166:240bc5a0f42a 357 >::type
thedo 166:240bc5a0f42a 358 >::type reductor;
thedo 166:240bc5a0f42a 359 };
thedo 166:240bc5a0f42a 360 }
thedo 166:240bc5a0f42a 361 }}}
thedo 166:240bc5a0f42a 362
thedo 166:240bc5a0f42a 363 //! @endcond
thedo 166:240bc5a0f42a 364
thedo 166:240bc5a0f42a 365 #endif // __OPENCV_CUDA_REDUCE_DETAIL_HPP__
thedo 166:240bc5a0f42a 366