Renesas GR-PEACH OpenCV Development / gr-peach-opencv-project-sd-card_update

Fork of gr-peach-opencv-project-sd-card by thedo

Committer:
thedo
Date:
Fri Jul 21 01:26:54 2017 +0000
Revision:
167:2ee3e82cb6f5
Parent:
166:240bc5a0f42a
gr-peach-opencv-project-sd-card

Who changed what in which revision?

User | Revision | Line number | New contents of line
thedo 166:240bc5a0f42a 1 /*M///////////////////////////////////////////////////////////////////////////////////////
thedo 166:240bc5a0f42a 2 //
thedo 166:240bc5a0f42a 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
thedo 166:240bc5a0f42a 4 //
thedo 166:240bc5a0f42a 5 // By downloading, copying, installing or using the software you agree to this license.
thedo 166:240bc5a0f42a 6 // If you do not agree to this license, do not download, install,
thedo 166:240bc5a0f42a 7 // copy or use the software.
thedo 166:240bc5a0f42a 8 //
thedo 166:240bc5a0f42a 9 //
thedo 166:240bc5a0f42a 10 // License Agreement
thedo 166:240bc5a0f42a 11 // For Open Source Computer Vision Library
thedo 166:240bc5a0f42a 12 //
thedo 166:240bc5a0f42a 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
thedo 166:240bc5a0f42a 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
thedo 166:240bc5a0f42a 15 // Third party copyrights are property of their respective owners.
thedo 166:240bc5a0f42a 16 //
thedo 166:240bc5a0f42a 17 // Redistribution and use in source and binary forms, with or without modification,
thedo 166:240bc5a0f42a 18 // are permitted provided that the following conditions are met:
thedo 166:240bc5a0f42a 19 //
thedo 166:240bc5a0f42a 20 // * Redistribution's of source code must retain the above copyright notice,
thedo 166:240bc5a0f42a 21 // this list of conditions and the following disclaimer.
thedo 166:240bc5a0f42a 22 //
thedo 166:240bc5a0f42a 23 // * Redistribution's in binary form must reproduce the above copyright notice,
thedo 166:240bc5a0f42a 24 // this list of conditions and the following disclaimer in the documentation
thedo 166:240bc5a0f42a 25 // and/or other materials provided with the distribution.
thedo 166:240bc5a0f42a 26 //
thedo 166:240bc5a0f42a 27 // * The name of the copyright holders may not be used to endorse or promote products
thedo 166:240bc5a0f42a 28 // derived from this software without specific prior written permission.
thedo 166:240bc5a0f42a 29 //
thedo 166:240bc5a0f42a 30 // This software is provided by the copyright holders and contributors "as is" and
thedo 166:240bc5a0f42a 31 // any express or implied warranties, including, but not limited to, the implied
thedo 166:240bc5a0f42a 32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
thedo 166:240bc5a0f42a 33 // In no event shall the Intel Corporation or contributors be liable for any direct,
thedo 166:240bc5a0f42a 34 // indirect, incidental, special, exemplary, or consequential damages
thedo 166:240bc5a0f42a 35 // (including, but not limited to, procurement of substitute goods or services;
thedo 166:240bc5a0f42a 36 // loss of use, data, or profits; or business interruption) however caused
thedo 166:240bc5a0f42a 37 // and on any theory of liability, whether in contract, strict liability,
thedo 166:240bc5a0f42a 38 // or tort (including negligence or otherwise) arising in any way out of
thedo 166:240bc5a0f42a 39 // the use of this software, even if advised of the possibility of such damage.
thedo 166:240bc5a0f42a 40 //
thedo 166:240bc5a0f42a 41 //M*/
thedo 166:240bc5a0f42a 42
thedo 166:240bc5a0f42a 43 #ifndef __OPENCV_CUDA_REDUCE_DETAIL_HPP__
thedo 166:240bc5a0f42a 44 #define __OPENCV_CUDA_REDUCE_DETAIL_HPP__
thedo 166:240bc5a0f42a 45
thedo 166:240bc5a0f42a 46 #include <thrust/tuple.h>
thedo 166:240bc5a0f42a 47 #include "../warp.hpp"
thedo 166:240bc5a0f42a 48 #include "../warp_shuffle.hpp"
thedo 166:240bc5a0f42a 49
thedo 166:240bc5a0f42a 50 //! @cond IGNORED
thedo 166:240bc5a0f42a 51
thedo 166:240bc5a0f42a 52 namespace cv { namespace cuda { namespace device
thedo 166:240bc5a0f42a 53 {
thedo 166:240bc5a0f42a 54 namespace reduce_detail
thedo 166:240bc5a0f42a 55 {
// Strips the pointer / reference / volatile qualifier from a tuple
// element type so the underlying value type can be named
// (e.g. volatile int* -> int, double& -> double).
template <typename T> struct GetType;

template <typename T> struct GetType<T*>          { typedef T type; };
template <typename T> struct GetType<volatile T*> { typedef T type; };
template <typename T> struct GetType<T&>          { typedef T type; };
thedo 166:240bc5a0f42a 69
thedo 166:240bc5a0f42a 70 template <unsigned int I, unsigned int N>
thedo 166:240bc5a0f42a 71 struct For
thedo 166:240bc5a0f42a 72 {
thedo 166:240bc5a0f42a 73 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 74 static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
thedo 166:240bc5a0f42a 75 {
thedo 166:240bc5a0f42a 76 thrust::get<I>(smem)[tid] = thrust::get<I>(val);
thedo 166:240bc5a0f42a 77
thedo 166:240bc5a0f42a 78 For<I + 1, N>::loadToSmem(smem, val, tid);
thedo 166:240bc5a0f42a 79 }
thedo 166:240bc5a0f42a 80 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 81 static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
thedo 166:240bc5a0f42a 82 {
thedo 166:240bc5a0f42a 83 thrust::get<I>(val) = thrust::get<I>(smem)[tid];
thedo 166:240bc5a0f42a 84
thedo 166:240bc5a0f42a 85 For<I + 1, N>::loadFromSmem(smem, val, tid);
thedo 166:240bc5a0f42a 86 }
thedo 166:240bc5a0f42a 87
thedo 166:240bc5a0f42a 88 template <class PointerTuple, class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 89 static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
thedo 166:240bc5a0f42a 90 {
thedo 166:240bc5a0f42a 91 typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
thedo 166:240bc5a0f42a 92 thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
thedo 166:240bc5a0f42a 93
thedo 166:240bc5a0f42a 94 For<I + 1, N>::merge(smem, val, tid, delta, op);
thedo 166:240bc5a0f42a 95 }
thedo 166:240bc5a0f42a 96 template <class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 97 static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
thedo 166:240bc5a0f42a 98 {
thedo 166:240bc5a0f42a 99 typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
thedo 166:240bc5a0f42a 100 thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
thedo 166:240bc5a0f42a 101
thedo 166:240bc5a0f42a 102 For<I + 1, N>::mergeShfl(val, delta, width, op);
thedo 166:240bc5a0f42a 103 }
thedo 166:240bc5a0f42a 104 };
thedo 166:240bc5a0f42a 105 template <unsigned int N>
thedo 166:240bc5a0f42a 106 struct For<N, N>
thedo 166:240bc5a0f42a 107 {
thedo 166:240bc5a0f42a 108 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 109 static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
thedo 166:240bc5a0f42a 110 {
thedo 166:240bc5a0f42a 111 }
thedo 166:240bc5a0f42a 112 template <class PointerTuple, class ValTuple>
thedo 166:240bc5a0f42a 113 static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
thedo 166:240bc5a0f42a 114 {
thedo 166:240bc5a0f42a 115 }
thedo 166:240bc5a0f42a 116
thedo 166:240bc5a0f42a 117 template <class PointerTuple, class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 118 static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
thedo 166:240bc5a0f42a 119 {
thedo 166:240bc5a0f42a 120 }
thedo 166:240bc5a0f42a 121 template <class ValTuple, class OpTuple>
thedo 166:240bc5a0f42a 122 static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
thedo 166:240bc5a0f42a 123 {
thedo 166:240bc5a0f42a 124 }
thedo 166:240bc5a0f42a 125 };
thedo 166:240bc5a0f42a 126
// Store this thread's value into its shared-memory slot.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
    smem[tid] = val;
}
// Read this thread's value back out of its shared-memory slot.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
    val = smem[tid];
}
thedo 166:240bc5a0f42a 137 template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
thedo 166:240bc5a0f42a 138 typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
thedo 166:240bc5a0f42a 139 __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
thedo 166:240bc5a0f42a 140 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 141 unsigned int tid)
thedo 166:240bc5a0f42a 142 {
thedo 166:240bc5a0f42a 143 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
thedo 166:240bc5a0f42a 144 }
thedo 166:240bc5a0f42a 145 template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
thedo 166:240bc5a0f42a 146 typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
thedo 166:240bc5a0f42a 147 __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
thedo 166:240bc5a0f42a 148 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 149 unsigned int tid)
thedo 166:240bc5a0f42a 150 {
thedo 166:240bc5a0f42a 151 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
thedo 166:240bc5a0f42a 152 }
thedo 166:240bc5a0f42a 153
// Combine this thread's value with its partner's value at tid + delta and
// publish the result back to shared memory so later passes observe it.
template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
    T reg = smem[tid + delta];
    smem[tid] = val = op(val, reg);
}
// Register-only variant: fetch the partner value via a warp shuffle
// (shfl_down) instead of shared memory, then combine.
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
    T reg = shfl_down(val, delta, width);
    val = op(val, reg);
}
thedo 166:240bc5a0f42a 166 template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
thedo 166:240bc5a0f42a 167 typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
thedo 166:240bc5a0f42a 168 class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
thedo 166:240bc5a0f42a 169 __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
thedo 166:240bc5a0f42a 170 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 171 unsigned int tid,
thedo 166:240bc5a0f42a 172 unsigned int delta,
thedo 166:240bc5a0f42a 173 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
thedo 166:240bc5a0f42a 174 {
thedo 166:240bc5a0f42a 175 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
thedo 166:240bc5a0f42a 176 }
thedo 166:240bc5a0f42a 177 template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
thedo 166:240bc5a0f42a 178 class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
thedo 166:240bc5a0f42a 179 __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
thedo 166:240bc5a0f42a 180 unsigned int delta,
thedo 166:240bc5a0f42a 181 unsigned int width,
thedo 166:240bc5a0f42a 182 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
thedo 166:240bc5a0f42a 183 {
thedo 166:240bc5a0f42a 184 For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
thedo 166:240bc5a0f42a 185 }
thedo 166:240bc5a0f42a 186
// Shared-memory tree reduction for a block of N threads. Each pass halves
// the number of active threads; barriers separate the passes while more
// than one warp is active. The result for the block ends up in the
// smem/val slot of tid 0.
template <unsigned int N> struct Generic
{
    // smem: pointer (or tuple of pointers) to at least N shared-memory slots.
    // val : this thread's input value(s); updated in place with partials.
    // tid : thread index within the block.
    // op  : binary functor (or tuple of functors) combining two values.
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        loadToSmem(smem, val, tid);
        // Only multi-warp blocks need a barrier before the first pass.
        if (N >= 32)
            __syncthreads();

        if (N >= 2048)
        {
            if (tid < 1024)
                merge(smem, val, tid, 1024, op);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(smem, val, tid, 512, op);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(smem, val, tid, 256, op);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(smem, val, tid, 128, op);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(smem, val, tid, 64, op);

            __syncthreads();
        }
        if (N >= 64)
        {
            if (tid < 32)
                merge(smem, val, tid, 32, op);
        }

        // Final passes run without barriers: only threads of one warp are
        // active. NOTE(review): this warp-synchronous idiom relies on
        // implicit lock-step warp execution (pre-Volta behavior) and on
        // callers passing volatile shared-memory pointers — confirm for
        // newer architectures.
        if (tid < 16)
        {
            merge(smem, val, tid, 16, op);
            merge(smem, val, tid, 8, op);
            merge(smem, val, tid, 4, op);
            merge(smem, val, tid, 2, op);
            merge(smem, val, tid, 1, op);
        }
    }
};
thedo 166:240bc5a0f42a 247
// Compile-time unrolled halving loop: performs merges with offsets
// I, I/2, ..., 1, terminating at the Unroll<0, ...> specialization.
template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
    // Shuffle-based step (arch >= 300); N is the shuffle width.
    static __device__ void loopShfl(Reference val, Op op, unsigned int N)
    {
        mergeShfl(val, I, N, op);
        Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
    }
    // Shared-memory step for older architectures.
    static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        merge(smem, val, tid, I, op);
        Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
    }
};
// Recursion terminator for the unrolled loop: offset 0 means done.
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
    static __device__ void loopShfl(Reference, Op, unsigned int)
    {
    }
    static __device__ void loop(Pointer, Reference, unsigned int, Op)
    {
    }
};
thedo 166:240bc5a0f42a 272
// Reduction across a single warp (intended for N <= 32, N a power of two).
template <unsigned int N> struct WarpOptimized
{
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
#if __CUDA_ARCH__ >= 300
        // Kepler and newer: register-only reduction via warp shuffles;
        // shared memory and tid are unused on this path.
        (void) smem;
        (void) tid;

        Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
#else
        // Older architectures: warp-synchronous shared-memory reduction —
        // no __syncthreads() needed within a single warp.
        loadToSmem(smem, val, tid);

        if (tid < N / 2)
            Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
    }
};
thedo 166:240bc5a0f42a 291
// Two-stage reduction for block sizes N that are multiples of the warp
// size: (1) each warp reduces its own 32 values, (2) the first warp
// reduces the M = N / 32 per-warp partial results.
template <unsigned int N> struct GenericOptimized32
{
    enum { M = N / 32 };  // number of warps, i.e. per-warp partial results

    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        const unsigned int laneId = Warp::laneId();

#if __CUDA_ARCH__ >= 300
        // Stage 1 (Kepler+): shuffle-based intra-warp reduction; lane 0
        // publishes the warp's partial result into smem[warp index].
        Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

        if (laneId == 0)
            loadToSmem(smem, val, tid / 32);
#else
        // Stage 1 (pre-Kepler): warp-synchronous shared-memory reduction.
        loadToSmem(smem, val, tid);

        if (laneId < 16)
            Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

        __syncthreads();

        if (laneId == 0)
            loadToSmem(smem, val, tid / 32);
#endif

        // Make all per-warp partials visible before stage 2.
        __syncthreads();

        loadFromSmem(smem, val, tid);

        // Stage 2: the first warp reduces the M partial results.
        if (tid < 32)
        {
#if __CUDA_ARCH__ >= 300
            Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
            Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
        }
    }
};
thedo 166:240bc5a0f42a 332
// Compile-time type selector: type is T1 when the flag is true,
// T2 otherwise (a C++98-friendly std::conditional).
template <bool val, class T1, class T2> struct StaticIf;

template <class T1, class T2> struct StaticIf<true, T1, T2>  { typedef T1 type; };
template <class T1, class T2> struct StaticIf<false, T1, T2> { typedef T2 type; };
thedo 166:240bc5a0f42a 342
// Compile-time test for "N is a non-zero power of two".
template <unsigned int N> struct IsPowerOf2
{
    enum { value = (N != 0) && ((N & (N - 1)) == 0) };
};
thedo 166:240bc5a0f42a 347
thedo 166:240bc5a0f42a 348 template <unsigned int N> struct Dispatcher
thedo 166:240bc5a0f42a 349 {
thedo 166:240bc5a0f42a 350 typedef typename StaticIf<
thedo 166:240bc5a0f42a 351 (N <= 32) && IsPowerOf2<N>::value,
thedo 166:240bc5a0f42a 352 WarpOptimized<N>,
thedo 166:240bc5a0f42a 353 typename StaticIf<
thedo 166:240bc5a0f42a 354 (N <= 1024) && IsPowerOf2<N>::value,
thedo 166:240bc5a0f42a 355 GenericOptimized32<N>,
thedo 166:240bc5a0f42a 356 Generic<N>
thedo 166:240bc5a0f42a 357 >::type
thedo 166:240bc5a0f42a 358 >::type reductor;
thedo 166:240bc5a0f42a 359 };
thedo 166:240bc5a0f42a 360 }
thedo 166:240bc5a0f42a 361 }}}
thedo 166:240bc5a0f42a 362
thedo 166:240bc5a0f42a 363 //! @endcond
thedo 166:240bc5a0f42a 364
thedo 166:240bc5a0f42a 365 #endif // __OPENCV_CUDA_REDUCE_DETAIL_HPP__
thedo 166:240bc5a0f42a 366