transform_detail.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
#define __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__

#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    namespace transform_detail
    {
        //! Read Write Traits

        template <typename T, typename D, int shift> struct UnaryReadWriteTraits
        {
            typedef typename TypeVec<T, shift>::vec_type read_type;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };

        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
        {
            typedef typename TypeVec<T1, shift>::vec_type read_type1;
            typedef typename TypeVec<T2, shift>::vec_type read_type2;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };
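        // Illustration (a reading aid, not part of the original header): the
        // traits map a scalar element type to the vector type used for wide
        // memory accesses. Assuming the usual TypeVec mapping from
        // vec_traits.hpp, with T = uchar and shift = 4,
        // UnaryReadWriteTraits<uchar, uchar, 4>::read_type is uchar4, so one
        // 32-bit load fetches four pixels at once.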
        //! Transform kernels

        template <int shift> struct OpUnroller;
        template <> struct OpUnroller<1>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
            }
        };
        template <> struct OpUnroller<2>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
            }
        };
        template <> struct OpUnroller<3>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src.z);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src1.z, src2.z);
            }
        };
        template <> struct OpUnroller<4>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src.z);
                if (mask(y, x_shifted + 3))
                    dst.w = op(src.w);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src1.z, src2.z);
                if (mask(y, x_shifted + 3))
                    dst.w = op(src1.w, src2.w);
            }
        };
        template <> struct OpUnroller<8>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__
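        // Note (descriptive comment, not in the original header): each
        // OpUnroller<N> specialization applies `op` to the N lanes of one
        // vectorized element, re-testing the mask per lane so that masked-out
        // pixels keep their previous destination value. The transformSmart
        // kernels below rely on this to process ft::smart_shift pixels per
        // thread with a single wide load and store.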
            void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.a0 = op(src.a0);
                if (mask(y, x_shifted + 1))
                    dst.a1 = op(src.a1);
                if (mask(y, x_shifted + 2))
                    dst.a2 = op(src.a2);
                if (mask(y, x_shifted + 3))
                    dst.a3 = op(src.a3);
                if (mask(y, x_shifted + 4))
                    dst.a4 = op(src.a4);
                if (mask(y, x_shifted + 5))
                    dst.a5 = op(src.a5);
                if (mask(y, x_shifted + 6))
                    dst.a6 = op(src.a6);
                if (mask(y, x_shifted + 7))
                    dst.a7 = op(src.a7);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.a0 = op(src1.a0, src2.a0);
                if (mask(y, x_shifted + 1))
                    dst.a1 = op(src1.a1, src2.a1);
                if (mask(y, x_shifted + 2))
                    dst.a2 = op(src1.a2, src2.a2);
                if (mask(y, x_shifted + 3))
                    dst.a3 = op(src1.a3, src2.a3);
                if (mask(y, x_shifted + 4))
                    dst.a4 = op(src1.a4, src2.a4);
                if (mask(y, x_shifted + 5))
                    dst.a5 = op(src1.a5, src2.a5);
                if (mask(y, x_shifted + 6))
                    dst.a6 = op(src1.a6, src2.a6);
                if (mask(y, x_shifted + 7))
                    dst.a7 = op(src1.a7, src2.a7);
            }
        };

        template <typename T, typename D, typename UnOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
        {
            typedef TransformFunctorTraits<UnOp> ft;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                {
                    const read_type src_n_el = ((const read_type*)src)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src[real_x]);
                    }
                }
            }
        }

        template <typename T, typename D, typename UnOp, typename Mask>
        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src.cols && y < src.rows && mask(y, x))
            {
                dst.ptr(y)[x] = op(src.ptr(y)[x]);
            }
        }
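        // Note (descriptive comment, not in the original header):
        // transformSmart handles rows whose width is not a multiple of
        // smart_shift with the scalar fallback loop above, while
        // transformSimple is the plain one-pixel-per-thread variant used when
        // vectorized access is not applicable. The binary (two-source) kernels
        // below mirror the same structure.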
        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
            const Mask mask, const BinOp op)
        {
            typedef TransformFunctorTraits<BinOp> ft;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src1_.rows)
            {
                const T1* src1 = src1_.ptr(y);
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                {
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src1[real_x], src2[real_x]);
                    }
                }
            }
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
            const Mask mask, const BinOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src1.cols && y < src1.rows && mask(y, x))
            {
                const T1 src1_data = src1.ptr(y)[x];
                const T2 src2_data = src2.ptr(y)[x];
                dst.ptr(y)[x] = op(src1_data, src2_data);
            }
        }

        template <bool UseSmart> struct TransformDispatcher;
        template<> struct TransformDispatcher<false>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
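        // Note (descriptive comment, not in the original header):
        // TransformDispatcher selects between the two kernel flavours.
        // TransformDispatcher<false> above always launches transformSimple;
        // TransformDispatcher<true> below launches the vectorized
        // transformSmart, but only when every data pointer and row step is
        // aligned to smart_shift * sizeof(element), falling back to the
        // simple path otherwise.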
        template<> struct TransformDispatcher<true>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

                CV_StaticAssert(ft::smart_shift != 1, "");

                if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
                    return;
                }

                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);

                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                CV_StaticAssert(ft::smart_shift != 1, "");

                if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
                    !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
                    return;
                }

                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);

                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
    } // namespace transform_detail
}}} // namespace cv { namespace cuda { namespace device

//! @endcond

#endif // __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
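// Usage sketch (an assumption based on OpenCV's companion transform.hpp, which
// is not part of this file): the public wrapper picks the dispatcher at
// compile time, roughly like
//
//     template <typename T, typename D, typename UnOp, typename Mask>
//     static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst,
//                                  UnOp op, const Mask& mask, cudaStream_t stream)
//     {
//         typedef TransformFunctorTraits<UnOp> ft;
//         transform_detail::TransformDispatcher<
//             VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1
//         >::call(src, dst, op, mask, stream);
//     }
//
// i.e. the smart (vectorized) path is only considered for single-channel
// element types whose functor traits declare a smart_shift greater than 1.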
