cuda_stream.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::cuda;

/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool;

    class MemoryStack
    {
    public:
        uchar* requestMemory(size_t size);
        void returnMemory(uchar* ptr);

        uchar* datastart;
        uchar* dataend;
        uchar* tip;

        bool isFree;
        MemoryPool* pool;

    #if !defined(NDEBUG)
        std::vector<size_t> allocations;
    #endif
    };

    uchar* MemoryStack::requestMemory(size_t size)
    {
        const size_t freeMem = dataend - tip;

        if (size > freeMem)
            return 0;

        uchar* ptr = tip;

        tip += size;

    #if !defined(NDEBUG)
        allocations.push_back(size);
    #endif

        return ptr;
    }

    void MemoryStack::returnMemory(uchar* ptr)
    {
        CV_DbgAssert( ptr >= datastart && ptr < dataend );

    #if !defined(NDEBUG)
        const size_t allocSize = tip - ptr;
        CV_Assert( allocSize == allocations.back() );
        allocations.pop_back();
    #endif

        tip = ptr;
    }
}

#endif

/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool
    {
    public:
        MemoryPool();

        void initialize(size_t stackSize, int stackCount);
        void release();

        MemoryStack* getFreeMemStack();
        void returnMemStack(MemoryStack* memStack);

    private:
        void initializeImpl();

        Mutex mtx_;

        bool initialized_;
        size_t stackSize_;
        int stackCount_;

        uchar* mem_;

        std::vector<MemoryStack> stacks_;
    };

    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
    {
        // default: 10 MB, 5 stacks
        stackSize_ = 10 * 1024 * 1024;
        stackCount_ = 5;
    }

    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);

        release();

        stackSize_ = stackSize;
        stackCount_ = stackCount;

        initializeImpl();
    }

    void MemoryPool::initializeImpl()
    {
        const size_t totalSize = stackSize_ * stackCount_;

        if (totalSize > 0)
        {
            cudaError_t err = cudaMalloc(&mem_, totalSize);
            if (err != cudaSuccess)
                return;

            stacks_.resize(stackCount_);

            uchar* ptr = mem_;

            for (int i = 0; i < stackCount_; ++i)
            {
                stacks_[i].datastart = ptr;
                stacks_[i].dataend = ptr + stackSize_;
                stacks_[i].tip = ptr;
                stacks_[i].isFree = true;
                stacks_[i].pool = this;

                ptr += stackSize_;
            }

            initialized_ = true;
        }
    }

    void MemoryPool::release()
    {
        if (mem_)
        {
        #if !defined(NDEBUG)
            for (int i = 0; i < stackCount_; ++i)
            {
                CV_DbgAssert( stacks_[i].isFree );
                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
            }
        #endif

            cudaFree(mem_);

            mem_ = 0;
            initialized_ = false;
        }
    }
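    // A minimal sketch of the intended usage contract (illustrative only, not
    // code from this file): the pool hands out whole MemoryStacks, and each
    // stack then serves allocations in strict LIFO order -- in debug builds
    // returnMemory() asserts that the pointer being returned is the most
    // recent allocation.
    //
    //     MemoryStack* stack = pool.getFreeMemStack();  // grab a free stack
    //     uchar* a = stack->requestMemory(256);         // bump 'tip' by 256
    //     uchar* b = stack->requestMemory(512);         // bump 'tip' by 512
    //     stack->returnMemory(b);                       // newest block first
    //     stack->returnMemory(a);                       // then the older one
    //     pool.returnMemStack(stack);                   // tip == datastart again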
    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);

        if (!initialized_)
            initializeImpl();

        if (!mem_)
            return 0;

        for (int i = 0; i < stackCount_; ++i)
        {
            if (stacks_[i].isFree)
            {
                stacks_[i].isFree = false;
                return &stacks_[i];
            }
        }

        return 0;
    }

    void MemoryPool::returnMemStack(MemoryStack* memStack)
    {
        AutoLock lock(mtx_);

        CV_DbgAssert( !memStack->isFree );

    #if !defined(NDEBUG)
        bool found = false;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (memStack == &stacks_[i])
            {
                found = true;
                break;
            }
        }
        CV_DbgAssert( found );
    #endif

        CV_DbgAssert( memStack->tip == memStack->datastart );

        memStack->isFree = true;
    }
}

#endif

////////////////////////////////////////////////////////////////
/// Stream::Impl

#ifndef HAVE_CUDA

class cv::cuda::Stream::Impl
{
public:
    Impl(void* ptr = 0)
    {
        (void) ptr;
        throw_no_cuda();
    }
};

#else

namespace
{
    class StackAllocator;
}

class cv::cuda::Stream::Impl
{
public:
    cudaStream_t stream;
    bool ownStream;

    Ptr<StackAllocator> stackAllocator;

    Impl();
    explicit Impl(cudaStream_t stream);

    ~Impl();
};

cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
{
    cudaSafeCall( cudaStreamCreate(&stream) );
    ownStream = true;

    stackAllocator = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
{
    stackAllocator = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::~Impl()
{
    stackAllocator.release();

    if (stream && ownStream)
    {
        cudaStreamDestroy(stream);
    }
}

#endif
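// Note on ownership (illustrative sketch, not part of the original file): a
// default-constructed Stream creates and later destroys its own cudaStream_t
// (ownStream == true), while an Impl wrapped around an existing handle leaves
// ownStream == false, so destroying the raw stream stays the caller's job:
//
//     cudaStream_t raw = 0;
//     cudaStreamCreate(&raw);
//     {
//         cv::cuda::Stream s = cv::cuda::StreamAccessor::wrapStream(raw);
//         // work enqueued through s runs on 'raw' ...
//     }   // s's Impl is released here, but 'raw' is NOT destroyed
//     cudaStreamDestroy(raw);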
/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

namespace cv { namespace cuda
{
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        Stream& getNullStream(int deviceId);
        MemoryPool* getMemoryPool(int deviceId);

    private:
        void initStreams();
        void initPools();

        std::vector<Ptr<Stream> > streams_;
        Mutex streams_mtx_;

        std::vector<MemoryPool> pools_;
        Mutex pools_mtx_;
    };

    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }

    DefaultDeviceInitializer::~DefaultDeviceInitializer()
    {
        streams_.clear();

        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(static_cast<int>(i));
            pools_[i].release();
        }

        pools_.clear();
    }

    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
    {
        AutoLock lock(streams_mtx_);

        if (streams_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                streams_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

        if (streams_[deviceId].empty())
        {
            cudaStream_t stream = NULL;
            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
        }

        return *streams_[deviceId];
    }

    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
    {
        AutoLock lock(pools_mtx_);

        if (pools_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                pools_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

        return &pools_[deviceId];
    }

    DefaultDeviceInitializer initializer;
}}

#endif

/////////////////////////////////////////////////////////////
/// Stream

cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}

bool cv::cuda::Stream::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaStreamQuery(impl_->stream);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}

void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    (void) event;
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}

#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}

#endif

void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    (void) callback;
    (void) userData;
    throw_no_cuda();
#else
    #if CUDART_VERSION < 5000
        (void) callback;
        (void) userData;
        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
    #else
        CallbackData* data = new CallbackData(callback, userData);

        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
    #endif
#endif
}

cv::cuda::Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    static Stream stream;
    return stream;
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}

cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}
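// Typical asynchronous usage (illustrative sketch; GpuMat::upload/download and
// the hypothetical 'someAsyncFunc' and 'onDone' names are not from this file):
//
//     cv::cuda::Stream stream;                      // creates its own cudaStream_t
//     gpuSrc.upload(hostSrc, stream);               // async H2D copy
//     cv::cuda::someAsyncFunc(gpuSrc, gpuDst, stream);
//     gpuDst.download(hostDst, stream);             // async D2H copy
//     stream.enqueueHostCallback(onDone, 0);        // fires after prior work
//     if (!stream.queryIfComplete())                // non-blocking poll
//         stream.waitForCompletion();               // blocking sync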
#ifdef HAVE_CUDA

cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
{
    return Stream(makePtr<Stream::Impl>(stream));
}

#endif

/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
    bool enableMemoryPool = true;

    class StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();

        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);

    private:
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);

        cudaStream_t stream_;
        MemoryStack* memStack_;
        size_t alignment_;
    };

    StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
    {
        if (enableMemoryPool)
        {
            const int deviceId = getDevice();
            memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
            DeviceInfo devInfo(deviceId);
            alignment_ = devInfo.textureAlignment();
        }
    }

    StackAllocator::~StackAllocator()
    {
        cudaStreamSynchronize(stream_);

        if (memStack_ != 0)
            memStack_->pool->returnMemStack(memStack_);
    }

    size_t alignUp(size_t what, size_t alignment)
    {
        size_t alignMask = alignment - 1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }

    bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (memStack_ == 0)
            return false;

        size_t pitch, memSize;

        if (rows > 1 && cols > 1)
        {
            pitch = alignUp(cols * elemSize, alignment_);
            memSize = pitch * rows;
        }
        else
        {
            // Single row or single column must be continuous
            pitch = elemSize * cols;
            memSize = alignUp(elemSize * cols * rows, 64);
        }

        uchar* ptr = memStack_->requestMemory(memSize);

        if (ptr == 0)
            return false;

        mat->data = ptr;
        mat->step = pitch;
        mat->refcount = (int*) fastMalloc(sizeof(int));

        return true;
    }

    void StackAllocator::free(GpuMat* mat)
    {
        if (memStack_ == 0)
            return;

        memStack_->returnMemory(mat->datastart);
        fastFree(mat->refcount);
    }
}

#endif

/////////////////////////////////////////////////////////////
/// BufferPool

void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    (void) on;
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}

void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
#ifndef HAVE_CUDA
    (void) deviceId;
    (void) stackSize;
    (void) stackCount;
    throw_no_cuda();
#else
    const int currentDevice = getDevice();

    if (deviceId >= 0)
    {
        setDevice(deviceId);
        initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
    }
    else
    {
        const int deviceCount = getCudaEnabledDeviceCount();

        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
        {
            setDevice(deviceId);
            initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
        }
    }

    setDevice(currentDevice);
#endif
}

#ifdef HAVE_CUDA

cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator.get())
{
}

GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}

#endif
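// BufferPool usage (illustrative sketch based on the functions above; the
// stack size and count are arbitrary example values). Because the pool is
// claimed in the StackAllocator constructor, setBufferPoolUsage() must be
// called before the Stream is created:
//
//     cv::cuda::setBufferPoolUsage(true);
//     cv::cuda::setBufferPoolConfig(cv::cuda::getDevice(),
//                                   64 * 1024 * 1024,  // 64 MB per stack
//                                   2);                // 2 stacks per device
//
//     cv::cuda::Stream stream;                         // grabs one MemoryStack
//     cv::cuda::BufferPool pool(stream);
//
//     cv::cuda::GpuMat d1 = pool.getBuffer(1024, 1024, CV_8UC1);
//     cv::cuda::GpuMat d2 = pool.getBuffer(1024, 1024, CV_8UC3);
//     // d2 must be released before d1 (LIFO), which scope-based destruction
//     // in reverse declaration order provides automatically.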
////////////////////////////////////////////////////////////////
// Event

#ifndef HAVE_CUDA

class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};

#else

class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event;
    bool ownEvent;

    explicit Impl(unsigned int flags);
    explicit Impl(cudaEvent_t event);
    ~Impl();
};

cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0), ownEvent(false)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
    ownEvent = true;
}

cv::cuda::Event::Impl::Impl(cudaEvent_t e) : event(e), ownEvent(false)
{
}

cv::cuda::Event::Impl::~Impl()
{
    if (event && ownEvent)
    {
        cudaEventDestroy(event);
    }
}

cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}

Event cv::cuda::EventAccessor::wrapEvent(cudaEvent_t event)
{
    return Event(makePtr<Event::Impl>(event));
}

#endif

cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    (void) flags;
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}

void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    (void) stream;
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}

bool cv::cuda::Event::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaEventQuery(impl_->event);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}

float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    (void) start;
    (void) end;
    throw_no_cuda();
    return 0.0f;
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}
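// Timing GPU work with events (illustrative sketch; 'someAsyncFunc' is a
// hypothetical stand-in for any CUDA-accelerated OpenCV call). Both events
// use the default creation flags, which keep timing enabled:
//
//     cv::cuda::Stream stream;
//     cv::cuda::Event start, stop;
//
//     start.record(stream);                       // mark the beginning
//     cv::cuda::someAsyncFunc(src, dst, stream);  // enqueue GPU work
//     stop.record(stream);                        // mark the end
//
//     stop.waitForCompletion();                   // block until 'stop' fires
//     float ms = cv::cuda::Event::elapsedTime(start, stop);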