the do / gr-peach-opencv-project

Fork of gr-peach-opencv-project by the do

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers cuda_stream.cpp Source File

cuda_stream.cpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Third party copyrights are property of their respective owners.
00016 //
00017 // Redistribution and use in source and binary forms, with or without modification,
00018 // are permitted provided that the following conditions are met:
00019 //
00020 //   * Redistribution's of source code must retain the above copyright notice,
00021 //     this list of conditions and the following disclaimer.
00022 //
00023 //   * Redistribution's in binary form must reproduce the above copyright notice,
00024 //     this list of conditions and the following disclaimer in the documentation
00025 //     and/or other materials provided with the distribution.
00026 //
00027 //   * The name of the copyright holders may not be used to endorse or promote products
00028 //     derived from this software without specific prior written permission.
00029 //
00030 // This software is provided by the copyright holders and contributors "as is" and
00031 // any express or implied warranties, including, but not limited to, the implied
00032 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00033 // In no event shall the Intel Corporation or contributors be liable for any direct,
00034 // indirect, incidental, special, exemplary, or consequential damages
00035 // (including, but not limited to, procurement of substitute goods or services;
00036 // loss of use, data, or profits; or business interruption) however caused
00037 // and on any theory of liability, whether in contract, strict liability,
00038 // or tort (including negligence or otherwise) arising in any way out of
00039 // the use of this software, even if advised of the possibility of such damage.
00040 //
00041 //M*/
00042 
00043 #include "precomp.hpp"
00044 
00045 using namespace cv;
00046 using namespace cv::cuda;
00047 
00048 /////////////////////////////////////////////////////////////
00049 /// MemoryStack
00050 
00051 #ifdef HAVE_CUDA
00052 
00053 namespace
00054 {
00055     class MemoryPool;
00056 
00057     class MemoryStack
00058     {
00059     public:
00060         uchar* requestMemory(size_t size);
00061         void returnMemory(uchar* ptr);
00062 
00063         uchar* datastart;
00064         uchar* dataend;
00065         uchar* tip;
00066 
00067         bool isFree;
00068         MemoryPool* pool;
00069 
00070     #if !defined(NDEBUG)
00071         std::vector<size_t> allocations;
00072     #endif
00073     };
00074 
00075     uchar* MemoryStack::requestMemory(size_t size)
00076     {
00077         const size_t freeMem = dataend - tip;
00078 
00079         if (size > freeMem)
00080             return 0;
00081 
00082         uchar* ptr = tip;
00083 
00084         tip += size;
00085 
00086     #if !defined(NDEBUG)
00087         allocations.push_back(size);
00088     #endif
00089 
00090         return ptr;
00091     }
00092 
00093     void MemoryStack::returnMemory(uchar* ptr)
00094     {
00095         CV_DbgAssert( ptr >= datastart && ptr < dataend );
00096 
00097     #if !defined(NDEBUG)
00098         const size_t allocSize = tip - ptr;
00099         CV_Assert( allocSize == allocations.back() );
00100         allocations.pop_back();
00101     #endif
00102 
00103         tip = ptr;
00104     }
00105 }
00106 
00107 #endif
00108 
00109 /////////////////////////////////////////////////////////////
00110 /// MemoryPool
00111 
00112 #ifdef HAVE_CUDA
00113 
00114 namespace
00115 {
00116     class MemoryPool
00117     {
00118     public:
00119         MemoryPool();
00120 
00121         void initialize(size_t stackSize, int stackCount);
00122         void release();
00123 
00124         MemoryStack* getFreeMemStack();
00125         void returnMemStack(MemoryStack* memStack);
00126 
00127     private:
00128         void initilizeImpl();
00129 
00130         Mutex mtx_;
00131 
00132         bool initialized_;
00133         size_t stackSize_;
00134         int stackCount_;
00135 
00136         uchar* mem_;
00137 
00138         std::vector<MemoryStack> stacks_;
00139     };
00140 
00141     MemoryPool::MemoryPool() : initialized_(false), mem_(0)
00142     {
00143         // default : 10 Mb, 5 stacks
00144         stackSize_ = 10 * 1024 * 1024;
00145         stackCount_ = 5;
00146     }
00147 
00148     void MemoryPool::initialize(size_t stackSize, int stackCount)
00149     {
00150         AutoLock lock(mtx_);
00151 
00152         release();
00153 
00154         stackSize_ = stackSize;
00155         stackCount_ = stackCount;
00156 
00157         initilizeImpl();
00158     }
00159 
00160     void MemoryPool::initilizeImpl()
00161     {
00162         const size_t totalSize = stackSize_ * stackCount_;
00163 
00164         if (totalSize > 0)
00165         {
00166             cudaError_t err = cudaMalloc(&mem_, totalSize);
00167             if (err != cudaSuccess)
00168                 return;
00169 
00170             stacks_.resize(stackCount_);
00171 
00172             uchar* ptr = mem_;
00173 
00174             for (int i = 0; i < stackCount_; ++i)
00175             {
00176                 stacks_[i].datastart = ptr;
00177                 stacks_[i].dataend = ptr + stackSize_;
00178                 stacks_[i].tip = ptr;
00179                 stacks_[i].isFree = true;
00180                 stacks_[i].pool = this;
00181 
00182                 ptr += stackSize_;
00183             }
00184 
00185             initialized_ = true;
00186         }
00187     }
00188 
00189     void MemoryPool::release()
00190     {
00191         if (mem_)
00192         {
00193 #if !defined(NDEBUG)
00194             for (int i = 0; i < stackCount_; ++i)
00195             {
00196                 CV_DbgAssert( stacks_[i].isFree );
00197                 CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
00198             }
00199 #endif
00200 
00201             cudaFree(mem_);
00202 
00203             mem_ = 0;
00204             initialized_ = false;
00205         }
00206     }
00207 
00208     MemoryStack* MemoryPool::getFreeMemStack()
00209     {
00210         AutoLock lock(mtx_);
00211 
00212         if (!initialized_)
00213             initilizeImpl();
00214 
00215         if (!mem_)
00216             return 0;
00217 
00218         for (int i = 0; i < stackCount_; ++i)
00219         {
00220             if (stacks_[i].isFree)
00221             {
00222                 stacks_[i].isFree = false;
00223                 return &stacks_[i];
00224             }
00225         }
00226 
00227         return 0;
00228     }
00229 
00230     void MemoryPool::returnMemStack(MemoryStack* memStack)
00231     {
00232         AutoLock lock(mtx_);
00233 
00234         CV_DbgAssert( !memStack->isFree );
00235 
00236 #if !defined(NDEBUG)
00237         bool found = false;
00238         for (int i = 0; i < stackCount_; ++i)
00239         {
00240             if (memStack == &stacks_[i])
00241             {
00242                 found = true;
00243                 break;
00244             }
00245         }
00246         CV_DbgAssert( found );
00247 #endif
00248 
00249         CV_DbgAssert( memStack->tip == memStack->datastart );
00250 
00251         memStack->isFree = true;
00252     }
00253 }
00254 
00255 #endif
00256 
00257 ////////////////////////////////////////////////////////////////
00258 /// Stream::Impl
00259 
00260 #ifndef HAVE_CUDA
00261 
00262 class cv::cuda::Stream::Impl
00263 {
00264 public:
00265     Impl(void* ptr = 0)
00266     {
00267         (void) ptr;
00268         throw_no_cuda();
00269     }
00270 };
00271 
00272 #else
00273 
00274 namespace
00275 {
00276     class StackAllocator;
00277 }
00278 
// CUDA implementation of cv::cuda::Stream: the raw stream handle, an
// ownership flag, and the per-stream stack allocator used by BufferPool.
class cv::cuda::Stream::Impl
{
public:
    cudaStream_t stream;
    bool ownStream;   // true when this Impl created the stream and must destroy it

    Ptr<StackAllocator>  stackAllocator;

    // Creates a new CUDA stream (owned).
    Impl();
    // Wraps an existing CUDA stream handle (not owned).
    explicit Impl(cudaStream_t stream);

    ~Impl();
};
00292 
// Creates a new CUDA stream and the stack allocator that serves
// BufferPool allocations on it.
cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
{
    cudaSafeCall( cudaStreamCreate(&stream) );
    // Mark ownership only after creation has succeeded.
    ownStream = true;

    stackAllocator = makePtr<StackAllocator>(stream);
}
00300 
// Wraps an externally created stream handle (or the default stream when
// stream_ == 0); the wrapper does not take ownership of the handle.
cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
{
    stackAllocator = makePtr<StackAllocator>(stream);
}
00305 
00306 cv::cuda::Stream::Impl::~Impl()
00307 {
00308     stackAllocator.release();
00309 
00310     if (stream && ownStream)
00311     {
00312         cudaStreamDestroy(stream);
00313     }
00314 }
00315 
00316 #endif
00317 
00318 /////////////////////////////////////////////////////////////
00319 /// DefaultDeviceInitializer
00320 
00321 #ifdef HAVE_CUDA
00322 
namespace cv { namespace cuda
{
    // Lazily creates, and owns, the per-device default ("null") stream
    // wrappers and the per-device buffer-pool MemoryPools. A single
    // file-scope instance (`initializer` below) serves the whole process.
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        // Returns the shared wrapper around the default CUDA stream of
        // deviceId, creating it on first use.
        Stream& getNullStream(int deviceId);
        // Returns the memory pool of deviceId, creating the pool vector on
        // first use.
        MemoryPool* getMemoryPool(int deviceId);

    private:
        void initStreams();
        void initPools();

        std::vector<Ptr<Stream> > streams_;
        Mutex streams_mtx_;

        std::vector<MemoryPool> pools_;
        Mutex pools_mtx_;
    };

    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }

    DefaultDeviceInitializer::~DefaultDeviceInitializer()
    {
        streams_.clear();

        // Each pool holds device memory, so the matching device must be
        // current when the pool releases it.
        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(static_cast<int>(i));
            pools_[i].release();
        }

        pools_.clear();
    }

    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
    {
        AutoLock lock(streams_mtx_);

        // First call: create one empty slot per CUDA device.
        if (streams_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                streams_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

        // Wrap the default stream (handle 0) on first request;
        // Stream::Impl(cudaStream_t) does not take ownership of it.
        if (streams_[deviceId].empty())
        {
            cudaStream_t stream = NULL;
            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
        }

        return *streams_[deviceId];
    }

    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
    {
        AutoLock lock(pools_mtx_);

        // First call: create one (not yet allocated) pool per CUDA device.
        if (pools_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                pools_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

        return &pools_[deviceId];
    }

    // Process-wide instance used by Stream::Null(), setBufferPoolConfig()
    // and StackAllocator.
    DefaultDeviceInitializer initializer;
}}
00405 
00406 #endif
00407 
00408 /////////////////////////////////////////////////////////////
00409 /// Stream
00410 
// Creates a new, owned asynchronous CUDA stream.
cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}
00419 
00420 bool cv::cuda::Stream::queryIfComplete() const
00421 {
00422 #ifndef HAVE_CUDA
00423     throw_no_cuda();
00424     return false;
00425 #else
00426     cudaError_t err = cudaStreamQuery(impl_->stream);
00427 
00428     if (err == cudaErrorNotReady || err == cudaSuccess)
00429         return err == cudaSuccess;
00430 
00431     cudaSafeCall(err);
00432     return false;
00433 #endif
00434 }
00435 
// Blocks the calling host thread until all work queued on this stream has
// finished.
void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}
00444 
// Queues a wait on `event` into this stream: work submitted to the stream
// after this call will not start until the event has completed.
void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    (void) event;
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}
00454 
00455 #if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
00456 
namespace
{
    // Heap-allocated bundle smuggled through cudaStreamAddCallback's
    // user-data pointer.
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

    // Trampoline invoked by the CUDA runtime: forwards to the user callback,
    // then frees the bundle (each CallbackData is used exactly once).
    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}
00474 
00475 #endif
00476 
00477 void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
00478 {
00479 #ifndef HAVE_CUDA
00480     (void) callback;
00481     (void) userData;
00482     throw_no_cuda();
00483 #else
00484     #if CUDART_VERSION < 5000
00485         (void) callback;
00486         (void) userData;
00487         CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
00488     #else
00489         CallbackData* data = new CallbackData(callback, userData);
00490 
00491         cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
00492     #endif
00493 #endif
00494 }
00495 
// Returns the shared wrapper around the default CUDA stream of the
// current device.
cv::cuda::Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    // Unreachable (throw_no_cuda raises); present to satisfy the signature.
    static Stream stream;
    return stream;
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}
00507 
// Safe-bool conversion: non-null for a real (non-default) stream handle,
// null for the default stream. The member-pointer return type prevents
// accidental comparisons between Stream objects.
cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}
00516 
00517 #ifdef HAVE_CUDA
00518 
// Exposes the raw cudaStream_t behind a Stream for interop with CUDA code.
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}
00523 
// Wraps an existing cudaStream_t in a cv::cuda::Stream; the returned
// Stream does not take ownership of the handle.
Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
{
    return Stream(makePtr<Stream::Impl>(stream));
}
00528 
00529 #endif
00530 
00531 /////////////////////////////////////////////////////////////
00532 /// StackAllocator
00533 
00534 #ifdef HAVE_CUDA
00535 
00536 namespace
00537 {
00538     bool enableMemoryPool = true;
00539 
00540     class StackAllocator : public GpuMat::Allocator 
00541     {
00542     public:
00543         explicit StackAllocator(cudaStream_t stream);
00544         ~StackAllocator();
00545 
00546         bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
00547         void free(GpuMat* mat);
00548 
00549     private:
00550         StackAllocator(const StackAllocator&);
00551         StackAllocator& operator =(const StackAllocator&);
00552 
00553         cudaStream_t stream_;
00554         MemoryStack* memStack_;
00555         size_t alignment_;
00556     };
00557 
00558     StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
00559     {
00560         if (enableMemoryPool)
00561         {
00562             const int deviceId = getDevice();
00563             memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
00564             DeviceInfo devInfo(deviceId);
00565             alignment_ = devInfo.textureAlignment();
00566         }
00567     }
00568 
00569     StackAllocator::~StackAllocator()
00570     {
00571         cudaStreamSynchronize(stream_);
00572 
00573         if (memStack_ != 0)
00574             memStack_->pool->returnMemStack(memStack_);
00575     }
00576 
00577     size_t alignUp(size_t what, size_t alignment)
00578     {
00579         size_t alignMask = alignment-1;
00580         size_t inverseAlignMask = ~alignMask;
00581         size_t res = (what + alignMask) & inverseAlignMask;
00582         return res;
00583     }
00584 
00585     bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
00586     {
00587         if (memStack_ == 0)
00588             return false;
00589 
00590         size_t pitch, memSize;
00591 
00592         if (rows > 1 && cols > 1)
00593         {
00594             pitch = alignUp(cols * elemSize, alignment_);
00595             memSize = pitch * rows;
00596         }
00597         else
00598         {
00599             // Single row or single column must be continuous
00600             pitch = elemSize * cols;
00601             memSize = alignUp(elemSize * cols * rows, 64);
00602         }
00603 
00604         uchar* ptr = memStack_->requestMemory(memSize);
00605 
00606         if (ptr == 0)
00607             return false;
00608 
00609         mat->data = ptr;
00610         mat->step = pitch;
00611         mat->refcount = (int*) fastMalloc(sizeof(int));
00612 
00613         return true;
00614     }
00615 
00616     void StackAllocator::free(GpuMat* mat)
00617     {
00618         if (memStack_ == 0)
00619             return;
00620 
00621         memStack_->returnMemory(mat->datastart);
00622         fastFree(mat->refcount);
00623     }
00624 }
00625 
00626 #endif
00627 
00628 /////////////////////////////////////////////////////////////
00629 /// BufferPool
00630 
// Enables/disables the stream-bound buffer pool. Only affects
// StackAllocators created after this call (the flag is read in the
// StackAllocator constructor).
void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    (void)on;
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}
00640 
00641 void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
00642 {
00643 #ifndef HAVE_CUDA
00644     (void)deviceId;
00645     (void)stackSize;
00646     (void)stackCount;
00647     throw_no_cuda();
00648 #else
00649     const int currentDevice = getDevice();
00650 
00651     if (deviceId >= 0)
00652     {
00653         setDevice(deviceId);
00654         initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
00655     }
00656     else
00657     {
00658         const int deviceCount = getCudaEnabledDeviceCount();
00659 
00660         for (deviceId = 0; deviceId < deviceCount; ++deviceId)
00661         {
00662             setDevice(deviceId);
00663             initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
00664         }
00665     }
00666 
00667     setDevice(currentDevice);
00668 #endif
00669 }
00670 
00671 #ifdef HAVE_CUDA
00672 
// A BufferPool draws memory from the stack allocator of the given stream;
// the allocator is owned by the stream's Impl, which must outlive the pool.
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator.get())
{
}
00676 
// Returns a rows x cols matrix of the given type whose memory is requested
// through this pool's stream-bound stack allocator.
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}
00683 
00684 #endif
00685 
00686 ////////////////////////////////////////////////////////////////
00687 // Event
00688 
00689 #ifndef HAVE_CUDA
00690 
// Placeholder used when OpenCV is built without CUDA support: any attempt
// to construct an event fails immediately.
class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};
00699 
00700 #else
00701 
// CUDA implementation of cv::cuda::Event: the raw event handle plus an
// ownership flag.
class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event;
    bool ownEvent;   // true when this Impl created the event and must destroy it

    // Creates a new CUDA event with the given creation flags (owned).
    explicit Impl(unsigned int flags);
    // Wraps an existing CUDA event handle (not owned).
    explicit Impl(cudaEvent_t event);
    ~Impl();
};
00712 
// Creates a new CUDA event with the requested flags.
cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0), ownEvent(false)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
    // Mark ownership only after creation has succeeded.
    ownEvent = true;
}
00718 
// Wraps an externally created event handle without taking ownership.
cv::cuda::Event::Impl::Impl(cudaEvent_t e) : event(e), ownEvent(false)
{
}
00722 
00723 cv::cuda::Event::Impl::~Impl()
00724 {
00725     if (event && ownEvent)
00726     {
00727         cudaEventDestroy(event);
00728     }
00729 }
00730 
// Exposes the raw cudaEvent_t behind an Event for interop with CUDA code.
cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}
00735 
// Wraps an existing cudaEvent_t in a cv::cuda::Event; the returned Event
// does not take ownership of the handle.
Event cv::cuda::EventAccessor::wrapEvent(cudaEvent_t event)
{
    return Event(makePtr<Event::Impl>(event));
}
00740 
00741 #endif
00742 
// Creates a new, owned CUDA event with the given creation flags.
cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    (void) flags;
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}
00752 
// Records the event into the given stream: the event completes once all
// work queued on the stream before this call has finished.
void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    (void) stream;
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}
00762 
00763 bool cv::cuda::Event::queryIfComplete() const
00764 {
00765 #ifndef HAVE_CUDA
00766     throw_no_cuda();
00767     return false;
00768 #else
00769     cudaError_t err = cudaEventQuery(impl_->event);
00770 
00771     if (err == cudaErrorNotReady || err == cudaSuccess)
00772         return err == cudaSuccess;
00773 
00774     cudaSafeCall(err);
00775     return false;
00776 #endif
00777 }
00778 
// Blocks the calling host thread until the event has completed.
void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}
00787 
// Returns the elapsed time in milliseconds between the recordings of
// `start` and `end`. Both events must have completed, otherwise
// cudaSafeCall reports the error from cudaEventElapsedTime.
float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    (void) start;
    (void) end;
    throw_no_cuda();
    return 0.0f;
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}
00801