////////////////////////////////////////////////////////////////////
// UtilCUDA.h
//
// Copyright 2007 cDc@seacave
// Distributed under the Boost Software License, Version 1.0
// (See http://www.boost.org/LICENSE_1_0.txt)

#ifndef __SEACAVE_CUDA_H__
#define __SEACAVE_CUDA_H__

#ifdef _USE_CUDA


// I N C L U D E S /////////////////////////////////////////////////

// NOTE(review): in the reviewed copy of this file every "<...>" token
// (include targets, template parameter lists, cast targets) had been
// stripped; they are reconstructed here from how the names are used.
// <cuda.h> and <cuda_runtime.h> follow from the CUresult/cudaError_t APIs
// used below; the remaining three toolkit headers are a best guess --
// confirm against the build.

// CUDA driver
#include <cuda.h>

// CUDA toolkit
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <texture_types.h>
#include <curand_kernel.h>
#include <vector_types.h>


// D E F I N E S ///////////////////////////////////////////////////


// S T R U C T S ///////////////////////////////////////////////////

namespace SEACAVE {

namespace CUDA {

extern int desiredDeviceID;

// global list of initialized devices
struct Device {
	CUdevice ID;
	int major, minor;
	int computeMode;
	CUdevprop prop;
	CUcontext ctx;

	inline Device() : ctx(NULL) {}
	// destroys the context held by this entry (if any)
	inline ~Device() { if (ctx != NULL) cuCtxDestroy(ctx); }
};
typedef CLISTDEF0(Device) Devices;
extern Devices devices;

// outputs the proper CUDA error code in the event that a CUDA host call returns an error;
//  result       - value returned by the CUDA driver call
//  errorMessage - text describing the call (the macros below pass the stringified expression)
// returns result unchanged, so the call can be chained
inline CUresult __reportCudaError(CUresult result, LPCSTR errorMessage) {
	if (result == CUDA_SUCCESS)
		return CUDA_SUCCESS;
	// for an unrecognized code the driver sets the string to NULL and returns
	// an error; never hand a NULL pointer to a "%s" conversion
	LPCSTR szName(NULL);
	if (cuGetErrorName(result, &szName) != CUDA_SUCCESS || szName == NULL)
		szName = "unknown";
	LPCSTR szError(NULL);
	if (cuGetErrorString(result, &szError) != CUDA_SUCCESS || szError == NULL)
		szError = "unknown error";
	#ifdef _DEBUG
	VERBOSE("CUDA error at %s:%d: %s (%s (code %d) - %s)", __FILE__, __LINE__, errorMessage, szName, static_cast<int>(result), szError);
	#else
	DEBUG("CUDA error: %s (%s (code %d) - %s)", errorMessage, szName, static_cast<int>(result), szError);
	#endif
	ASSERT("CudaError" == NULL);
	return result;
}
// report the error (if any) and evaluate to the CUresult of the call
#define reportCudaError(val) CUDA::__reportCudaError(val, #val)
// report the error (if any) and return it from the enclosing function
#define checkCudaError(val) { const CUresult ret(CUDA::__reportCudaError(val, #val)); if (ret != CUDA_SUCCESS) return ret; }

// outputs the proper CUDA error code and abort in the event that a CUDA host call returns an error
inline void __ensureCudaResult(CUresult result, LPCSTR errorMessage) {
	// CUDA_SUCCESS is 0: the result must be compared explicitly, otherwise
	// the process exits on success and continues on failure
	if (__reportCudaError(result, errorMessage) == CUDA_SUCCESS)
		return;
	ASSERT("CudaAbort" == NULL);
	exit(EXIT_FAILURE);
}
#define ensureCudaResult(val) CUDA::__ensureCudaResult(val, #val)

// same as above, but for CUDA runtime API calls: log and abort on any error
inline void checkCudaCall(const cudaError_t error) {
	if (error == cudaSuccess)
		return;
	#ifdef _DEBUG
	VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
	#else
	DEBUG("CUDA error: %s (code %d)", cudaGetErrorString(error), error);
	#endif
	ASSERT("CudaError" == NULL);
	exit(EXIT_FAILURE);
}

// rounds up addr to the align boundary;
// the mask arithmetic is only meaningful when a is a power of two
template <typename T>
inline T align(T o, T a) {
	a -= T(1);
	return (o + a)&~a;
}

// initialize the given CUDA device and add it to the array of initialized devices;
// if the given device is -1, the best available device is selected
CUresult initDevice(int deviceID=-1);

// load/read module (program) from file/string and compile it
enum JIT { AUTO=0, STRING=1, FILE=2 };
CUresult ptxJIT(LPCSTR program, CUmodule& hModule, int mode=JIT::AUTO);

// requested function (kernel) from module (program)
CUresult ptxGetFunc(const CUmodule& hModule, LPCSTR functionName, CUfunction& hKernel);

// add a new parameter to the given kernel;
// paramOffset is first aligned for T and then advanced past the parameter
template <typename T>
inline CUresult addKernelParam(CUfunction& hKernel, int& paramOffset, const T& param) {
	paramOffset = align(paramOffset, (int)alignof(T));
	const CUresult result(cuParamSetv(hKernel, paramOffset, (void*)&param, sizeof(T)));
	paramOffset += sizeof(T);
	return result;
}

// allocate on the CUDA device a chunk of memory of the given size
inline CUresult allocMemDevice(size_t size, CUdeviceptr& dataDevice) {
	return cuMemAlloc(&dataDevice, size);
}

// copy on the CUDA device the given chunk of memory
inline CUresult copyMemDevice(const void* data, size_t size, CUdeviceptr dataDevice) {
	return cuMemcpyHtoD(dataDevice, data, size);
}

// allocate and copy on the CUDA device the given chunk of memory;
// returns the error of the step that failed (allocation or copy)
inline CUresult createReplicaDevice(const void* data, size_t size, CUdeviceptr& dataDevice) {
	const CUresult result(cuMemAlloc(&dataDevice, size));
	if (result != CUDA_SUCCESS)
		return result;
	return cuMemcpyHtoD(dataDevice, data, size);
}

// copy from the CUDA device the given chunk of memory
inline CUresult fetchMemDevice(void* data, size_t size, const CUdeviceptr dataDevice) {
	return cuMemcpyDtoH(data, dataDevice, size);
}

// free the given memory on the CUDA device;
// the handle is reset to 0 only if the driver reported success
inline CUresult freeMemDevice(CUdeviceptr& dataDevice) {
	const CUresult result(cuMemFree(dataDevice));
	if (result != CUDA_SUCCESS)
		return result;
	dataDevice = 0;
	return CUDA_SUCCESS;
}
/*----------------------------------------------------------------*/


// owner of a chunk of device memory; the "copy" constructor and the
// assignment operator TRANSFER ownership (the source is left empty)
class MemDevice {
protected:
	CUdeviceptr pData;
	size_t nSize;

public:
	inline MemDevice() : pData(0), nSize(0) {}
	inline MemDevice(size_t size) : pData(0), nSize(0) { reportCudaError(Reset(size)); }
	inline MemDevice(const void* pDataHost, size_t size) : pData(0), nSize(0) { reportCudaError(Reset(pDataHost, size)); }
	template <typename TYPE>
	inline MemDevice(const TImage<TYPE>& param) : pData(0), nSize(0) { reportCudaError(Reset(param)); }
	// NOTE(review): cList template parameter list reconstructed -- confirm against the cList declaration
	template <typename TYPE, typename ARG_TYPE, int useConstruct, int grow>
	inline MemDevice(const cList<TYPE,ARG_TYPE,useConstruct,grow>& param) : pData(0), nSize(0) { reportCudaError(Reset(param)); }
	inline ~MemDevice() { Release(); }

	MemDevice(MemDevice& rhs) : pData(rhs.pData), nSize(rhs.nSize) { rhs.pData = 0; rhs.nSize = 0; }
	MemDevice& operator=(MemDevice& rhs) {
		if (this != &rhs) {
			// drop the memory currently owned before taking over the new one
			Release();
			pData = rhs.pData;
			nSize = rhs.nSize;
			rhs.pData = 0;
			rhs.nSize = 0;
		}
		return *this;
	}

	inline bool IsValid() const {
		return (pData != 0);
	}
	void Release();
	CUresult Reset(size_t size);
	CUresult Reset(const void* pDataHost, size_t size);
	template <typename TYPE>
	inline CUresult Reset(const TImage<TYPE>& param) {
		ASSERT(!param.empty() && param.isContinuous());
		return Reset(param.getData(), sizeof(TYPE)*param.area());
	}
	template <typename TYPE, typename ARG_TYPE, int useConstruct, int grow>
	inline CUresult Reset(const cList<TYPE,ARG_TYPE,useConstruct,grow>& param) {
		ASSERT(!param.IsEmpty());
		return Reset(param.GetData(), param.GetDataSize());
	}

	CUresult SetData(const void* pDataHost, size_t size);
	template <typename TYPE>
	inline CUresult SetData(const TImage<TYPE>& param) {
		ASSERT(!param.empty() && param.isContinuous());
		return SetData(param.getData(), sizeof(TYPE)*param.area());
	}
	template <typename TYPE, typename ARG_TYPE, int useConstruct, int grow>
	inline CUresult SetData(const cList<TYPE,ARG_TYPE,useConstruct,grow>& param) {
		ASSERT(!param.IsEmpty());
		return SetData(param.GetData(), param.GetDataSize());
	}

	CUresult GetData(void* pDataHost, size_t size) const;
	template <typename TYPE>
	inline CUresult GetData(TImage<TYPE>& param) const {
		ASSERT(!param.empty() && param.isContinuous());
		return GetData(param.getData(), sizeof(TYPE)*param.area());
	}
	template <typename TYPE, typename ARG_TYPE, int useConstruct, int grow>
	inline CUresult GetData(cList<TYPE,ARG_TYPE,useConstruct,grow>& param) const {
		ASSERT(!param.IsEmpty());
		return GetData(param.GetData(), param.GetDataSize());
	}

	inline operator CUdeviceptr() const {
		return pData;
	}
};
typedef CSharedPtr<MemDevice> MemDevicePtr;
typedef CLISTDEFIDX(MemDevice,int) MemDeviceArr;
/*----------------------------------------------------------------*/


// non-copyable owner of a CUDA event
class EventRT {
protected:
	CUevent hEvent;

protected:
	EventRT(const EventRT&);
	EventRT& operator=(const EventRT&);

public:
	// the handle is cleared first so Reset()/Release() never see an indeterminate value
	inline EventRT(unsigned flags = CU_EVENT_DEFAULT) : hEvent(NULL) { reportCudaError(Reset(flags)); }
	inline ~EventRT() { Release(); }

	inline bool IsValid() const {
		return (hEvent != NULL);
	}
	void Release();
	CUresult Reset(unsigned flags = CU_EVENT_DEFAULT);

	inline operator CUevent() const {
		return hEvent;
	}
};
typedef CSharedPtr<EventRT> EventRTPtr;
/*----------------------------------------------------------------*/


// non-copyable owner of a CUDA stream
class StreamRT {
protected:
	CUstream hStream;

protected:
	StreamRT(const StreamRT&);
	StreamRT& operator=(const StreamRT&);

public:
	// the handle is cleared first so Reset()/Release() never see an indeterminate value
	inline StreamRT(unsigned flags = CU_STREAM_DEFAULT) : hStream(NULL) { reportCudaError(Reset(flags)); }
	inline ~StreamRT() { Release(); }

	inline bool IsValid() const {
		return (hStream != NULL);
	}
	void Release();
	CUresult Reset(unsigned flags = CU_STREAM_DEFAULT);

	inline operator CUstream() const {
		return hStream;
	}

	CUresult Wait(CUevent hEvent);
};
typedef CSharedPtr<StreamRT> StreamRTPtr;
/*----------------------------------------------------------------*/


// non-copyable owner of a CUDA module (program)
class ModuleRT {
protected:
	CUmodule hModule;

protected:
	ModuleRT(const ModuleRT&);
	ModuleRT& operator=(const ModuleRT&);

public:
	inline ModuleRT() : hModule(NULL) {}
	// the handle is cleared first so Reset()/Release() never see an indeterminate value;
	// the outcome of Reset() is not reported here: query it with IsValid()
	inline ModuleRT(LPCSTR program, int mode=JIT::AUTO) : hModule(NULL) { Reset(program, mode); }
	inline ~ModuleRT() { Release(); }

	inline bool IsValid() const {
		return (hModule != NULL);
	}
	void Release();
	CUresult Reset(LPCSTR program, int mode=JIT::AUTO);

	inline operator CUmodule() const {
		return hModule;
	}
};
typedef CSharedPtr<ModuleRT> ModuleRTPtr;
/*----------------------------------------------------------------*/


// non-copyable wrapper of a kernel function together with the module it
// comes from, an optional stream, and the device buffers used as arguments
class KernelRT {
public:
	ModuleRTPtr ptrModule;
	StreamRTPtr ptrStream;
	CUfunction hKernel;
	MemDeviceArr inDatas; // array of pointers to the allocated memory read by the program
	MemDeviceArr outDatas; // array of pointers to the allocated memory written by the program
	int paramOffset; // used during parameter insertion to remember current parameter position

protected:
	KernelRT(const KernelRT&);
	KernelRT& operator=(const KernelRT&);

public:
	// all constructors clear the handle and the offset before any Reset() runs
	inline KernelRT() : hKernel(NULL), paramOffset(0) {}
	inline KernelRT(const ModuleRTPtr& _ptrModule, LPCSTR functionName) : ptrModule(_ptrModule), hKernel(NULL), paramOffset(0) { Reset(functionName); }
	inline KernelRT(LPCSTR program, LPCSTR functionName, int mode=JIT::AUTO) : hKernel(NULL), paramOffset(0) { Reset(program, functionName, mode); }
	inline ~KernelRT() { Release(); }

	inline bool IsValid() const {
		ASSERT(hKernel == NULL || (ptrModule != NULL && ptrModule->IsValid()));
		return (hKernel != NULL);
	}
	void Release();
	void Reset();
	CUresult Reset(LPCSTR functionName);
	CUresult Reset(const ModuleRTPtr& _ptrModule, LPCSTR functionName);
	CUresult Reset(LPCSTR program, LPCSTR functionName, int mode=JIT::AUTO);

	struct InputParam {
		const void* data; // pointer to host data to be allocated and copied to the CUDA device
		size_t size; // size in bytes of the data
		inline InputParam() {}
		inline InputParam(const void* _data, size_t _size) : data(_data), size(_size) {}
	};
	struct OutputParam {
		size_t size; // size in bytes of the data to be allocated on the CUDA device
		inline OutputParam() {}
		inline OutputParam(size_t _size) : size(_size) {}
	};
	// launch the program with the given parameters;
	//  numThreads - total number of threads to run (must be positive)
	//  args - variadic parameters to be passed to the kernel
	// the block size is capped by maxThreadsPerBlock of the last initialized
	// device, and the grid is rounded up, so more threads than requested may
	// run: the kernel has to bounds-check its index;
	// returns CUDA_ERROR_INVALID_VALUE if numThreads is not positive
	#ifdef _SUPPORT_CPP11
	template <typename... Args>
	CUresult operator()(int numThreads, Args&&... args) {
		ASSERT(IsValid());
		// an empty range would yield a zero block size and a division by zero below
		if (numThreads <= 0)
			return CUDA_ERROR_INVALID_VALUE;
		Reset();
		CUresult result;
		// set the kernel parameters (Driver API)
		if ((result=AddParam(std::forward<Args>(args)...)) != CUDA_SUCCESS)
			return result;
		if ((result=cuParamSetSize(hKernel, paramOffset)) != CUDA_SUCCESS)
			return result;
		// launch the kernel (Driver API)
		const CUdevprop& deviceProp = CUDA::devices.back().prop;
		const int numBlockThreads(MAXF(MINF(numThreads, deviceProp.maxThreadsPerBlock), 1));
		const int nBlocks(MAXF((numThreads+numBlockThreads-1)/numBlockThreads, 1));
		if ((result=cuFuncSetBlockShape(hKernel, numBlockThreads, 1, 1)) != CUDA_SUCCESS)
			return result;
		if (ptrStream != NULL)
			return cuLaunchGridAsync(hKernel, nBlocks, 1, *ptrStream);
		return cuLaunchGrid(hKernel, nBlocks, 1);
	}
	// same for 2D data;
	// the block keeps (approximately) the aspect ratio of numThreads and is
	// shrunk so that it holds at most maxThreadsPerBlock threads
	// NOTE(review): TPoint2 template arguments reconstructed (int/REAL) -- confirm
	template <typename... Args>
	CUresult operator()(const TPoint2<int>& numThreads, Args&&... args) {
		ASSERT(IsValid());
		if (numThreads.x <= 0 || numThreads.y <= 0)
			return CUDA_ERROR_INVALID_VALUE;
		Reset();
		CUresult result;
		// set the kernel parameters (Driver API)
		if ((result=AddParam(std::forward<Args>(args)...)) != CUDA_SUCCESS)
			return result;
		if ((result=cuParamSetSize(hKernel, paramOffset)) != CUDA_SUCCESS)
			return result;
		// launch the kernel (Driver API)
		const CUdevprop& deviceProp = CUDA::devices.back().prop;
		const int maxBlockThreads(MAXF(deviceProp.maxThreadsPerBlock, 1));
		// multiply in floating point: x*y can overflow an int
		const REAL scale(MINF(REAL(1), SQRT((REAL)maxBlockThreads/((REAL)numThreads.x*(REAL)numThreads.y))));
		SEACAVE::TPoint2<int> numBlockThreads(FLOOR2INT(SEACAVE::TPoint2<REAL>(numThreads)*scale));
		// flooring a very elongated shape can give 0 along the short side;
		// clamp it to 1 and re-limit the other side so x*y still fits the block
		numBlockThreads.y = MAXF(MINF(numBlockThreads.y, maxBlockThreads), 1);
		numBlockThreads.x = MAXF(MINF(numBlockThreads.x, maxBlockThreads/numBlockThreads.y), 1);
		const TPoint2<int> nBlocks(
			MAXF((numThreads.x+numBlockThreads.x-1)/numBlockThreads.x, 1),
			MAXF((numThreads.y+numBlockThreads.y-1)/numBlockThreads.y, 1));
		if ((result=cuFuncSetBlockShape(hKernel, numBlockThreads.x, numBlockThreads.y, 1)) != CUDA_SUCCESS)
			return result;
		if (ptrStream != NULL)
			return cuLaunchGridAsync(hKernel, nBlocks.x, nBlocks.y, *ptrStream);
		return cuLaunchGrid(hKernel, nBlocks.x, nBlocks.y);
	}
	#endif // _SUPPORT_CPP11

	struct ReturnParam {
		void* data; // pointer to host data to be written with the output data from the CUDA device
		size_t size; // size in bytes of the data
		inline ReturnParam() {}
		inline ReturnParam(void* _data, size_t _size) : data(_data), size(_size) {}
	};
	// copy back to the host the content of a device buffer
	CUresult GetResult(const CUdeviceptr data, const ReturnParam& param) const;
	inline CUresult GetResult(const MemDevice& memDev, const ReturnParam& param) const {
		return memDev.GetData(param.data, param.size);
	}
	// idx selects the buffer inside outDatas
	inline CUresult GetResult(int idx, const ReturnParam& param) const {
		return GetResult(outDatas[idx], param);
	}
	template <typename TYPE>
	inline CUresult GetResult(int idx, const TImage<TYPE>& param) const {
		ASSERT(!param.empty() && param.isContinuous());
		return GetResult(idx, ReturnParam(param.getData(), sizeof(TYPE)*param.area()));
	}
	template <typename TYPE, typename ARG_TYPE, int useConstruct, int grow>
	inline CUresult GetResult(int idx, const cList<TYPE,ARG_TYPE,useConstruct,grow>& param) const {
		ASSERT(!param.IsEmpty());
		return GetResult(idx, ReturnParam(param.GetData(), param.GetDataSize()));
	}
	CUresult GetResult(const std::initializer_list<ReturnParam>& params) const;

protected:
	CUresult _AddParam(const InputParam& param);
	CUresult _AddParam(const OutputParam& param);
	// any other type is passed to the kernel by value
	template <typename T>
	inline CUresult _AddParam(const T& param) {
		return addKernelParam(hKernel, paramOffset, param);
	}
	// an existing device buffer is passed as its device pointer
	inline CUresult _AddParam(const MemDevice& param) {
		ASSERT(param.IsValid());
		return addKernelParam(hKernel, paramOffset, (CUdeviceptr)param);
	}
	template <typename TYPE>
	inline CUresult _AddParam(const TImage<TYPE>& param) {
		ASSERT(!param.empty() && param.isContinuous());
		return _AddParam(InputParam(param.getData(), sizeof(TYPE)*param.area()));
	}
	template <typename TYPE, typename ARG_TYPE, int useConstruct, int grow>
	inline CUresult _AddParam(const cList<TYPE,ARG_TYPE,useConstruct,grow>& param) {
		ASSERT(!param.IsEmpty());
		return _AddParam(InputParam(param.GetData(), param.GetDataSize()));
	}
	#ifdef _SUPPORT_CPP11
	// end of recursion; also lets a kernel without parameters be launched
	inline CUresult AddParam() {
		return CUDA_SUCCESS;
	}
	template <typename T>
	inline CUresult AddParam(T&& param) {
		return _AddParam(std::forward<T>(param));
	}
	// add the parameters one by one, stopping at the first failure
	template <typename T, typename... Args>
	inline CUresult AddParam(T&& param, Args&&... args) {
		CUresult result(AddParam(std::forward<T>(param)));
		if (result != CUDA_SUCCESS)
			return result;
		if ((result=AddParam(std::forward<Args>(args)...)) != CUDA_SUCCESS)
			return result;
		return CUDA_SUCCESS;
	}
	#endif // _SUPPORT_CPP11
};
typedef CSharedPtr<KernelRT> KernelRTPtr;
/*----------------------------------------------------------------*/


namespace ARRAY {

// maps a channel type to the matching CUDA array format
// NOTE(review): the specialization arguments were stripped from the reviewed
// copy; they are reconstructed from the CU_AD_FORMAT_* names -- confirm
// (in particular the half-float type name)
template <typename T> struct traits { static const CUarray_format format; };
template<> struct traits<uint8_t> { static const CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; };
template<> struct traits<uint16_t> { static const CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT16; };
template<> struct traits<uint32_t> { static const CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT32; };
template<> struct traits<int8_t> { static const CUarray_format format = CU_AD_FORMAT_SIGNED_INT8; };
template<> struct traits<int16_t> { static const CUarray_format format = CU_AD_FORMAT_SIGNED_INT16; };
template<> struct traits<int32_t> { static const CUarray_format format = CU_AD_FORMAT_SIGNED_INT32; };
template<> struct traits<hfloat> { static const CUarray_format format = CU_AD_FORMAT_HALF; };
template<> struct traits<float> { static const CUarray_format format = CU_AD_FORMAT_FLOAT; };

} // namespace ARRAY

// owner of a CUDA array; the "copy" constructor and the assignment
// operator TRANSFER ownership (the source is left empty)
template <typename TYPE>
class TArrayRT {
public:
	typedef TYPE Type;
	typedef TImage<TYPE> ImageType;

protected:
	CUarray hArray;

public:
	inline TArrayRT() : hArray(NULL) {}
	inline TArrayRT(const Image8U::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
	inline TArrayRT(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(width, height, depth, flags)); }
	inline ~TArrayRT() { Release(); }

	TArrayRT(TArrayRT& rhs) : hArray(rhs.hArray) { rhs.hArray = NULL; }
	TArrayRT& operator=(TArrayRT& rhs) {
		if (this != &rhs) {
			// destroy the array currently owned before taking over the new one
			Release();
			hArray = rhs.hArray;
			rhs.hArray = NULL;
		}
		return *this;
	}

	inline bool IsValid() const {
		return (hArray != NULL);
	}
	void Release() {
		if (hArray) {
			reportCudaError(cuArrayDestroy(hArray));
			hArray = NULL;
		}
	}
	inline CUresult Reset(const Image8U::Size& size, unsigned flags=0) {
		return Reset((unsigned)size.width, (unsigned)size.height, 0, flags);
	}
	// destroy the current array (if any) and create a new one of the given extent
	CUresult Reset(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) {
		Release();
		CUDA_ARRAY3D_DESCRIPTOR prop;
		prop.Width = width;
		prop.Height = height;
		prop.Depth = depth;
		prop.Format = ARRAY::traits<Type>::format;
		prop.NumChannels = cv::DataType<Type>::channels;
		prop.Flags = flags;
		CUresult ret(cuArray3DCreate(&hArray, &prop));
		if (ret != CUDA_SUCCESS)
			hArray = NULL;
		return ret;
	}

	operator CUarray() const {
		return hArray;
	}
	operator CUarray&() {
		return hArray;
	}

	// query the descriptor of the array;
	// an all-zero descriptor is returned if there is no array or the query fails
	CUDA_ARRAY3D_DESCRIPTOR GetDescriptor() const {
		CUDA_ARRAY3D_DESCRIPTOR prop;
		memset(&prop, 0, sizeof(CUDA_ARRAY3D_DESCRIPTOR));
		if (hArray != NULL && reportCudaError(cuArray3DGetDescriptor(&prop, hArray)) != CUDA_SUCCESS)
			memset(&prop, 0, sizeof(CUDA_ARRAY3D_DESCRIPTOR));
		return prop;
	}
	unsigned Width() const {
		return (unsigned)GetDescriptor().Width;
	}
	unsigned Height() const {
		return (unsigned)GetDescriptor().Height;
	}
	unsigned Depth() const {
		return (unsigned)GetDescriptor().Depth;
	}
	unsigned NumChannels() const {
		return GetDescriptor().NumChannels;
	}
	CUarray_format Format() const {
		return GetDescriptor().Format;
	}
	unsigned Flags() const {
		return GetDescriptor().Flags;
	}
	// the descriptor is queried once (same arithmetic as calling the accessors above)
	size_t Size() const {
		const CUDA_ARRAY3D_DESCRIPTOR prop(GetDescriptor());
		const unsigned depth((unsigned)prop.Depth);
		return sizeof(Type)*(unsigned)prop.Width*(unsigned)prop.Height*(depth>0?depth:1)*prop.NumChannels;
	}

	// copy some data from host memory to device memory
	// NOTE(review): row_stride() is used both as pitch and as row width in
	// bytes, which presumes image rows carry no padding -- confirm
	CUresult SetData(const ImageType& image) {
		ASSERT(IsValid() && !image.empty());
		CUDA_MEMCPY2D param;
		memset(&param, 0, sizeof(CUDA_MEMCPY2D));
		param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
		param.dstArray = hArray;
		param.srcMemoryType = CU_MEMORYTYPE_HOST;
		param.srcHost = image.getData();
		param.srcPitch = image.row_stride();
		param.WidthInBytes = image.row_stride();
		param.Height = image.height();
		return cuMemcpy2D(&param);
	}

	// copy data from device memory to host memory
	CUresult GetData(ImageType& image) const {
		ASSERT(IsValid() && !image.empty());
		CUDA_MEMCPY2D param;
		memset(&param, 0, sizeof(CUDA_MEMCPY2D));
		param.dstMemoryType = CU_MEMORYTYPE_HOST;
		param.dstHost = image.getData();
		param.dstPitch = image.row_stride();
		param.srcMemoryType = CU_MEMORYTYPE_ARRAY;
		param.srcArray = hArray;
		param.WidthInBytes = image.row_stride();
		param.Height = image.height();
		return cuMemcpy2D(&param);
	}
};
// NOTE(review): element types reconstructed from the typedef names -- confirm
typedef TArrayRT<uint8_t> ArrayRT8U;
typedef TArrayRT<uint32_t> ArrayRT32U;
typedef TArrayRT<hfloat> ArrayRT16F;
typedef TArrayRT<float> ArrayRT32F;
/*----------------------------------------------------------------*/


// texture reference looked up by name inside a module
template <typename TYPE>
class TTextureRT {
public:
	typedef TArrayRT<TYPE> ArrayType;
	typedef typename ArrayType::Type Type;
	typedef typename ArrayType::ImageType ImageType;

public:
	ModuleRTPtr ptrModule;
	CUtexref hTexref;

public:
	inline TTextureRT() : hTexref(NULL) {}
	inline TTextureRT(const ModuleRTPtr& _ptrModule, LPCSTR texrefName, CUfilter_mode filtermode=CU_TR_FILTER_MODE_POINT, CUaddress_mode addrmode=CU_TR_ADDRESS_MODE_CLAMP, bool bNormalizedCoords=false) : ptrModule(_ptrModule), hTexref(NULL) { Reset(texrefName, filtermode, addrmode, bNormalizedCoords); }
	inline ~TTextureRT() { Release(); }

	inline bool IsValid() const {
		ASSERT(hTexref == NULL || (ptrModule != NULL && ptrModule->IsValid()));
		return (hTexref != NULL);
	}
	// drop the reference to the module and forget the handle
	void Release() {
		ptrModule.Release();
		hTexref = NULL;
	}
	// look up the texture reference in the current module and configure it;
	// on a failed lookup the object is released and the error returned
	CUresult Reset(LPCSTR texrefName, CUfilter_mode filtermode=CU_TR_FILTER_MODE_POINT, CUaddress_mode addrmode=CU_TR_ADDRESS_MODE_CLAMP, bool bNormalizedCoords=false) {
		// get the texture-reference handle (Driver API)
		ASSERT(ptrModule != NULL && ptrModule->IsValid());
		const CUresult result(cuModuleGetTexRef(&hTexref, *ptrModule, texrefName));
		if (result != CUDA_SUCCESS) {
			// do not go on configuring a handle that was never obtained
			Release();
			return result;
		}
		// set texture parameters
		checkCudaError(cuTexRefSetFilterMode(hTexref, filtermode));
		if (bNormalizedCoords) {
			checkCudaError(cuTexRefSetFlags(hTexref, CU_TRSF_NORMALIZED_COORDINATES));
			for (int i=0; i<2; ++i)
				checkCudaError(cuTexRefSetAddressMode(hTexref, i, addrmode));
		} else {
			// addrmode is deliberately ignored here: without normalized
			// coordinates the driver documents clamp as the only supported mode
			for (int i=0; i<2; ++i)
				checkCudaError(cuTexRefSetAddressMode(hTexref, i, CU_TR_ADDRESS_MODE_CLAMP));
		}
		checkCudaError(cuTexRefSetFormat(hTexref, ARRAY::traits<Type>::format, cv::DataType<Type>::channels));
		return CUDA_SUCCESS;
	}
	inline CUresult Reset(const ModuleRTPtr& _ptrModule, LPCSTR texrefName, CUfilter_mode filtermode=CU_TR_FILTER_MODE_POINT, CUaddress_mode addrmode=CU_TR_ADDRESS_MODE_CLAMP, bool bNormalizedCoords=false) {
		// set module
		ptrModule = _ptrModule;
		// set texture
		return Reset(texrefName, filtermode, addrmode, bNormalizedCoords);
	}

	// bind the given array to the texture
	CUresult Bind(ArrayType& array) {
		return cuTexRefSetArray(hTexref, array, CU_TRSA_OVERRIDE_FORMAT);
	}
	// fetch the array bound to the texture;
	// on success array refers to the bound handle (whatever it held before is
	// destroyed, unless it already was that handle);
	// NOTE: array will destroy the handle when released, so the caller must
	// make sure no other TArrayRT owns it as well
	CUresult Fetch(ArrayType& array) {
		CUarray hBound(NULL);
		const CUresult result(cuTexRefGetArray(&hBound, hTexref));
		if (result != CUDA_SUCCESS)
			return result;
		CUarray& hArray = array;
		if (hArray != hBound) {
			array.Release();
			hArray = hBound;
		}
		return CUDA_SUCCESS;
	}
};
typedef TTextureRT<uint8_t> TextureRT8U;
typedef TTextureRT<uint32_t> TextureRT32U;
typedef TTextureRT<hfloat> TextureRT16F;
typedef TTextureRT<float> TextureRT32F;
/*----------------------------------------------------------------*/


// surface reference looked up by name inside a module
template <typename TYPE>
class TSurfaceRT {
public:
	typedef TArrayRT<TYPE> ArrayType;
	typedef typename ArrayType::Type Type;
	typedef typename ArrayType::ImageType ImageType;

public:
	ModuleRTPtr ptrModule;
	CUsurfref hSurfref;

public:
	inline TSurfaceRT() : hSurfref(NULL) {}
	inline TSurfaceRT(const ModuleRTPtr& _ptrModule, LPCSTR surfrefName) : ptrModule(_ptrModule), hSurfref(NULL) { Reset(surfrefName); }
	inline ~TSurfaceRT() { Release(); }

	inline bool IsValid() const {
		ASSERT(hSurfref == NULL || (ptrModule != NULL && ptrModule->IsValid()));
		return (hSurfref != NULL);
	}
	// drop the reference to the module and forget the handle
	void Release() {
		ptrModule.Release();
		hSurfref = NULL;
	}
	// look up the surface reference in the current module;
	// on failure the object is released and the error returned
	CUresult Reset(LPCSTR surfrefName) {
		// get the surface-reference handle (Driver API)
		ASSERT(ptrModule != NULL && ptrModule->IsValid());
		const CUresult result(cuModuleGetSurfRef(&hSurfref, *ptrModule, surfrefName));
		if (result != CUDA_SUCCESS)
			Release();
		return result;
	}
	inline CUresult Reset(const ModuleRTPtr& _ptrModule, LPCSTR texrefName) {
		// set module
		ptrModule = _ptrModule;
		// set surface
		return Reset(texrefName);
	}

	// bind the given array to the surface
	CUresult Bind(const ArrayType& array) {
		return cuSurfRefSetArray(hSurfref, array, 0);
	}
	// fetch the array bound to the surface;
	// array is an output, hence a non-const reference (the previous const
	// signature passed the arguments in the wrong order and could not be
	// instantiated, so no caller can depend on it);
	// on success array refers to the bound handle (whatever it held before is
	// destroyed, unless it already was that handle);
	// NOTE: array will destroy the handle when released, so the caller must
	// make sure no other TArrayRT owns it as well
	CUresult Fetch(ArrayType& array) const {
		CUarray hBound(NULL);
		const CUresult result(cuSurfRefGetArray(&hBound, hSurfref));
		if (result != CUDA_SUCCESS)
			return result;
		CUarray& hArray = array;
		if (hArray != hBound) {
			array.Release();
			hArray = hBound;
		}
		return CUDA_SUCCESS;
	}
};
typedef TSurfaceRT<uint8_t> SurfaceRT8U;
typedef TSurfaceRT<uint32_t> SurfaceRT32U;
typedef TSurfaceRT<hfloat> SurfaceRT16F;
typedef TSurfaceRT<float> SurfaceRT32F;
/*----------------------------------------------------------------*/

} // namespace CUDA

} // namespace SEACAVE

#endif // _USE_CUDA

#endif // __SEACAVE_CUDA_H__