/*========================================================================= * * Copyright Insight Software Consortium * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0.txt * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * *=========================================================================*/ #ifndef itkGPUReduction_hxx #define itkGPUReduction_hxx #include "itkMacro.h" #include "itkGPUReduction.h" //#define CPU_VERIFY namespace itk { /** * Default constructor */ template< typename TElement > GPUReduction< TElement > ::GPUReduction() { /*** Prepare GPU opencl program ***/ m_GPUKernelManager = GPUKernelManager::New(); m_GPUDataManager = ITK_NULLPTR; m_ReduceGPUKernelHandle = 0; m_TestGPUKernelHandle = 0; m_Size = 0; m_SmallBlock = false; } template< typename TElement > GPUReduction< TElement > ::~GPUReduction() { this->ReleaseGPUInputBuffer(); } /** * Standard "PrintSelf" method. */ template< typename TElement > void GPUReduction< TElement > ::PrintSelf(std::ostream & os, Indent indent) const { Superclass::PrintSelf(os, indent); //GetTypenameInString( typeid(TElement), os); } template< typename TElement > unsigned int GPUReduction< TElement > ::NextPow2( unsigned int x ) { --x; x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; return ++x; } template< typename TElement > bool GPUReduction< TElement > ::isPow2(unsigned int x) { return ((x&(x-1)) == 0); } template< typename TElement > void GPUReduction< TElement > ::GetNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads) { if (whichKernel < 3) { threads = (n < maxThreads) ? this->NextPow2(n) : maxThreads; blocks = (n + threads - 1) / threads; } else { threads = (n < maxThreads*2) ? this->NextPow2((n + 1)/ 2) : maxThreads; blocks = (n + (threads * 2 - 1)) / (threads * 2); } if (whichKernel == 6) { if (maxBlocks < blocks) { blocks = maxBlocks; } } } template< typename TElement > unsigned int GPUReduction< TElement > ::GetReductionKernel(int whichKernel, int blockSize, int isPowOf2) { if ( whichKernel != 5 && whichKernel != 6 ) { itkExceptionMacro(<< "Reduction kernel undefined!"); return 0; } std::ostringstream defines; defines << "#define blockSize " << blockSize << std::endl; defines << "#define nIsPow2 " << isPowOf2 << std::endl; defines << "#define T "; GetTypenameInString( typeid ( TElement ), defines ); std::cout << "Defines: " << defines.str() << std::endl; const char* GPUSource = GPUReduction::GetOpenCLSource(); // load and build program this->m_GPUKernelManager->LoadProgramFromString( GPUSource, defines.str().c_str() ); std::ostringstream kernelName; kernelName << "reduce" << whichKernel; unsigned int handle = this->m_GPUKernelManager->CreateKernel(kernelName.str().c_str()); size_t wgSize; cl_int ciErrNum = this->m_GPUKernelManager->GetKernelWorkGroupInfo(handle, CL_KERNEL_WORK_GROUP_SIZE, &wgSize); OpenCLCheckError(ciErrNum, __FILE__, __LINE__, ITK_LOCATION); m_SmallBlock = (wgSize == 64); // NOTE: the program will get deleted when the kernel is also released //this->m_GPUKernelManager->ReleaseProgram(); return handle; } template< typename TElement > void GPUReduction< TElement > ::AllocateGPUInputBuffer(TElement *h_idata) { unsigned int bytes = m_Size * sizeof(TElement); m_GPUDataManager = GPUDataManager::New(); m_GPUDataManager->SetBufferSize( bytes ); m_GPUDataManager->SetCPUBufferPointer( h_idata ); m_GPUDataManager->Allocate(); if (h_idata) { m_GPUDataManager->SetGPUDirtyFlag(true); } } template< typename TElement > void GPUReduction< TElement > ::ReleaseGPUInputBuffer() { if (m_GPUDataManager == (GPUDataPointer)ITK_NULLPTR) { return; } m_GPUDataManager->Initialize(); } template< typename TElement > TElement GPUReduction< TElement > ::RandomTest() { int size = (1<<24)-1917; // number of elements to reduce this->InitializeKernel(size); unsigned int bytes = size * sizeof(TElement); TElement* h_idata = (TElement*)malloc(bytes); for(int i=0; iAllocateGPUInputBuffer(h_idata); TElement gpu_result = this->GPUGenerateData(); std::cout << "GPU result = " << gpu_result << std::endl << std::flush; TElement cpu_result = this->CPUGenerateData(h_idata, size); std::cout << "CPU result = " << cpu_result << std::endl; this->ReleaseGPUInputBuffer(); free(h_idata); return 0; } template< typename TElement > void GPUReduction< TElement > ::InitializeKernel(unsigned int size) { m_Size = size; // Create a testing kernel to decide block size // m_TestGPUKernelHandle = this->GetReductionKernel(6, 64, 1); //m_GPUKernelManager->ReleaseKernel(kernelHandle); // number of threads per block int maxThreads = m_SmallBlock ? 64 : 128; int whichKernel = 6; int maxBlocks = 64; int numBlocks = 0; int numThreads = 0; this->GetNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); m_ReduceGPUKernelHandle = this->GetReductionKernel(whichKernel, numThreads, isPow2(size) ); } template< typename TElement > TElement GPUReduction< TElement > ::GPUGenerateData() { unsigned int size = m_Size; // number of threads per block int maxThreads = m_SmallBlock ? 64 : 128; int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = true; int cpuFinalThreshold = 1; int numBlocks = 0; int numThreads = 0; this->GetNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; // allocate output data for the result TElement* h_odata = (TElement*)malloc(numBlocks * sizeof(TElement)); GPUDataPointer odata = GPUDataManager::New(); odata->SetBufferSize( numBlocks * sizeof(TElement) ); odata->SetCPUBufferPointer( h_odata ); odata->Allocate(); odata->SetCPUDirtyFlag(true); double dTotalTime = 0.0; m_GPUResult = 0; m_GPUResult = this->GPUReduce(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, cpuFinalReduction, cpuFinalThreshold, &dTotalTime, m_GPUDataManager, odata); // cleanup free(h_odata); return m_GPUResult; } template< typename TElement > TElement GPUReduction< TElement > ::GPUReduce( cl_int n, int numThreads, int numBlocks, int itkNotUsed(maxThreads), int itkNotUsed(maxBlocks), int itkNotUsed(whichKernel), bool itkNotUsed(cpuFinalReduction), int itkNotUsed(cpuFinalThreshold), double* itkNotUsed(dTotalTime), GPUDataPointer idata, GPUDataPointer odata) { TElement gpu_result = 0; // arguments set up int argidx = 0; this->m_GPUKernelManager->SetKernelArgWithImage(m_ReduceGPUKernelHandle, argidx++, idata); this->m_GPUKernelManager->SetKernelArgWithImage(m_ReduceGPUKernelHandle, argidx++, odata); this->m_GPUKernelManager->SetKernelArg(m_ReduceGPUKernelHandle, argidx++, sizeof(cl_int), &n); //shared memory below this->m_GPUKernelManager->SetKernelArg(m_ReduceGPUKernelHandle, argidx++, sizeof(TElement) * numThreads, ITK_NULLPTR); size_t globalSize[1]; size_t localSize[1]; gpu_result = 0; // execute the kernel globalSize[0] = numBlocks * numThreads; localSize[0] = numThreads; this->m_GPUKernelManager->LaunchKernel(m_ReduceGPUKernelHandle, 1, globalSize, localSize ); odata->SetCPUDirtyFlag(true); TElement* h_odata = (TElement*)odata->GetCPUBufferPointer(); #ifdef CPU_VERIFY idata->SetCPUDirtyFlag(true); TElement* h_idata = (TElement*)idata->GetCPUBufferPointer(); //debug if (!h_idata) { h_idata = (TElement*)malloc(sizeof(TElement) * n); idata->SetCPUBufferPointer(h_idata); idata->SetCPUDirtyFlag(true); h_idata = (TElement*)idata->GetCPUBufferPointer(); //debug } TElement CPUSum = this->CPUGenerateData(h_idata, n); std::cout << "CPU_VERIFY sum = " << CPUSum << std::endl; #endif for(int i=0; i TElement GPUReduction< TElement > ::CPUGenerateData(TElement *data, int size) { TElement sum = data[0]; // TElement c = (TElement)0.0; for (int i = 1; i < size; i++) { // TODO consider using compensated sum algorithm sum = sum + data[i]; } m_CPUResult = sum; return sum; } } // end namespace itk #endif