Commit e46226fa authored by psychocrypt

OpenCL: auto config two threads per GPU

The auto config now generates two threads per GPU for AMD devices by default.

- subtract the 128MiB safety margin only from the maximum available GPU memory, not from the memory available to a single allocation call (see the sketch below)
- extend the memory documentation in amd.txt
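
For illustration, a minimal sketch of this budgeting rule as a hypothetical standalone helper (the variable names follow the diff below, but this function itself is not part of the commit):

#include <algorithm>
#include <cstddef>

// hypothetical helper: per-thread memory budget as the commit computes it
size_t memPerGpuThread(size_t freeMem, size_t maxMemPerAlloc, bool isAMD)
{
    const size_t minFreeMem = 128u * 1024u * 1024u;          // 128MiB safety margin
    const size_t maxAvailableFreeMem = freeMem - minFreeMem; // margin off the total, not per alloc
    size_t memPerThread = std::min(maxMemPerAlloc, maxAvailableFreeMem);
    if(isAMD) // two threads per AMD GPU share the total budget
        memPerThread = std::min(memPerThread, maxAvailableFreeMem / 2);
    return memPerThread;
}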
parent a8d09606
@@ -703,9 +703,9 @@ std::vector<GpuContext> getAMDDevices(int index)
 		{
 			GpuContext ctx;
 			std::vector<char> devNameVec(1024);
-			size_t maxMem;
-			if( devVendor.find("NVIDIA Corporation") != std::string::npos)
-				ctx.isNVIDIA = true;
+
+			ctx.isNVIDIA = isNVIDIADevice;
+			ctx.isAMD = isAMDDevice;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
@@ -713,7 +713,7 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
-			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
@@ -726,8 +726,8 @@ std::vector<GpuContext> getAMDDevices(int index)
 			}
 
 			// the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation
-			if(ctx.isNVIDIA)
-				maxMem = ctx.freeMem;
+			if(isNVIDIADevice)
+				ctx.maxMemPerAlloc = ctx.freeMem;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
@@ -746,7 +746,6 @@ std::vector<GpuContext> getAMDDevices(int index)
 			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
 			ctx.deviceIdx = k;
-			ctx.freeMem = std::min(ctx.freeMem, maxMem);
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
 			ctx.interleave = 40;
...
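
As context for the isNVIDIADevice/isAMDDevice flags the hunk above assigns, here is a self-contained sketch of vendor detection with the stock OpenCL API. The exact vendor strings matched are an assumption for illustration, not taken from this commit:

#include <CL/cl.h>
#include <string>
#include <vector>

int main()
{
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);

    // enumerate all GPU devices on the first platform
    cl_uint numDevices = 0;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
    std::vector<cl_device_id> devices(numDevices);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices.data(), NULL);

    for(cl_device_id dev : devices)
    {
        std::vector<char> vendorVec(256);
        clGetDeviceInfo(dev, CL_DEVICE_VENDOR, vendorVec.size(), vendorVec.data(), NULL);
        std::string devVendor(vendorVec.data());

        // mirrors what the diff stores into ctx.isNVIDIA / ctx.isAMD;
        // the matched substrings are assumptions, not the commit's code
        bool isNVIDIADevice = devVendor.find("NVIDIA Corporation") != std::string::npos;
        bool isAMDDevice = devVendor.find("Advanced Micro Devices") != std::string::npos;
        (void)isNVIDIADevice; (void)isAMDDevice; // silence unused warnings in this sketch
    }
    return 0;
}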
@@ -40,6 +40,7 @@ struct GpuContext
 	int memChunk;
 	int unroll = 0;
 	bool isNVIDIA = false;
+	bool isAMD = false;
 	int compMode;
 
 	/*Output vars*/
@@ -51,6 +52,7 @@ struct GpuContext
 	cl_program Program[2];
 	cl_kernel Kernels[2][8];
 	size_t freeMem;
+	size_t maxMemPerAlloc;
 	int computeUnits;
 	std::string name;
 	std::shared_ptr<InterleaveData> interleaveData;
...
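
The two struct members involved here map onto two standard clGetDeviceInfo queries. A minimal sketch, assuming a valid cl_device_id dev and assuming (as the surrounding code suggests) that freeMem is filled from CL_DEVICE_GLOBAL_MEM_SIZE; note the OpenCL spec types CL_DEVICE_MAX_MEM_ALLOC_SIZE as cl_ulong, while the struct stores a size_t:

#include <CL/cl.h>

void queryMemoryLimits(cl_device_id dev)
{
    cl_ulong globalMem = 0; // total device memory, presumably the source for freeMem
    cl_ulong maxAlloc = 0;  // largest single buffer, the source for maxMemPerAlloc
    clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalMem), &globalMem, NULL);
    clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlloc), &maxAlloc, NULL);
    // the OpenCL spec only guarantees maxAlloc >= max(globalMem / 4, 128 MiB);
    // NVIDIA's runtime allows allocating the full memory in one buffer, hence
    // the special case in the getAMDDevices() diff above
}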
@@ -158,14 +158,29 @@ private:
 		if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 			maxThreads *= 2u;
 
-		// keep 128MiB memory free (value is randomly chosen)
-		size_t availableMem = ctx.freeMem - minFreeMem;
+		// keep 128MiB memory free (value is randomly chosen) from the max available memory
+		const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;
+
+		size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);
+
+		uint32_t numThreads = 1u;
+		if(ctx.isAMD)
+		{
+			numThreads = 2;
+			size_t memDoubleThread = maxAvailableFreeMem / numThreads;
+			memPerThread = std::min(memPerThread, memDoubleThread);
+		}
+
 		// 224byte extra memory is used per thread for meta data
 		size_t perThread = hashMemSize + 224u;
-		size_t maxIntensity = availableMem / perThread;
+		size_t maxIntensity = memPerThread / perThread;
 		size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 		// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 		size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
+		// if we use two threads per gpu we can relax this and need no multiple of the compute unit count
+		if(numThreads == 2)
+			intensity = (possibleIntensity / 8) * 8;
 		//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
 		if (intensity == 0)
 		{
@@ -175,18 +190,22 @@ private:
 		}
 
 		if (intensity != 0)
 		{
-			conf += std::string("  // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
-			conf += std::string("  // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
-			// set 8 threads per block (this is a good value for most GPUs)
-			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-				"    \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
-				"  },\n";
+			for(uint32_t thd = 0; thd < numThreads; ++thd)
+			{
+				conf += "  // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+				conf += "  // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
+					std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
+				// set 8 threads per block (this is a good value for most GPUs)
+				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					"    \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+					"  },\n";
+			}
 		}
 		else
 		{
-			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(memPerThread / byteToMiB).c_str());
 		}
 	}
...
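
A worked example of the new intensity math, with every device number assumed purely for illustration (an 8 GiB AMD card reporting a 2 GiB max single allocation, 64 compute units, the 2 MiB cryptonight scratchpad, and an arbitrary maxThreads cap of 2048):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const size_t byteToMiB      = 1024u * 1024u;
    const size_t freeMem        = 8192u * byteToMiB; // assumed total GPU memory
    const size_t maxMemPerAlloc = 2048u * byteToMiB; // assumed CL max alloc (1/4 rule)
    const size_t minFreeMem     = 128u * byteToMiB;  // safety margin from the diff
    const size_t hashMemSize    = 2u * byteToMiB;    // cryptonight scratchpad
    const size_t computeUnits   = 64u;               // assumed
    const size_t maxThreads     = 2048u;             // assumed upper bound

    const size_t maxAvailableFreeMem = freeMem - minFreeMem;             // 8064 MiB
    size_t memPerThread = std::min(maxMemPerAlloc, maxAvailableFreeMem); // 2048 MiB
    const uint32_t numThreads = 2u;                                      // AMD path
    memPerThread = std::min(memPerThread, maxAvailableFreeMem / numThreads); // still 2048 MiB

    const size_t perThread = hashMemSize + 224u;          // scratchpad + meta data
    const size_t maxIntensity = memPerThread / perThread; // 1023
    const size_t possibleIntensity = std::min(maxThreads, maxIntensity);  // 1023

    // old one-thread rounding: multiple of 8 * computeUnits
    const size_t strictIntensity = (possibleIntensity / (8u * computeUnits)) * computeUnits * 8u; // 512
    // relaxed two-thread rounding: multiple of the work size only
    const size_t intensity = (possibleIntensity / 8u) * 8u; // 1016
    std::printf("%u threads, intensity %zu each (single-thread rounding would give %zu)\n",
        numThreads, intensity, strictIntensity);
    return 0;
}

This illustrates the design choice behind the relaxed rounding: with two threads already keeping the compute units busy, rounding 1023 down to a multiple of 8 yields 1016 per thread instead of 512.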