Commit e46226fa authored by psychocrypt

OpenCL: auto config two threads per GPU

The auto config now generates two threads per GPU for AMD devices by default.

- subtract the 128MiB safety margin only from the maximum available GPU memory, not from the memory available to a single allocation call (see the sketch below)
- extend the memory documentation in amd.txt
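
For illustration, a minimal sketch of this budgeting rule as a hypothetical standalone helper (the variable names follow the diff below, but this function itself is not part of the commit):

#include <algorithm>
#include <cstddef>

// hypothetical helper: per-thread memory budget as the commit computes it
size_t memPerGpuThread(size_t freeMem, size_t maxMemPerAlloc, bool isAMD)
{
    const size_t minFreeMem = 128u * 1024u * 1024u;          // 128MiB safety margin
    const size_t maxAvailableFreeMem = freeMem - minFreeMem; // margin off the total, not per alloc
    size_t memPerThread = std::min(maxMemPerAlloc, maxAvailableFreeMem);
    if(isAMD) // two threads per AMD GPU share the total budget
        memPerThread = std::min(memPerThread, maxAvailableFreeMem / 2);
    return memPerThread;
}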
parent a8d09606
@@ -703,9 +703,9 @@ std::vector<GpuContext> getAMDDevices(int index)
 		{
 			GpuContext ctx;
 			std::vector<char> devNameVec(1024);
-			size_t maxMem;
-			if( devVendor.find("NVIDIA Corporation") != std::string::npos)
-				ctx.isNVIDIA = true;
+
+			ctx.isNVIDIA = isNVIDIADevice;
+			ctx.isAMD = isAMDDevice;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
@@ -713,7 +713,7 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
-			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
@@ -726,8 +726,8 @@ std::vector<GpuContext> getAMDDevices(int index)
 			}
 
 			// the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation
-			if(ctx.isNVIDIA)
-				maxMem = ctx.freeMem;
+			if(isNVIDIADevice)
+				ctx.maxMemPerAlloc = ctx.freeMem;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
@@ -746,7 +746,6 @@ std::vector<GpuContext> getAMDDevices(int index)
 			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
 			ctx.deviceIdx = k;
-			ctx.freeMem = std::min(ctx.freeMem, maxMem);
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
 			ctx.interleave = 40;
...
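
As context for the isNVIDIADevice/isAMDDevice flags the hunk above assigns, here is a self-contained sketch of vendor detection with the stock OpenCL API. The exact vendor strings matched are an assumption for illustration, not taken from this commit:

#include <CL/cl.h>
#include <string>
#include <vector>

int main()
{
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);

    // enumerate all GPU devices on the first platform
    cl_uint numDevices = 0;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
    std::vector<cl_device_id> devices(numDevices);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices.data(), NULL);

    for(cl_device_id dev : devices)
    {
        std::vector<char> vendorVec(256);
        clGetDeviceInfo(dev, CL_DEVICE_VENDOR, vendorVec.size(), vendorVec.data(), NULL);
        std::string devVendor(vendorVec.data());

        // mirrors what the diff stores into ctx.isNVIDIA / ctx.isAMD;
        // the matched substrings are assumptions, not the commit's code
        bool isNVIDIADevice = devVendor.find("NVIDIA Corporation") != std::string::npos;
        bool isAMDDevice = devVendor.find("Advanced Micro Devices") != std::string::npos;
        (void)isNVIDIADevice; (void)isAMDDevice; // silence unused warnings in this sketch
    }
    return 0;
}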
@@ -40,6 +40,7 @@ struct GpuContext
 	int memChunk;
 	int unroll = 0;
 	bool isNVIDIA = false;
+	bool isAMD = false;
 	int compMode;
 
 	/*Output vars*/
@@ -51,6 +52,7 @@ struct GpuContext
 	cl_program Program[2];
 	cl_kernel Kernels[2][8];
 	size_t freeMem;
+	size_t maxMemPerAlloc;
 	int computeUnits;
 	std::string name;
 	std::shared_ptr<InterleaveData> interleaveData;
...
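
The two struct members involved here map onto two standard clGetDeviceInfo queries. A minimal sketch, assuming a valid cl_device_id dev and assuming (as the surrounding code suggests) that freeMem is filled from CL_DEVICE_GLOBAL_MEM_SIZE; note the OpenCL spec types CL_DEVICE_MAX_MEM_ALLOC_SIZE as cl_ulong, while the struct stores a size_t:

#include <CL/cl.h>

void queryMemoryLimits(cl_device_id dev)
{
    cl_ulong globalMem = 0; // total device memory, presumably the source for freeMem
    cl_ulong maxAlloc = 0;  // largest single buffer, the source for maxMemPerAlloc
    clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalMem), &globalMem, NULL);
    clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlloc), &maxAlloc, NULL);
    // the OpenCL spec only guarantees maxAlloc >= max(globalMem / 4, 128 MiB);
    // NVIDIA's runtime allows allocating the full memory in one buffer, hence
    // the special case in the getAMDDevices() diff above
}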
@@ -158,14 +158,29 @@ private:
 		if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 			maxThreads *= 2u;
 
-		// keep 128MiB memory free (value is randomly chosen)
-		size_t availableMem = ctx.freeMem - minFreeMem;
+		// keep 128MiB memory free (value is randomly chosen) from the max available memory
+		const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;
+
+		size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);
+
+		uint32_t numThreads = 1u;
+		if(ctx.isAMD)
+		{
+			numThreads = 2;
+			size_t memDoubleThread = maxAvailableFreeMem / numThreads;
+			memPerThread = std::min(memPerThread, memDoubleThread);
+		}
+
 		// 224byte extra memory is used per thread for meta data
 		size_t perThread = hashMemSize + 224u;
-		size_t maxIntensity = availableMem / perThread;
+		size_t maxIntensity = memPerThread / perThread;
 		size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 		// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 		size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
+		// if we use two threads per gpu we can relax this and need no multiple of the compute unit count
+		if(numThreads == 2)
+			intensity = (possibleIntensity / 8) * 8;
 		//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
 		if (intensity == 0)
 		{
@@ -175,18 +190,22 @@ private:
 		}
 
 		if (intensity != 0)
 		{
-			conf += std::string("  // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
-			conf += std::string("  // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
-			// set 8 threads per block (this is a good value for most GPUs)
-			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-				"    \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
-				"  },\n";
+			for(uint32_t thd = 0; thd < numThreads; ++thd)
+			{
+				conf += "  // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+				conf += "  // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
+					std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
+				// set 8 threads per block (this is a good value for most GPUs)
+				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					"    \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+					"  },\n";
+			}
 		}
 		else
 		{
-			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(memPerThread / byteToMiB).c_str());
 		}
 	}
...
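
A worked example of the new intensity math, with every device number assumed purely for illustration (an 8 GiB AMD card reporting a 2 GiB max single allocation, 64 compute units, the 2 MiB cryptonight scratchpad, and an arbitrary maxThreads cap of 2048):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const size_t byteToMiB      = 1024u * 1024u;
    const size_t freeMem        = 8192u * byteToMiB; // assumed total GPU memory
    const size_t maxMemPerAlloc = 2048u * byteToMiB; // assumed CL max alloc (1/4 rule)
    const size_t minFreeMem     = 128u * byteToMiB;  // safety margin from the diff
    const size_t hashMemSize    = 2u * byteToMiB;    // cryptonight scratchpad
    const size_t computeUnits   = 64u;               // assumed
    const size_t maxThreads     = 2048u;             // assumed upper bound

    const size_t maxAvailableFreeMem = freeMem - minFreeMem;             // 8064 MiB
    size_t memPerThread = std::min(maxMemPerAlloc, maxAvailableFreeMem); // 2048 MiB
    const uint32_t numThreads = 2u;                                      // AMD path
    memPerThread = std::min(memPerThread, maxAvailableFreeMem / numThreads); // still 2048 MiB

    const size_t perThread = hashMemSize + 224u;          // scratchpad + meta data
    const size_t maxIntensity = memPerThread / perThread; // 1023
    const size_t possibleIntensity = std::min(maxThreads, maxIntensity);  // 1023

    // old one-thread rounding: multiple of 8 * computeUnits
    const size_t strictIntensity = (possibleIntensity / (8u * computeUnits)) * computeUnits * 8u; // 512
    // relaxed two-thread rounding: multiple of the work size only
    const size_t intensity = (possibleIntensity / 8u) * 8u; // 1016
    std::printf("%u threads, intensity %zu each (single-thread rounding would give %zu)\n",
        numThreads, intensity, strictIntensity);
    return 0;
}

This illustrates the design choice behind the relaxed rounding: with two threads already keeping the compute units busy, rounding 1023 down to a multiple of 8 yields 1016 per thread instead of 512.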