From cff6b6cbfbb3da44d85753885466de5122e20472 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 12 Feb 2018 20:39:49 +0100
Subject: [PATCH] add OpenCL compatibility mode

- add a new option `comp_mode` to the amd config
- disable the `if` guards within the OpenCL kernels when `comp_mode : false`
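
With `comp_mode : true` a thread entry in the amd config looks like this
(a minimal sketch; the intensity/worksize values are illustrative, not tuned):

    { "index" : 0, "intensity" : 1000, "worksize" : 8,
      "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 4,
      "comp_mode" : true },

If `comp_mode : false` is set, the `if(gIdx < Threads)` range checks are
compiled out of the kernels and the intensity must be a multiple of the
worksize, otherwise the miner can crash.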
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp           | 17 +++++++++++------
 xmrstak/backend/amd/amd_gpu/gpu.hpp           |  1 +
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 19 ++++++++++++++++---
 xmrstak/backend/amd/autoAdjust.hpp            |  3 ++-
 xmrstak/backend/amd/config.tpl                | 12 ++++++++----
 xmrstak/backend/amd/jconf.cpp                 | 10 ++++++++--
 xmrstak/backend/amd/jconf.hpp                 |  1 +
 xmrstak/backend/amd/minethd.cpp               |  1 +
 8 files changed, 48 insertions(+), 16 deletions(-)
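
The setting reaches the kernels as a preprocessor define (-DCOMP_MODE=0/1),
so a disabled guard has no runtime cost. The guard pattern repeated in
cryptonight.cl is:

    #if(COMP_MODE==1)
    	// do not use early return here
    	if(gIdx < Threads)
    #endif
    	{
    		// ... kernel section ...
    	}

On the host side XMRRunJob rounds the global thread count up to the next
multiple of the work size only when comp_mode is active; otherwise g_thd
stays at the raw intensity.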

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 054ffc4..2f16b67 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	char options[256];
 	snprintf(options, sizeof(options), 
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d",
-		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk));
+		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d -DCOMP_MODE=%d",
+		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0);
 	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
 	if(ret != CL_SUCCESS)
 	{
@@ -873,10 +873,15 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 
 	size_t g_intensity = ctx->rawIntensity;
 	size_t w_size = ctx->workSize;
-	// round up to next multiple of w_size
-	size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
-	// number of global threads must be a multiple of the work group size (w_size)
-	assert(g_thd%w_size == 0);
+	size_t g_thd = g_intensity;
+
+	if(ctx->compMode)
+	{
+		// round up to the next multiple of w_size, e.g. 1000 -> 1008 for w_size 16
+		g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
+		// number of global threads must be a multiple of the work group size (w_size)
+		assert(g_thd%w_size == 0);
+	}
 
 	for(int i = 2; i < 6; ++i)
 	{
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index abfad5c..8fb7168 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -26,6 +26,7 @@ struct GpuContext
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
+	int compMode;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 53299ec..4bac68c 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -451,8 +451,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 		
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		states += 25 * gIdx;
 
@@ -483,9 +485,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	}
 
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
-
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		#pragma unroll
 		for(int i = 0; i < 25; ++i) states[i] = State[i];
@@ -499,9 +502,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	}
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
-
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		#pragma unroll 2
 		for(int i = 0; i < (ITERATIONS >> 5); ++i)
@@ -536,9 +540,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	uint4 b_x;
-
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
@@ -559,8 +564,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		#pragma unroll 8
 		for(int i = 0; i < ITERATIONS; ++i)
@@ -612,8 +619,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
@@ -641,8 +650,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		#pragma unroll 2
 		for(int i = 0; i < (ITERATIONS >> 5); ++i)
@@ -659,8 +670,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 	barrier(CLK_GLOBAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		if(!get_local_id(1))
 		{
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index b88d3ee..8d60b94 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -143,7 +143,8 @@ private:
 				// set 8 threads per block (this is a good value for the most gpus)
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4\n"
+					"    \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4,\n"
+					"    \"comp_mode\" : true\n" +
 					"  },\n";
 			}
 			else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 8914130..84251c7 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,9 +1,9 @@
 R"===(
 /*
  * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
- *      index    - GPU index number usually starts from 0
- *  intensity    - Number of parallel GPU threads (nothing to do with CPU threads)
- *   worksize    - Number of local GPU threads (nothing to do with CPU threads)
+ * index         - GPU index number usually starts from 0
+ * intensity     - Number of parallel GPU threads (nothing to do with CPU threads)
+ * worksize      - Number of local GPU threads (nothing to do with CPU threads)
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
  * strided_index - switch memory pattern used for the scratch pad memory
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
@@ -13,9 +13,13 @@ R"===(
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
  *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * comp_mode     - Compatibility mode: enables/disables the automatic guard around the compute kernels.
+ *                 The guard allows using an intensity that is not a multiple of the worksize.
+ *                 If set to false and the intensity is not a multiple of the worksize, the miner can crash:
+ *                 in that case set the intensity to a multiple of the worksize or enable comp_mode.
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4 },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 4, "comp_mode" : true },
  * ],
  * If you do not wish to mine with your AMD GPU(s) then use:
  * "gpu_threads_conf" :
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 22381e1..93ba709 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,15 +106,17 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	stridedIndex = GetObjectMember(oThdConf, "strided_index");
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
+	compMode = GetObjectMember(oThdConf, "comp_mode");
 
-	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr || memChunk == nullptr)
+	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr ||
+		stridedIndex == nullptr || memChunk == nullptr || compMode == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -148,9 +150,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
+	if(!compMode->IsBool())
+		return false;
+
 	cfg.index = idx->GetUint64();
 	cfg.w_size = w_size->GetUint64();
 	cfg.intensity = intensity->GetUint64();
+	cfg.compMode = compMode->GetBool();
 
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 91e5d0d..580b69f 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -28,6 +28,7 @@ public:
 		long long cpu_aff;
 		int stridedIndex;
 		int memChunk;
+		bool compMode;
 	};
 
 	size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index ca5e163..8dfbce5 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -98,6 +98,7 @@ bool minethd::init_gpus()
 		vGpuData[i].workSize = cfg.w_size;
 		vGpuData[i].stridedIndex = cfg.stridedIndex;
 		vGpuData[i].memChunk = cfg.memChunk;
+		vGpuData[i].compMode = cfg.compMode;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
-- 
GitLab