From 3f6bd5a25e5b6c0e22a99f5d0b296be61b766a63 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 1 Feb 2019 22:30:33 +0100
Subject: [PATCH] CUDA: optimze cn_gpu auto suggestion

optimize the algorithm for cryptonight_gpu autosuggestion
---
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 45 +++++++++++++++----
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index e4574e2..a37ecc8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -593,6 +593,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 	}
 
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+	bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
+
+
 	// set all device option those marked as auto (-1) to a valid value
 	if(ctx->device_blocks == -1)
 	{
@@ -600,8 +604,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 *   - 3 * SMX count for >=sm_30
 		 *   - 2 * SMX count for  <sm_30
 		 */
-		ctx->device_blocks = props.multiProcessorCount *
-			( props.major < 3 ? 2 : 3 );
+		ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3);
+
+		// use 6 blocks per SM for sm_2X else 8 blocks
+		if(useCryptonight_gpu)
+			ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 6 : 8);
 
 		// increase bfactor for low end devices to avoid that the miner is killed by the OS
 		if(props.multiProcessorCount <= 6)
@@ -613,7 +620,16 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts
 		 * `8 * ctx->device_threads` threads per block
 		 */
-		ctx->device_threads = 64;
+		const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024;
+
+		// for the most algorithms we are using 8 threads per hash
+		uint32_t threadsPerHash = 8;
+
+		// phase2_gpu uses 16 threads per hash
+		if(useCryptonight_gpu)
+			threadsPerHash = 16;
+
+		ctx->device_threads = maxThreadsPerBlock / threadsPerHash;
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
 		// no limit by default 1TiB
@@ -678,8 +694,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		ctx->total_device_memory = totalMemory;
 		ctx->free_device_memory = freeMemory;
 
-		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
-
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
@@ -725,10 +739,9 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		// use only odd number of threads
 		ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
 
-		if(props.major == 2 && ctx->device_threads > 64)
+		if(ctx->device_threads > maxThreadsPerBlock / threadsPerHash)
 		{
-			// Fermi gpus only support 512 threads per block (we need start 4 * configured threads)
-			ctx->device_threads = 64;
+			ctx->device_threads = maxThreadsPerBlock / threadsPerHash;
 		}
 
 		// check if cryptonight_monero_v8 is selected for the user pool
@@ -749,6 +762,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 				ctx->device_blocks = blockOptimal;
 			}
 		}
+		else if(useCryptonight_gpu)
+		{
+			// 8 based on my profiling sessions maybe it must be adjusted later
+			size_t threads = 8;
+			// 8 is chosen by checking the occupancy calculator
+			size_t blockOptimal = 8 * ctx->device_mpcount;
+			if(gpuArch >= 70)
+				blockOptimal = 5 *  ctx->device_mpcount;
+			
+			if(blockOptimal * threads * hashMemSize < limitedMemory)
+			{
+				ctx->device_threads = threads;
+				ctx->device_blocks = blockOptimal;
+			}
+
+		}
 	}
 	printf("device init succeeded\n");
 
-- 
GitLab