CUDA: tune cryptonight_v8

Read memory in bigger chunks per thread to increase the used memory bandwidth. For Kepler and Fermi GPUs, use the old autosuggestion instead of the new settings for cryptonight_v8.

CUDA: tune cryptonight_v8
Read memory in bigger chunks per thread to increase the used memory bandwidth. For Kepler and Fermi GPUs, use the old autosuggestion instead of the new settings for cryptonight_v8.
99a12cb6 · psychocrypt · 98554a0f · 99a12cb6 · 99a12cb6
Commit 99a12cb6 authored 6 years ago by psychocrypt
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -329,11 +329,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	{
 		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];

-		#pragma unroll 4
-		for(int x = 0; x < 8; x += 2)
-		{
-			myChunks[x + sub] = ptr0[ x + sub ];
-		}
+		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];

 		uint32_t idx1 = (idx0 & 0x30) >> 3;

@@ -362,17 +358,13 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		}

 		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
-		for(int x = 0; x < 8; x += 2)
-			ptr0[ x + sub ] = myChunks[x + sub];
+		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];

 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
 		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
-		#pragma unroll 4
-		for(int x = 0; x < 8; x += 2)
-		{
-			myChunks[x + sub] = ptr0[ x + sub ];
-		}
+
+		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];

 		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
@@ -428,10 +420,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			bx0 = cx_aes;
 		} 
 		myChunks[ idx1 + sub ] = ax0;
-		for(int x = 0; x < 8; x += 2)
-		{
-			ptr0[ x + sub ] = myChunks[x + sub];
-		}
+		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
 		ax0 ^= c;
 		idx0 = shuffle<2>(sPtr, sub, ax0.x, 0);
 	}

--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -692,8 +692,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8;

-		// overwrite default config if cryptonight_monero_v8 is mined
-		if(useCryptonight_v8)
+		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
+		if(useCryptonight_v8 && gpuArch >= 50)
 		{
 			// 4 based on my test maybe it must be adjusted later
 			size_t threads = 4;