From ed2168b48d16a9870cbef067d38a5ad16b26c9f9 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 10 Oct 2018 11:52:40 +0200
Subject: [PATCH] CUDA: fix invalid results

If `comp_mode` is false, the results on a Windows platform will be invalid.
The reason is that `ulong4` is 16 bytes on Windows but 32 bytes on Linux.
Use `ulonglong4` for the wide memory copies instead.

Thanks to @xmrig for finding and solving the issue.

fix #1873
---
 xmrstak/backend/nvidia/autoAdjust.hpp         | 2 +-
 xmrstak/backend/nvidia/config.tpl             | 2 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 6354f60..27783ac 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -96,7 +96,7 @@ private:
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
-					"    \"comp_mode\" : true,\n" +
+					"    \"comp_mode\" : false,\n" +
 					"  },\n";
 			}
 		}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index e2a76d9..8803f6f 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -17,7 +17,7 @@ R"===(// generated by XMRSTAK_VERSION
  *                 2 = cudaDeviceScheduleYield
  *                 3 = cudaDeviceScheduleBlockingSync (default)
  * comp_mode     - Compatibility if true it will use 64bit memory loads and if false it will use
- *                               128bit memory loads (can produce invalid results)
+ *                               256bit memory loads (can produce invalid results)
  *                               (this option has only a meaning for cryptonight_v8 and monero)
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 1c9c9df..3dce3e4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -343,7 +343,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
 		uint32_t idx1 = (idx0 & 0x30) >> 3;
 
@@ -381,7 +381,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
@@ -396,7 +396,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
 		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
@@ -461,7 +461,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
 		ax0 ^= c;
 		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 	}
-- 
GitLab