diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 4e34e75a9b1129e5f663076995679b83c6a547aa..5638147026a3e3127e92bb7079ed470a52f8f316 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -215,10 +215,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	__shared__ uint32_t sharedMemory[1024];
 
 	cn_aes_gpu_init( sharedMemory );
 
-	__shared__ uint32_t RCP[256];
-	for (int i = threadIdx.x; i < 256; i += blockDim.x)
+	uint32_t* RCP;
+	if(ALGO == cryptonight_monero_v8)
 	{
-		RCP[i] = RCP_C[i];
+		__shared__ uint32_t RCP_shared[256];
+		for (int i = threadIdx.x; i < 256; i += blockDim.x)
+		{
+			RCP_shared[i] = RCP_C[i];
+		}
+		RCP = RCP_shared;
 	}
 
@@ -268,14 +273,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		}
 	}
 
-	uint32_t bx1, division_result, sqrt_result;
+	uint32_t bx1, sqrt_result;
+	uint64_t division_result;
 	if(ALGO == cryptonight_monero_v8)
 	{
 		d[1] = (d_ctx_b + thread * 12)[sub];
 		bx1 = (d_ctx_b + thread * 12 + 4)[sub];
 
 		// must be valid only for `sub < 2`
-		division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
+		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
 		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
 	}
 	else
@@ -415,21 +421,17 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		{
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
-
-			uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
-			((uint32_t*)&division_result_64)[1] ^= sqrt_result;
+			((uint32_t*)&division_result)[1] ^= sqrt_result;
 
 			if(sub < 2)
-				*((uint64_t*)yy) ^= division_result_64;
+				*((uint64_t*)yy) ^= division_result;
 
 			const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
 			const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
-			const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd);
-
-			division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
-
+			division_result = fast_div_v2(RCP, cx1, dd);
+
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
-			sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp);
+			sqrt_result = fast_sqrt_v2(cx0 + division_result);
 		}
 
 		uint32_t zz[2];
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 41ec70e1c471d53fc1ef6e6e4f69985161d02af5..2a25a9c073cb221138b9098561fb33746cae0a7d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -71,11 +71,11 @@ __device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a,
 	q[1] = (k < a) ? 1 : 0;
 
 	const int64_t tmp = a - *((uint64_t*)(q)) * b;
-	const bool overshoot = (tmp < 0);
-	const bool undershoot = (tmp >= b);
+	const uint32_t overshoot = (tmp < 0) ? 1U : 0U;
+	const uint32_t undershoot = (tmp >= b) ? 1U : 0U;
 
-	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
+	q[0] += undershoot - overshoot;
 	q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? b : 0U);
 
 	return *((uint64_t*)(q));
 }
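For reference, the sketch below illustrates why the 0/1-flag rewrite of the fix-up in `fast_div_v2` is behavior-equivalent to the `bool` version it replaces: with `overshoot`/`undershoot` held as 0-or-1 `uint32_t` values, the quotient correction collapses to a single branchless add, and unsigned wraparound makes the remainder fix-up come out right even when the intermediate goes negative. This is a minimal host-side C++ sketch, not part of the patch or the kernel; the helper name `div_correct_ref` and the quotient-low/remainder-high packing of the return value are assumptions made to mirror the `uint32_t q[2]` layout used above.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical host-side reference for the correction step in fast_div_v2.
// Takes a quotient estimate `q_est` that may be off by one in either
// direction and repairs it branchlessly, packing the corrected quotient
// into the low 32 bits and the remainder into the high 32 bits.
static uint64_t div_correct_ref(uint64_t a, uint32_t b, uint32_t q_est)
{
	// Signed remainder of the estimate: negative means the estimate overshot.
	const int64_t tmp = (int64_t)(a - (uint64_t)q_est * b);

	// 0/1 flags, as in the patched kernel, so the quotient fix-up is a
	// single branchless add: q += undershoot - overshoot.
	const uint32_t overshoot  = (tmp < 0) ? 1U : 0U;
	const uint32_t undershoot = (tmp >= b) ? 1U : 0U;

	const uint32_t q = q_est + undershoot - overshoot;
	// On overshoot, (uint32_t)tmp has wrapped; adding b wraps it back.
	const uint32_t r = (uint32_t)tmp + (overshoot ? b : 0U) - (undershoot ? b : 0U);

	return ((uint64_t)r << 32) | q;
}

int main()
{
	// 1000 / 7 = 142 remainder 6; probe estimates that undershoot,
	// hit exactly, and overshoot by one.
	for (uint32_t est : {141U, 142U, 143U})
	{
		const uint64_t packed = div_correct_ref(1000, 7, est);
		std::printf("est=%u -> q=%u r=%u\n",
			est, (uint32_t)packed, (uint32_t)(packed >> 32));
	}
	return 0;
}
```

All three estimates should print `q=142 r=6`, matching the claim that the flag form and the original `bool` form compute the same quotient and remainder.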