From 659918f26bf07a49059417735f02626545ca1f36 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 19 Sep 2018 20:50:32 +0200
Subject: [PATCH] NVIDIA: optimize div and sqrt

- use optimized div and sqrt
- reduce memory footprint
---
 xmrstak/backend/amd/jconf.cpp                 |  2 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 62 ++++++-------------
 2 files changed, 19 insertions(+), 45 deletions(-)

diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 777dbdb..fb1a04b 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -151,7 +151,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 
 	cfg.memChunk = (int)memChunk->GetInt64();
 
-	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || )
+	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128)
 	{
 		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two");
 		return false;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 1273f89..4e34e75 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -7,6 +7,8 @@
 #include <cuda_runtime.h>
 #include "xmrstak/jconf.hpp"
 
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+
 #ifdef _WIN32
 #include <windows.h>
 
@@ -203,22 +205,6 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3
 	return tmp;
 }
 
-__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0)
-{
-	uint64_t x = (n0 >> 12) + (1023ULL << 52);
-	const double xx = sqrt( *reinterpret_cast<double*>(&x) );
-	uint64_t r = *reinterpret_cast<const uint64_t*>(&xx);
-
-	const uint64_t s = r >> 20;
-	r >>= 19;
-
-	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
-
-	if (x2 < n0) ++r;
-
-	return r;
-}
-
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
@@ -229,6 +215,12 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	__shared__ uint32_t sharedMemory[1024];
 
 	cn_aes_gpu_init( sharedMemory );
 
+	__shared__ uint32_t RCP[256];
+	for (int i = threadIdx.x; i < 256; i += blockDim.x)
+	{
+		RCP[i] = RCP_C[i];
+	}
+
 	__syncthreads( );
 
@@ -284,7 +276,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			// must be valid only for `sub < 2`
 			division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
-			sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2];
+			sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
 		}
 		else
 			d[1] = (d_ctx_b + thread * 4)[sub];
@@ -421,39 +413,23 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 		if(ALGO == cryptonight_monero_v8 )
 		{
-			const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1);
-
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
-
-			const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
-			const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32);
-
+			uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
+			((uint32_t*)&division_result_64)[1] ^= sqrt_result;
+
 			if(sub < 2)
-				*((uint64_t*)yy) ^= cl_rhs;
-
-
-			const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL;
+				*((uint64_t*)yy) ^= division_result_64;
 
-			// Most and least significant bits in the divisor are set to 1
-			// to make sure we don't divide by a small or even number,
-			// so there are no shortcuts for such cases
-			//
-			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
-			// We drop the highest bit to fit both quotient and remainder in 32 bits
-
-			// Compiler will optimize it to a single div instruction
+			const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
 			const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
-
-
-			const uint64_t division_result_tmp = static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32);
+			const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd);
 			division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
 
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
-			const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp);
-			sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2];
+			sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp);
 		}
 
 		uint32_t zz[2];
@@ -706,7 +682,6 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
-
 	if(miner_algo == cryptonight_monero)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
@@ -745,11 +720,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
 	}
-
 }
-- 
GitLab
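Note: the patch swaps the plain 64/32-bit division and the double-precision integer sqrt for fast_div_v2()/fast_sqrt_v2(); those functions and the RCP_C reciprocal table live in the new cuda_fast_int_math_v2.hpp header, which is not part of this diff. As a reference for validating the optimized kernel, below is a minimal host-side C++ sketch of the CryptonightV8 division/sqrt step, built only from the code removed above. reference_div() is a hypothetical helper name, the unused `int i` parameter of int_sqrt33_1_double_precision() is dropped, and the inputs in main() are arbitrary sample values used solely to exercise the functions.

// Host-side reference sketch of the CryptonightV8 division/sqrt step replaced by
// fast_div_v2()/fast_sqrt_v2() in the patch above. Formulas copied from the
// removed device code; sample inputs are arbitrary (illustration only).
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Integer square root via a double-precision sqrt plus an exact correction step,
// as removed from cuda_core.cu (unused `int i` parameter dropped here).
static uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
{
	uint64_t x = (n0 >> 12) + (1023ULL << 52);
	double xd;
	std::memcpy(&xd, &x, sizeof(xd));  // reinterpret the bits as a double
	const double xx = std::sqrt(xd);
	uint64_t r;
	std::memcpy(&r, &xx, sizeof(r));   // back to the raw bit pattern

	const uint64_t s = r >> 20;
	r >>= 19;

	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
	if (x2 < n0)
		++r;

	return r;
}

// Division step: most and least significant bits of the divisor are forced to 1
// so it is never small or even; the quotient (highest bit dropped) and the
// remainder are packed into one 64-bit word, as in the removed device code.
// reference_div is a hypothetical name used only in this sketch.
static uint64_t reference_div(const uint64_t cx1, const uint32_t cx0_lo, const uint32_t sqrt_result)
{
	const uint32_t dd = (cx0_lo + (sqrt_result << 1)) | 0x80000001UL;
	return static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32);
}

int main()
{
	// Arbitrary sample inputs (not taken from a real hash round).
	const uint64_t cx0 = 0x0123456789abcdefULL;
	const uint64_t cx1 = 0xfedcba9876543210ULL;
	const uint32_t sqrt_result = 0x1234abcdu;

	const uint64_t division_result = reference_div(cx1, static_cast<uint32_t>(cx0), sqrt_result);
	const uint64_t next_sqrt = int_sqrt33_1_double_precision(cx0 + division_result);

	std::printf("division_result = %016llx\n", static_cast<unsigned long long>(division_result));
	std::printf("sqrt_result     = %016llx\n", static_cast<unsigned long long>(next_sqrt));
	return 0;
}

A sketch like this can be compiled on the host and its division_result/sqrt_result values compared against the output of the optimized device path (or a CPU miner) when checking that the fast_div_v2()/fast_sqrt_v2() replacements keep the hash unchanged.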