From d035dbc160de3df3a800e872c37453b1d277db2b Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 10 Sep 2018 08:35:00 +0200
Subject: [PATCH] NVIDIA: cryptonight_v8

implement `cryptonight_v8`
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 164 +++++++++++++++++-
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |  25 ++-
 2 files changed, 184 insertions(+), 5 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 6c64751..3e62792 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -194,6 +194,31 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #endif
 }
 
+template<size_t group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2)
+{
+	uint64_t tmp;
+	((uint32_t*)&tmp)[0] = shuffle<group_n>(ptr, sub, val, src);
+	((uint32_t*)&tmp)[1] = shuffle<group_n>(ptr, sub, val, src2);
+	return tmp;
+}
+
+__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0)
+{
+	uint64_t x = (n0 >> 12) + (1023ULL << 52);
+	const double xx = sqrt( *reinterpret_cast<double*>(&x) );
+	uint64_t r = *reinterpret_cast<const uint64_t*>(&xx);
+
+	const uint64_t s = r >> 20;
+	r >>= 19;
+
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+
+	if (x2 < n0) ++r;
+
+	return r;
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
@@ -227,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
-	uint32_t a, d[2], idx0;
+	uint32_t a, a1, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
 	uint32_t tweak1_2[2];
@@ -250,7 +275,19 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			idx0 = *(d_ctx_b + threads * 4 + thread);
 		}
 	}
-	d[1] = (d_ctx_b + thread * 4)[sub];
+
+	uint32_t bx1, division_result, sqrt_result;
+	if(ALGO == cryptonight_monero_v8)
+	{
+		d[1] = (d_ctx_b + thread * 12)[sub];
+		bx1 = (d_ctx_b + thread * 12 + 4)[sub];
+
+		// must be valid only for `sub < 2`
+		division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2];
+	}
+	else
+		d[1] = (d_ctx_b + thread * 4)[sub];
 
 #pragma unroll 2
 	for ( i = start; i < end; ++i )
@@ -296,6 +333,10 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
 			const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
 			const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
+			if(ALGO == cryptonight_monero_v8)
+			{
+				a1 = a;
+			}
 			d[x] = a ^
 				t_fn0( x_0 & 0xff ) ^
 				t_fn1( (x_1 >> 8) & 0xff ) ^
@@ -303,6 +344,33 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 				t_fn2( (x_2 >> 16) & 0xff ) ^
 				t_fn3( ( x_3 >> 24 ) );
 		}
+		// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
+		if(ALGO == cryptonight_monero_v8)
+		{
+			// Shuffle constants here were chosen carefully
+			// to maximize permutation cycle length
+			// and have no 2-byte elements stay in their places
+			const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) );
+			const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) );
+			const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) );
+
+			uint32_t src = sub & 2;
+			const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
+			const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
+			const uint64_t cc3 = bx1_64 + chunk3_64;
+			storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
+
+			const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
+			const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
+			const uint64_t cc1 = bx0_64 + chunk1_64;
+			storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
+
+			const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
+			const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
+			const uint64_t cc2 = ax0_64 + chunk2_64;
+			storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
+
+		}
 		//XOR_BLOCKS_DST(c, b, &long_state[j]);
 		t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
 
@@ -331,10 +399,76 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 		uint32_t yy[2];
 		*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
+
+		if(ALGO == cryptonight_monero_v8 )
+		{
+			const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1);
+
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
+
+
+			const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
+			const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32);
+
+			if(sub < 2)
+				*((uint64_t*)yy) ^= cl_rhs;
+
+
+			const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL;
+
+			// Most and least significant bits in the divisor are set to 1
+			// to make sure we don't divide by a small or even number,
+			// so there are no shortcuts for such cases
+			//
+			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+			// We drop the highest bit to fit both quotient and remainder in 32 bits
+
+			// Compiler will optimize it to a single div instruction
+			const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
+
+
+			const uint64_t division_result_tmp = static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32);
+
+			division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
+
+			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp);
+			sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2];
+		}
+
 		uint32_t zz[2];
 		zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
 		zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
-
+		// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
+		if(ALGO == cryptonight_monero_v8)
+		{
+			// Shuffle constants here were chosen carefully
+			// to maximize permutation cycle length
+			// and have no 2-byte elements stay in their places
+			const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) );
+			const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) );
+			const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) );
+
+			uint32_t src = sub & 2;
+			const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
+			const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
+			const uint64_t cc3 = bx1_64 + chunk3_64;
+			storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
+
+
+
+			const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
+			const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
+			const uint64_t cc1 = bx0_64 + chunk1_64;
+			storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
+
+			const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
+			const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
+			const uint64_t cc2 = ax0_64 + chunk2_64;
+			storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
+		}
 		t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
 		#pragma unroll
 		for ( k = 0; k < 2; k++ )
@@ -384,13 +518,31 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 			idx0 = (~d) ^ q;
 		}
+		if(ALGO == cryptonight_monero_v8)
+		{
+			bx1 = d[(x + 1) % 2];
+		}
 		}
 	}
 
 	if ( bfactor > 0 )
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
-		(d_ctx_b + thread * 4)[sub] = d[1];
+		if(ALGO == cryptonight_monero_v8)
+		{
+			(d_ctx_b + thread * 12)[sub] = d[1];
+			(d_ctx_b + thread * 12 + 4)[sub] = bx1;
+
+			if(sub < 2)
+			{
+				// must be valid only for `sub < 2`
+				(d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result;
+				(d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result;
+			}
+		}
+		else
+			(d_ctx_b + thread * 4)[sub] = d[1];
+
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
@@ -534,6 +686,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
 	}
+	else if(miner_algo == cryptonight_monero_v8)
+	{
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8>(ctx, startNonce);
+	}
 	else if(miner_algo == cryptonight_heavy)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index b455f55..1ea54dd 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+	if(ALGO == cryptonight_monero_v8)
+	{
+		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
+		// bx1
+		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
+		memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 );
+		// division_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 );
+		// sqrt_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+	}
+	else
+		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
 	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
 
@@ -298,6 +310,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
+	else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	{
+		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
+		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
+	}
 	else
 		ctx->d_ctx_state2 = ctx->d_ctx_state;
 
@@ -340,6 +358,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	if(miner_algo == cryptonight_monero_v8)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
-- 
GitLab
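
Note, illustrative and not part of the patch: for cryptonight_monero_v8 the per-thread ctx_b record grows from 4 to 12 uint32_t words (ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize), and both kernels address it with raw offsets (thread * 12, + 4, + 4 * 2, + 4 * 2 + 2). A hypothetical struct view of that layout, with field names of my choosing, just to make the offsets readable:

#include <cstdint>

// Hypothetical view of one thread's 12-word ctx_b record used by the
// cryptonight_monero_v8 kernels above; offsets mirror thread * 12 + ...
struct ctx_b_v8
{
	uint32_t bx0[4];             // offset 0:  current block b,  (d_ctx_b + thread * 12)[sub]
	uint32_t bx1[4];             // offset 4:  previous block b, (d_ctx_b + thread * 12 + 4)[sub]
	uint32_t division_result[2]; // offset 8:  64-bit division result, valid only for sub < 2
	uint32_t sqrt_result[2];     // offset 10: 64-bit sqrt result, valid only for sub < 2
};

static_assert(sizeof(ctx_b_v8) == 12 * sizeof(uint32_t), "matches ctx_b_size = 3 * 4 * sizeof(uint32_t) per thread");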
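
Also illustrative: the division step in phase2 keeps the quotient and the remainder of one 64/32-bit division in a single 64-bit word (static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32)), with the divisor forced odd and >= 2^31 by `| 0x80000001UL`. A host-side sketch of the same construction; the helper names are mine, not from the patch:

#include <cstdint>

// Low 32 bits of (cx0 + 2 * sqrt_result), with bit 31 and bit 0 forced to 1,
// so the divisor is odd and at least 2^31 (mirrors `dd` in the kernel above).
static inline uint32_t make_divisor(uint64_t cx0, uint64_t sqrt_result_64)
{
	return static_cast<uint32_t>(cx0 + (sqrt_result_64 << 1)) | 0x80000001UL;
}

// The quotient fits in at most 33 bits; its top bit is dropped so the quotient
// and the remainder can share one 64-bit word, exactly as in the kernel.
static inline uint64_t pack_division_result(uint64_t cx1, uint32_t dd)
{
	const uint32_t quotient  = static_cast<uint32_t>(cx1 / dd);
	const uint64_t remainder = cx1 % dd;
	return static_cast<uint64_t>(quotient) + (remainder << 32);
}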
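
Finally, a sketch of the lane layout behind shuffle64: with four threads per hash (sub = 0..3), each holding one 32-bit quarter of a 128-bit value, shuffle64<4>(sPtr, sub, v, src, src | 1) with src = sub & 2 yields the low 64-bit half for sub 0/1 and the high half for sub 2/3. Host-side equivalent for illustration only (the helper name is hypothetical):

#include <cstdint>

// lane[src] becomes the low word and lane[src | 1] the high word, the same
// order shuffle64 uses; src = 0 selects the low 64-bit half, src = 2 the high half.
static inline uint64_t lanes_to_u64(const uint32_t lane[4], uint32_t src)
{
	return static_cast<uint64_t>(lane[src]) | (static_cast<uint64_t>(lane[src | 1]) << 32);
}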