Skip to content
Snippets Groups Projects
Commit fd27561b authored by psychocrypt's avatar psychocrypt
Browse files

NVIDIA: optimize v8

- fix issue where the shared memory for the fast division was always allocated, even when the selected algorithm does not use it
- optimize fast div algo
- store `division_result` (64-bit) per thread instead of shuffling it around and storing it as two 32-bit halves
parent 659918f2
No related branches found
No related tags found
No related merge requests found
...@@ -215,10 +215,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -215,10 +215,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
__shared__ uint32_t sharedMemory[1024]; __shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init( sharedMemory ); cn_aes_gpu_init( sharedMemory );
__shared__ uint32_t RCP[256]; uint32_t* RCP;
for (int i = threadIdx.x; i < 256; i += blockDim.x) if(ALGO == cryptonight_monero_v8)
{ {
RCP[i] = RCP_C[i]; __shared__ uint32_t RCP_shared[256];
for (int i = threadIdx.x; i < 256; i += blockDim.x)
{
RCP_shared[i] = RCP_C[i];
}
RCP = RCP_shared;
} }
...@@ -268,14 +273,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -268,14 +273,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
} }
} }
uint32_t bx1, division_result, sqrt_result; uint32_t bx1, sqrt_result;
uint64_t division_result;
if(ALGO == cryptonight_monero_v8) if(ALGO == cryptonight_monero_v8)
{ {
d[1] = (d_ctx_b + thread * 12)[sub]; d[1] = (d_ctx_b + thread * 12)[sub];
bx1 = (d_ctx_b + thread * 12 + 4)[sub]; bx1 = (d_ctx_b + thread * 12 + 4)[sub];
// must be valid only for `sub < 2` // must be valid only for `sub < 2`
division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2]; division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
} }
else else
...@@ -415,21 +421,17 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -415,21 +421,17 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
{ {
// Use division and square root results from the _previous_ iteration to hide the latency // Use division and square root results from the _previous_ iteration to hide the latency
const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
((uint32_t*)&division_result)[1] ^= sqrt_result;
uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
((uint32_t*)&division_result_64)[1] ^= sqrt_result;
if(sub < 2) if(sub < 2)
*((uint64_t*)yy) ^= division_result_64; *((uint64_t*)yy) ^= division_result;
const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL; const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd); division_result = fast_div_v2(RCP, cx1, dd);
division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
// Use division_result as an input for the square root to prevent parallel implementation in hardware // Use division_result as an input for the square root to prevent parallel implementation in hardware
sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp); sqrt_result = fast_sqrt_v2(cx0 + division_result);
} }
uint32_t zz[2]; uint32_t zz[2];
......
...@@ -71,11 +71,11 @@ __device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a, ...@@ -71,11 +71,11 @@ __device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a,
q[1] = (k < a) ? 1 : 0; q[1] = (k < a) ? 1 : 0;
const int64_t tmp = a - *((uint64_t*)(q)) * b; const int64_t tmp = a - *((uint64_t*)(q)) * b;
const bool overshoot = (tmp < 0); const uint32_t overshoot = (tmp < 0) ? 1u : 0U;
const bool undershoot = (tmp >= b); const uint32_t undershoot = (tmp >= b) ? 1u : 0U;
q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); q[0] += undershoot - overshoot;
q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? b : 0U); q[1] = (uint32_t)(tmp) + (overshoot == 1 ? b : 0U) - (undershoot ? b : 0U);
return *((uint64_t*)(q)); return *((uint64_t*)(q));
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment