From b361b3950ca6b7904e9d07d0cb6ad0328e2b19b0 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 10 Feb 2019 12:48:34 +0100
Subject: [PATCH] CUDA: use shared mem object

Combine the shared memory for a hash into one struct.
Reduce the shared memory footprint per hash by 64 bytes.
---
 .../amd/amd_gpu/opencl/cryptonight_gpu.cl     | 58 ++++++++++---------
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu |  2 +-
 .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 55 +++++++++---------
 3 files changed, 59 insertions(+), 56 deletions(-)
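
Note (annotation, ignored by git am): the 64-byte saving follows from the
two layouts. The old code reserved 4 + 16 + 16 = 36 vectors of 16 byte per
hash (576 byte); the new SharedMemChunk packs 16 + 16 = 32 vectors
(512 byte). A minimal compile-time sketch of this arithmetic, using a
hypothetical 16-byte stand-in type vec16 in place of int4/__m128i/__m128
(builds as C++11; not part of the patch):

	struct vec16 { unsigned char b[16]; };                  // 16-byte stand-in
	struct SharedMemChunk { vec16 out[16]; vec16 va[16]; }; // new layout

	// old per-hash layout: smem[4] + smemOut[16] + smemVa[16] = 36 x 16 byte
	static_assert((4 + 16 + 16) * 16 == 576, "old footprint per hash");
	// new per-hash layout: one struct of 32 x 16 byte
	static_assert(sizeof(SharedMemChunk) == 32 * 16, "new footprint per hash");
	// difference: 576 - 512 = 64 byte saved per hash
	static_assert(576 - sizeof(SharedMemChunk) == 64, "64 byte saved per hash");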
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
index f73312a..4420169 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -195,6 +195,12 @@ static const __constant float ccnt[16] = {
 	1.4609375f
 };
 
+struct SharedMemChunk
+{
+	int4 out[16];
+	float4 va[16];
+};
+
 __attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1)))
 __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads)
 {
@@ -211,13 +217,8 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
 	__global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16));
 #endif
 
-	__local int4 smem2[1 * 4 * WORKSIZE];
-	__local int4 smemOut2[1 * 16 * WORKSIZE];
-	__local float4 smemVa2[1 * 16 * WORKSIZE];
-
-	__local int4* smem = smem2 + 4 * chunk;
-	__local int4* smemOut = smemOut2 + 16 * chunk;
-	__local float4* smemVa = smemVa2 + 16 * chunk;
+	__local struct SharedMemChunk smem_in[WORKSIZE];
+	__local struct SharedMemChunk* smem = smem_in + chunk;
 
 	uint tid = get_local_id(0) % 16;
 
@@ -235,50 +236,51 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
 	for(size_t i = 0; i < ITERATIONS; i++)
 	{
 		mem_fence(CLK_LOCAL_MEM_FENCE);
-		((__local int*)smem)[tid] = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm];
+		int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm];
+		((__local int*)smem)[tid] = tmp;
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
 		{
 			single_comupte_wrap(
 				tidm,
-				*(smem + look[tid][0]),
-				*(smem + look[tid][1]),
-				*(smem + look[tid][2]),
-				*(smem + look[tid][3]),
-				ccnt[tid], vs, smemVa + tid,
-				smemOut + tid
+				*(smem->out + look[tid][0]),
+				*(smem->out + look[tid][1]),
+				*(smem->out + look[tid][2]),
+				*(smem->out + look[tid][3]),
+				ccnt[tid], vs, smem->va + tid,
+				smem->out + tid
 			);
 		}
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
-		int outXor = ((__local int*)smemOut)[block];
+		int outXor = ((__local int*)smem->out)[block];
 		for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
-			outXor ^= ((__local int*)smemOut)[dd];
+			outXor ^= ((__local int*)smem->out)[dd];
 
-		((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ ((__local int*)smem)[tid];
-		((__local int*)smemOut)[tid] = outXor;
+		((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp;
+		((__local int*)smem->out)[tid] = outXor;
 
-		float va_tmp1 = ((__local float*)smemVa)[block] + ((__local float*)smemVa)[block + 4];
-		float va_tmp2 = ((__local float*)smemVa)[block+ 8] + ((__local float*)smemVa)[block + 12];
-		((__local float*)smemVa)[tid] = va_tmp1 + va_tmp2;
+		float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
+		float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12];
+		((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
-		int out2 = ((__local int*)smemOut)[tid] ^ ((__local int*)smemOut)[tid + 4 ] ^ ((__local int*)smemOut)[tid + 8] ^ ((__local int*)smemOut)[tid + 12];
-		va_tmp1 = ((__local float*)smemVa)[block] + ((__local float*)smemVa)[block + 4];
-		va_tmp2 = ((__local float*)smemVa)[block + 8] + ((__local float*)smemVa)[block + 12];
+		int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12];
+		va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
+		va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12];
 		va_tmp1 = va_tmp1 + va_tmp2;
 		va_tmp1 = fabs(va_tmp1);
 
 		float xx = va_tmp1 * 16777216.0f;
 		int xx_int = (int)xx;
-		((__local int*)smemOut)[tid] = out2 ^ xx_int;
-		((__local float*)smemVa)[tid] = va_tmp1 / 64.0f;
+		((__local int*)smem->out)[tid] = out2 ^ xx_int;
+		((__local float*)smem->va)[tid] = va_tmp1 / 64.0f;
 
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
-		vs = smemVa[0];
-		s = smemOut->x ^ smemOut->y ^ smemOut->z ^ smemOut->w;
+		vs = smem->va[0];
+		s = smem->out->x ^ smem->out->y ^ smem->out->z ^ smem->out->w;
 	}
 }
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 3f5b9c0..e151e8c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -895,7 +895,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_
 		ctx->device_id,
 		// 36 x 16byte x numThreads
 		xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu
-			<<<ctx->device_blocks, ctx->device_threads * 16, 36 * 16 * ctx->device_threads>>>
+			<<<ctx->device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>>
 			(
 				ITERATIONS,
 				MEM,
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index a2522b7..fee7e13 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -411,6 +411,12 @@ __forceinline__ __device__ void sync()
 #endif
 }
 
+struct SharedMemChunk
+{
+	__m128i out[16];
+	__m128 va[16];
+};
+
 __global__ void cryptonight_core_gpu_phase2_gpu(
 	const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
 	int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
@@ -418,20 +424,14 @@
 {
 	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
 
-	extern __shared__ __m128i smemExtern_in[];
+	extern __shared__ SharedMemChunk smemExtern_in[];
 
 	const uint32_t chunk = threadIdx.x / 16;
 	const uint32_t numHashPerBlock = blockDim.x / 16;
 
 	int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk));
 
-	__m128i* smem = smemExtern_in + 4 * chunk;
-
-	__m128i* smemExtern = smemExtern_in + numHashPerBlock * 4;
-	__m128i* smemOut = smemExtern + 16 * chunk;
-
-	smemExtern = smemExtern + numHashPerBlock * 16;
-	__m128* smemVa = (__m128*)smemExtern + 16 * chunk;
+	SharedMemChunk* smem = smemExtern_in + chunk;
 
 	uint32_t tid = threadIdx.x % 16;
 
@@ -458,43 +458,44 @@
 	for(size_t i = 0; i < batchsize; i++)
 	{
 		sync();
-		((int*)smem)[tid] = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm];
+		int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm];
+		((int*)smem->out)[tid] = tmp;
 		sync();
 
 		__m128 rc = vs;
 		single_comupte_wrap(
 			tidm,
-			*(smem + look[tid][0]),
-			*(smem + look[tid][1]),
-			*(smem + look[tid][2]),
-			*(smem + look[tid][3]),
-			ccnt[tid], rc, smemVa[tid],
-			smemOut[tid]
+			*(smem->out + look[tid][0]),
+			*(smem->out + look[tid][1]),
+			*(smem->out + look[tid][2]),
+			*(smem->out + look[tid][3]),
+			ccnt[tid], rc, smem->va[tid],
+			smem->out[tid]
 		);
 		sync();
 
-		int outXor = ((int*)smemOut)[block];
+		int outXor = ((int*)smem->out)[block];
 		for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
-			outXor ^= ((int*)smemOut)[dd];
+			outXor ^= ((int*)smem->out)[dd];
 
-		((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ ((int*)smem)[tid];
-		((int*)smemOut)[tid] = outXor;
+		((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp;
+		((int*)smem->out)[tid] = outXor;
 
-		float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
-		float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12];
-		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
+		float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
+		float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12];
+		((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 		sync();
 
-		__m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3];
-		va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
-		va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12];
-		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
+		__m128i out2 = smem->out[0] ^ smem->out[1] ^ smem->out[2] ^ smem->out[3];
+		va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
+		va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12];
+		((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 		sync();
 
-		vs = smemVa[0];
+		vs = smem->va[0];
 		vs.abs(); // take abs(va) by masking the float sign bit
 		auto xx = _mm_mul_ps(vs, __m128(16777216.0f));
 		// vs range 0 - 64
-- 
GitLab
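
Note (annotation, not part of the patch): the new dynamic shared memory
size in the cuda_core.cu launch, 32 * 16 byte per hash, is exactly
sizeof(SharedMemChunk), so the launch and the kernel-side struct stay in
sync. A minimal CUDA sketch of that relationship, with hypothetical names
(phase2_sketch, launch_sketch, numHashesPerBlock) and the built-in
int4/float4 as stand-ins for xmr-stak's __m128i/__m128:

	struct SharedMemChunk { int4 out[16]; float4 va[16]; }; // 32 x 16 byte

	__global__ void phase2_sketch()
	{
		// one chunk per hash; 16 threads cooperate on one hash
		extern __shared__ SharedMemChunk smemExtern_in[];
		SharedMemChunk* smem = smemExtern_in + threadIdx.x / 16;
		(void)smem; // the real kernel works on smem->out and smem->va
	}

	void launch_sketch(int blocks, int numHashesPerBlock)
	{
		// previously 36 * 16 byte per hash; now sizeof(SharedMemChunk) = 32 * 16
		phase2_sketch<<<blocks, numHashesPerBlock * 16,
			sizeof(SharedMemChunk) * numHashesPerBlock>>>();
	}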