Skip to content
Snippets Groups Projects
Commit b361b395 authored by psychocrypt's avatar psychocrypt
Browse files

CUDA: use shared mem object

Combine the shared memory for a hash within one struct.
Reduce the shared memory footprint per hash by 64 byte.
parent 0c26cb7e
No related branches found
No related tags found
No related merge requests found
......@@ -195,6 +195,12 @@ static const __constant float ccnt[16] = {
1.4609375f
};
/* Per-hash scratch area kept in __local memory.
 * Combines the former separate smemOut/smemVa buffers into one struct so the
 * kernel allocates a single __local array indexed by the hash (chunk) id.
 * out: 16 int4 slots (256 bytes) — per-lane single_comupte_wrap results,
 *      later reduced via XOR.
 * va:  16 float4 slots (256 bytes) — per-lane floating-point accumulators,
 *      later reduced by summation.
 * Total: 512 bytes (= 32 x 16 bytes) of __local memory per hash. */
struct SharedMemChunk
{
	int4 out[16];
	float4 va[16];
};
__attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1)))
__kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads)
{
......@@ -211,13 +217,8 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
__global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16));
#endif
__local int4 smem2[1 * 4 * WORKSIZE];
__local int4 smemOut2[1 * 16 * WORKSIZE];
__local float4 smemVa2[1 * 16 * WORKSIZE];
__local int4* smem = smem2 + 4 * chunk;
__local int4* smemOut = smemOut2 + 16 * chunk;
__local float4* smemVa = smemVa2 + 16 * chunk;
__local struct SharedMemChunk smem_in[WORKSIZE];
__local struct SharedMemChunk* smem = smem_in + chunk;
uint tid = get_local_id(0) % 16;
......@@ -235,50 +236,51 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
for(size_t i = 0; i < ITERATIONS; i++)
{
mem_fence(CLK_LOCAL_MEM_FENCE);
((__local int*)smem)[tid] = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm];
int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm];
((__local int*)smem)[tid] = tmp;
mem_fence(CLK_LOCAL_MEM_FENCE);
{
single_comupte_wrap(
tidm,
*(smem + look[tid][0]),
*(smem + look[tid][1]),
*(smem + look[tid][2]),
*(smem + look[tid][3]),
ccnt[tid], vs, smemVa + tid,
smemOut + tid
*(smem->out + look[tid][0]),
*(smem->out + look[tid][1]),
*(smem->out + look[tid][2]),
*(smem->out + look[tid][3]),
ccnt[tid], vs, smem->va + tid,
smem->out + tid
);
}
mem_fence(CLK_LOCAL_MEM_FENCE);
int outXor = ((__local int*)smemOut)[block];
int outXor = ((__local int*)smem->out)[block];
for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
outXor ^= ((__local int*)smemOut)[dd];
outXor ^= ((__local int*)smem->out)[dd];
((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ ((__local int*)smem)[tid];
((__local int*)smemOut)[tid] = outXor;
((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp;
((__local int*)smem->out)[tid] = outXor;
float va_tmp1 = ((__local float*)smemVa)[block] + ((__local float*)smemVa)[block + 4];
float va_tmp2 = ((__local float*)smemVa)[block+ 8] + ((__local float*)smemVa)[block + 12];
((__local float*)smemVa)[tid] = va_tmp1 + va_tmp2;
float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12];
((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2;
mem_fence(CLK_LOCAL_MEM_FENCE);
int out2 = ((__local int*)smemOut)[tid] ^ ((__local int*)smemOut)[tid + 4 ] ^ ((__local int*)smemOut)[tid + 8] ^ ((__local int*)smemOut)[tid + 12];
va_tmp1 = ((__local float*)smemVa)[block] + ((__local float*)smemVa)[block + 4];
va_tmp2 = ((__local float*)smemVa)[block + 8] + ((__local float*)smemVa)[block + 12];
int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12];
va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12];
va_tmp1 = va_tmp1 + va_tmp2;
va_tmp1 = fabs(va_tmp1);
float xx = va_tmp1 * 16777216.0f;
int xx_int = (int)xx;
((__local int*)smemOut)[tid] = out2 ^ xx_int;
((__local float*)smemVa)[tid] = va_tmp1 / 64.0f;
((__local int*)smem->out)[tid] = out2 ^ xx_int;
((__local float*)smem->va)[tid] = va_tmp1 / 64.0f;
mem_fence(CLK_LOCAL_MEM_FENCE);
vs = smemVa[0];
s = smemOut->x ^ smemOut->y ^ smemOut->z ^ smemOut->w;
vs = smem->va[0];
s = smem->out->x ^ smem->out->y ^ smem->out->z ^ smem->out->w;
}
}
......
......@@ -895,7 +895,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_
ctx->device_id,
	// 32 x 16byte x numThreads (sizeof(SharedMemChunk) = 512 byte per hash)
xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu
<<<ctx->device_blocks, ctx->device_threads * 16, 36 * 16 * ctx->device_threads>>>
<<<ctx->device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>>
(
ITERATIONS,
MEM,
......
......@@ -411,6 +411,12 @@ __forceinline__ __device__ void sync()
#endif
}
/* Per-hash scratch area kept in dynamic shared memory (extern __shared__),
 * indexed by the hash (chunk) id within the block. CUDA counterpart of the
 * OpenCL SharedMemChunk: it merges the previously separate smem/smemOut/smemVa
 * regions into one struct.
 * out: 16 __m128i slots — per-lane results of single_comupte_wrap, reduced
 *      via XOR. NOTE(review): __m128i here is the project's device-side
 *      128-bit vector type, not the x86 SSE intrinsic — confirm in headers.
 * va:  16 __m128 slots — per-lane float accumulators, reduced by summation.
 * Total: 512 bytes (= 32 x 16 bytes) of shared memory per hash; the host
 * launch must pass 32 * 16 * hashesPerBlock as the dynamic smem size. */
struct SharedMemChunk
{
	__m128i out[16];
	__m128 va[16];
};
__global__ void cryptonight_core_gpu_phase2_gpu(
const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
......@@ -418,20 +424,14 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
extern __shared__ __m128i smemExtern_in[];
extern __shared__ SharedMemChunk smemExtern_in[];
const uint32_t chunk = threadIdx.x / 16;
const uint32_t numHashPerBlock = blockDim.x / 16;
int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk));
__m128i* smem = smemExtern_in + 4 * chunk;
__m128i* smemExtern = smemExtern_in + numHashPerBlock * 4;
__m128i* smemOut = smemExtern + 16 * chunk;
smemExtern = smemExtern + numHashPerBlock * 16;
__m128* smemVa = (__m128*)smemExtern + 16 * chunk;
SharedMemChunk* smem = smemExtern_in + chunk;
uint32_t tid = threadIdx.x % 16;
......@@ -458,43 +458,44 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
for(size_t i = 0; i < batchsize; i++)
{
sync();
((int*)smem)[tid] = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm];
int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm];
((int*)smem->out)[tid] = tmp;
sync();
__m128 rc = vs;
single_comupte_wrap(
tidm,
*(smem + look[tid][0]),
*(smem + look[tid][1]),
*(smem + look[tid][2]),
*(smem + look[tid][3]),
ccnt[tid], rc, smemVa[tid],
smemOut[tid]
*(smem->out + look[tid][0]),
*(smem->out + look[tid][1]),
*(smem->out + look[tid][2]),
*(smem->out + look[tid][3]),
ccnt[tid], rc, smem->va[tid],
smem->out[tid]
);
sync();
int outXor = ((int*)smemOut)[block];
int outXor = ((int*)smem->out)[block];
for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
outXor ^= ((int*)smemOut)[dd];
outXor ^= ((int*)smem->out)[dd];
((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ ((int*)smem)[tid];
((int*)smemOut)[tid] = outXor;
((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp;
((int*)smem->out)[tid] = outXor;
float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12];
((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12];
((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
sync();
__m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3];
va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12];
((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
__m128i out2 = smem->out[0] ^ smem->out[1] ^ smem->out[2] ^ smem->out[3];
va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12];
((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
sync();
vs = smemVa[0];
vs = smem->va[0];
vs.abs(); // take abs(va) by masking the float sign bit
auto xx = _mm_mul_ps(vs, __m128(16777216.0f));
// vs range 0 - 64
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment