Skip to content
Snippets Groups Projects
Commit eb8376fa authored by psychocrypt's avatar psychocrypt
Browse files

CUDA: use volatile pointer

Use volatile pointer to be sure that the compiler is not caching the values.
parent 9e51acb6
No related branches found
No related tags found
No related merge requests found
...@@ -278,15 +278,15 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -278,15 +278,15 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
} }
#if( __CUDA_ARCH__ < 300 ) #if( __CUDA_ARCH__ < 300 )
extern __shared__ u64 externShared[]; extern __shared__ uint64_t externShared[];
// 8 x 64bit values // 8 x 64bit values
u64* myChunks = (u64*)(externShared + (threadIdx.x >> 1) * 8); volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8);
volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE);
#else #else
extern __shared__ u64 chunkMem[]; extern __shared__ uint64_t chunkMem[];
volatile uint32_t* sPtr = NULL; volatile uint32_t* sPtr = NULL;
// 8 x 64bit values // 8 x 64bit values
u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8); volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8);
#endif #endif
...@@ -301,25 +301,25 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -301,25 +301,25 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY]; uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
u64 ax0 = ((u64*)(d_ctx_a + thread * 4))[sub]; uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub];
u64 bx0; uint64_t bx0;
uint32_t idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
u64* ptr0; uint64_t* ptr0;
u64 bx1; uint64_t bx1;
uint32_t sqrt_result; uint32_t sqrt_result;
uint64_t division_result; uint64_t division_result;
if(ALGO == cryptonight_monero_v8) if(ALGO == cryptonight_monero_v8)
{ {
bx0 = ((u64*)(d_ctx_b + thread * 12))[sub]; bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
bx1 = ((u64*)(d_ctx_b + thread * 12 + 4))[sub]; bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
} }
else else
bx0 = ((u64*)(d_ctx_b + thread * 4))[sub]; bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
const int start = partidx * batchsize; const int start = partidx * batchsize;
...@@ -327,7 +327,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -327,7 +327,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
for(int i = start; i < end; ++i) for(int i = start; i < end; ++i)
{ {
ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
...@@ -344,9 +344,9 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -344,9 +344,9 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
if(ALGO == cryptonight_monero_v8) if(ALGO == cryptonight_monero_v8)
{ {
const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ]; const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
#if (__CUDACC_VER_MAJOR__ >= 9) #if (__CUDACC_VER_MAJOR__ >= 9)
__syncwarp(); __syncwarp();
#else #else
...@@ -362,7 +362,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -362,7 +362,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
idx1 = (idx0 & 0x30) >> 3; idx1 = (idx0 & 0x30) >> 3;
ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
...@@ -399,10 +399,10 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -399,10 +399,10 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl; uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
if(ALGO == cryptonight_monero_v8) if(ALGO == cryptonight_monero_v8)
{ {
const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
res ^= ((uint64_t*)&chunk2)[0]; res ^= ((uint64_t*)&chunk2)[0];
const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
#if (__CUDACC_VER_MAJOR__ >= 9) #if (__CUDACC_VER_MAJOR__ >= 9)
__syncwarp(); __syncwarp();
#else #else
...@@ -422,16 +422,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -422,16 +422,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
myChunks[ idx1 + sub ] = ax0; myChunks[ idx1 + sub ] = ax0;
((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
ax0 ^= c; ax0 ^= c;
idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
} }
if ( bfactor > 0 ) if ( bfactor > 0 )
{ {
((u64*)(d_ctx_a + thread * 4))[sub] = ax0; ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
if(ALGO == cryptonight_monero_v8) if(ALGO == cryptonight_monero_v8)
{ {
((u64*)(d_ctx_b + thread * 12))[sub] = bx0; ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
((u64*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
if(sub == 1) if(sub == 1)
{ {
...@@ -441,7 +441,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ...@@ -441,7 +441,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
} }
} }
else else
((u64*)(d_ctx_b + thread * 12))[sub] = bx0; ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
} }
} }
......
0% Loading or loading failed.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment