CUDA: reduce cn-v8 shared mem footprint

Use only the half AES matrix and compute the other half in place. This PR increases the possible occupancy.

CUDA: reduce cn-v8 shared mem footprint
Use only the half AES matrix and compute the other half in place. This PR increases the possible occupancy.
ae8ba7f0 · psychocrypt · 447fef4b · ae8ba7f0 · ae8ba7f0
Commit ae8ba7f0 authored 6 years ago by psychocrypt
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -303,3 +303,9 @@ __device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+{
+        for(int i = threadIdx.x; i < 512; i += blockDim.x)
+                sharedMemory[i] = d_t_fn[i];
+}
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -121,6 +121,11 @@ __device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
 #endif
 }

+__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+{
+	return (n >> 16u) | (n << 16u);
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY>
 __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
@@ -267,9 +272,9 @@ __launch_bounds__( XMR_STAK_THREADS * 2 )
 __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
 		uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[512];

-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init_half( sharedMemory );

 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
@@ -340,8 +345,8 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];

 		u64 cx_aes = ax0 ^ u64(
-			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
-			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 		);

 		if(ALGO == cryptonight_monero_v8)
@@ -840,9 +845,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-	
+
 	if(miner_algo == invalid_algo) return;
-	
+
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 0>,
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 1>,