diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 12468093c9fa043ce9aa43337aa378ceca889444..6354f60f03e4a5f5afdfe37cb393bba156cc47fd 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -96,6 +96,7 @@ private: " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + + " \"comp_mode\" : true,\n" + " },\n"; } } diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 144da80b9261abf0b4ad93d69604195bb4919da7..e2a76d90f1b8bb93e208c1e26c255da02b2cde5d 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -16,6 +16,9 @@ R"===(// generated by XMRSTAK_VERSION * 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu * 2 = cudaDeviceScheduleYield * 3 = cudaDeviceScheduleBlockingSync (default) + * comp_mode - Compatibility if true it will use 64bit memory loads and if false it will use + * 128bit memory loads (can produce invalid results) + * (this option has only a meaning for cryptonight_v8 and monero) * * On the first run the miner will look at your system and suggest a basic configuration that will work, * you can try to tweak it from there to get the best performance. diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index c9d4f194c0097d747e16dddb0cfac33731bb6f36..b1059f359313b6e165251f6e902aa99bc51cf939 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode; + const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *compMode; gid = GetObjectMember(oThdConf, "index"); blocks = GetObjectMember(oThdConf, "blocks"); threads = GetObjectMember(oThdConf, "threads"); @@ -131,9 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) bsleep = GetObjectMember(oThdConf, "bsleep"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); syncMode = GetObjectMember(oThdConf, "sync_mode"); + compMode = GetObjectMember(oThdConf, "comp_mode"); if(gid == nullptr || blocks == nullptr || threads == nullptr || - bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr) + bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr || + compMode == nullptr) { return false; } @@ -161,13 +163,19 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)"); return false; } + + if(!compMode->IsBool()) + return false; + + cfg.id = gid->GetInt(); cfg.blocks = blocks->GetInt(); cfg.threads = threads->GetInt(); cfg.bfactor = bfactor->GetInt(); cfg.bsleep = bsleep->GetInt(); cfg.syncMode = syncMode->GetInt(); - + cfg.compMode = compMode->GetBool(); + if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); else diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index b4ebaa03549c003ae82b1d7e96e524070f8d47d6..5ee1f81339fd2c9d35932c05022a427711f1f477 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -29,6 +29,7 @@ public: bool bNoPrefetch; int32_t cpu_aff; int syncMode; + bool compMode; long long iCpuAff; }; diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 0153eed199de0db8fb28b7586cc5832bf009c87a..135f26ea9768a4bb8cb84546e06b583c2c5a69fc 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -78,6 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.device_bfactor = (int)cfg.bfactor; ctx.device_bsleep = (int)cfg.bsleep; ctx.syncMode = cfg.syncMode; + ctx.compMode = cfg.compMode; this->affinity = cfg.cpu_aff; std::future<void> numa_guard = numa_promise.get_future(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index d588641b4849864dfa653c44652963c557bccb10..8167395e3ec3aa6c5247178e0d337575ae587636 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -16,6 +16,7 @@ typedef struct { int device_bfactor; int device_bsleep; int syncMode; + bool compMode; uint32_t *d_input; uint32_t inputlen; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 8e69c202974a426aec574f3a594a60aff6ca1ce3..1c9c9df64bbcb73f588effde0ed3542515aea674 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -5,6 +5,7 @@ #include <string.h> #include <cuda.h> #include <cuda_runtime.h> +#include <bitset> #include "xmrstak/jconf.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" @@ -254,8 +255,12 @@ struct u64 : public uint2 } }; - -template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO> +/** cryptonight with two threads per hash + * + * @tparam COMP_MODE if true than 64bit memory transfers per thread will be used to store/load data within shared memory + * else 128bit operations will be used + */ +template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, bool COMP_MODE> #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 2 ) #endif @@ -329,7 +334,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in { ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; - ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + } + else + ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; uint32_t idx1 = (idx0 & 0x30) >> 3; @@ -358,13 +372,31 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } myChunks[ idx1 + sub ] = cx_aes ^ bx0; - ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + } + else + ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; - ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + } + else + ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; if(ALGO != cryptonight_monero_v8) bx0 = cx_aes; @@ -420,7 +452,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in bx0 = cx_aes; } myChunks[ idx1 + sub ] = ax0; - ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + } + else + ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; ax0 ^= c; idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0); } @@ -699,7 +740,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); } -template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO> +template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, bool COMP_MODE> void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) { dim3 grid( ctx->device_blocks ); @@ -741,7 +782,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO><<< + cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, COMP_MODE><<< grid, block2, sizeof(uint64_t) * block2.x * 8 + @@ -807,26 +848,50 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) } } -typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); - void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { + typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); + if(miner_algo == invalid_algo) return; static const cuda_hash_fn func_table[] = { - cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>, - cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>, - cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>, - cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>, - cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon>, - cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc>, - cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite>, - cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari>, - cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>, - cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>, - cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8> + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, true>, + + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, false>, + cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, true> }; - cuda_hash_fn selected_function = func_table[ miner_algo - 1u ]; + std::bitset<1> digit; + digit.set(0, ctx->compMode); + + cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; selected_function(ctx, startNonce); }