diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp index ccb836e41b13a36e4a9b8708c8aafddda991326d..9720c7815c70c3df0ca832a009e8abc51b846229 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.cpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp @@ -13,6 +13,7 @@ #include <chrono> #include <thread> #include <iostream> +#include <regex> namespace xmrstak @@ -63,15 +64,15 @@ static std::string get_code(const V4_Instruction* code, int code_size) struct CacheEntry { - CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) : + CacheEntry(xmrstak_algo algo, uint64_t height_offset, size_t deviceIdx, cl_program program) : algo(algo), - height(height), + height_offset(height_offset), deviceIdx(deviceIdx), program(program) {} xmrstak_algo algo; - uint64_t height; + uint64_t height_offset; size_t deviceIdx; cl_program program; }; @@ -99,6 +100,34 @@ static std::mutex background_tasks_mutex; static std::vector<BackgroundTaskBase*> background_tasks; static std::thread* background_thread = nullptr; +static cl_program search_program( + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height_offset, + bool lock_cache = true +) +{ + if(lock_cache) + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache has this program + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height_offset == height_offset) && (entry.deviceIdx == ctx->deviceIdx)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu found in cache", height_offset); + auto result = entry.program; + if(lock_cache) + CryptonightR_cache_mutex.UnLock(); + return result; + } + } + if(lock_cache) + CryptonightR_cache_mutex.UnLock(); + + return nullptr; +} + static void background_thread_proc() { std::vector<BackgroundTaskBase*> tasks; @@ -133,60 +162,48 @@ static void background_exec(T&& func) static cl_program CryptonightR_build_program( const GpuContext* ctx, xmrstak_algo algo, - uint64_t height, + uint64_t height_offset, + uint64_t height_chunk_size, uint32_t precompile_count, std::string source_code, std::string options) { - std::vector<cl_program> old_programs; - old_programs.reserve(32); - { + std::vector<cl_program> old_programs; + old_programs.reserve(32); + { CryptonightR_cache_mutex.WriteLock(); - // Remove old programs from cache - for(size_t i = 0; i < CryptonightR_cache.size();) - { - const CacheEntry& entry = CryptonightR_cache[i]; - if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); - old_programs.push_back(entry.program); - CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); - CryptonightR_cache.pop_back(); - } - else - { - ++i; - } - } + // Remove old programs from cache + for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if ((entry.algo == algo) && (entry.height_offset + (2 + precompile_count) * height_chunk_size < height_offset)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu released (old program)", entry.height_offset); + old_programs.push_back(entry.program); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } CryptonightR_cache_mutex.UnLock(); - } + } - for(cl_program p : old_programs) { - clReleaseProgram(p); - } + for(cl_program p : old_programs) + { + clReleaseProgram(p); + } - std::lock_guard<std::mutex> g1(CryptonightR_build_mutex); + std::lock_guard<std::mutex> g1(CryptonightR_build_mutex); - cl_program program = nullptr; - { - CryptonightR_cache_mutex.ReadLock(); + cl_program program = search_program(ctx, algo, height_offset); - // Check if the cache already has this program (some other thread might have added it first) - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) - { - program = entry.program; - break; - } - } - CryptonightR_cache_mutex.UnLock(); - } - - if (program) { - return program; - } + if(program) { + return program; + } cl_int ret; const char* source = source_code.c_str(); @@ -239,54 +256,83 @@ static cl_program CryptonightR_build_program( } while(status == CL_BUILD_IN_PROGRESS); + CryptonightR_cache_mutex.WriteLock(); + auto cached_program = search_program(ctx, algo, height_offset, false); - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + if(cached_program) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: release already existing program %llu", height_offset); + clReleaseProgram(program); + program = cached_program; + } + else + { + CryptonightR_cache.emplace_back(algo, height_offset, ctx->deviceIdx, program); + printer::inst()->print_msg(LDEBUG, "CryptonightR: cache compiled program for height_offset %llu", height_offset); + } - CryptonightR_cache_mutex.WriteLock(); - CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program); CryptonightR_cache_mutex.UnLock(); - return program; + return program; } -cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background) +cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background) { - printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height); + if (background) + { + background_exec([=](){ CryptonightR_get_program(ctx, algo, height_offset, height_chunk_size, precompile_count, false); }); + return nullptr; + } - if (background) { - background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false); }); - return nullptr; - } + auto program = search_program(ctx, algo, height_offset); - const char* source_code_template = - #include "amd_gpu/opencl/wolf-aes.cl" - #include "amd_gpu/opencl/cryptonight_r.cl" - ; - const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; - const char* offset = strstr(source_code_template, include_name); - if (!offset) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); - return nullptr; - } + if(program != nullptr) + return program; - V4_Instruction code[256]; - int code_size; - switch (algo.Id()) - { - case cryptonight_r_wow: - code_size = v4_random_math_init<cryptonight_r_wow>(code, height); - break; - case cryptonight_r: - code_size = v4_random_math_init<cryptonight_r>(code, height); - break; - default: - printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo); - return nullptr; - } + printer::inst()->print_msg(LDEBUG, "CryptonightR: create code for block %llu to %llu",height_offset, height_offset + height_chunk_size); + + const char* source_code_definitions= + #include "amd_gpu/opencl/wolf-aes.cl" + #include "amd_gpu/opencl/cryptonight_r_def.rtcl" + ; + + const char* source_code_template = + #include "amd_gpu/opencl/cryptonight_r.rtcl" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if (!offset) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); + return nullptr; + } + + std::string source_code(source_code_definitions); + + for(uint64_t c = 0; c < height_chunk_size; ++c) + { + V4_Instruction code[256]; + int code_size; + switch (algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init<cryptonight_r_wow>(code, height_offset + c); + break; + case cryptonight_r: + code_size = v4_random_math_init<cryptonight_r>(code, height_offset + c); + break; + default: + printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo); + return nullptr; + } - std::string source_code(source_code_template, offset); - source_code.append(get_code(code, code_size)); - source_code.append(offset + sizeof(include_name) - 1); + std::string kernel_code(source_code_template, offset); + kernel_code.append(get_code(code, code_size)); + kernel_code.append(offset + sizeof(include_name) - 1); + + std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height_offset + c); + + source_code += std::regex_replace(kernel_code, std::regex("cn1_cryptonight_r"), kernel_name); + } // scratchpad size for the selected mining algorithm size_t hashMemSize = algo.Mem(); @@ -325,27 +371,12 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t options += " -cl-fp32-correctly-rounded-divide-sqrt"; - const char* source = source_code.c_str(); + program = search_program(ctx, algo, height_offset); - { - CryptonightR_cache_mutex.ReadLock(); - - // Check if the cache has this program - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); - auto result = entry.program; - CryptonightR_cache_mutex.UnLock(); - return result; - } - } - CryptonightR_cache_mutex.UnLock(); - - } + if(program != nullptr) + return program; - return CryptonightR_build_program(ctx, algo, height, precompile_count, source, options); + return CryptonightR_build_program(ctx, algo, height_offset, precompile_count, height_chunk_size, source_code, options); } } // namespace amd diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp index 7dce77b850568eb6fe777c6ef400ece97200544e..b504b5d0c8cb39bbe8c416df0511a78d316ffed0 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp @@ -20,7 +20,7 @@ namespace amd { cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo, - uint64_t height, uint32_t precompile_count, bool background = false); + uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background = false); } // namespace amd } // namespace xmrstak diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 9f3f75469fae7666bd854fca82ea96ffa47aa9c3..6dd7c45ee6af31e84798c2dc5c6fb5f7b6203098 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -938,14 +938,17 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) { - uint32_t PRECOMPILATION_DEPTH = 4; + uint32_t PRECOMPILATION_DEPTH = 1; + constexpr uint64_t height_chunk_size = 25; + uint64_t height_offset = (height / height_chunk_size) * height_chunk_size; // Get new kernel - cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH); + cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset, height_chunk_size, PRECOMPILATION_DEPTH); - if (program != ctx->ProgramCryptonightR) { + if (program != ctx->ProgramCryptonightR || ctx->last_block_height != height) { cl_int ret; - cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret); + std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height); + cl_kernel kernel = clCreateKernel(program, kernel_name.c_str(), &ret); if (ret != CL_SUCCESS) { printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); @@ -958,10 +961,12 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar Kernels[1] = kernel; } ctx->ProgramCryptonightR = program; + ctx->last_block_height = height; + printer::inst()->print_msg(LDEBUG, "Set height %llu", height); // Precompile next program in background for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i) - xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true); + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset + i * height_chunk_size, height_chunk_size, PRECOMPILATION_DEPTH, true); printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); } diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index ae2b506dbeeb673ac68b21c2db8adb20cfde7b61..ff56f0a9e4c588433f23750226cfdc43b53f882a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -56,6 +56,7 @@ struct GpuContext std::map<xmrstak_algo_id, cl_program> Program; std::map<xmrstak_algo_id, std::array<cl_kernel,8>> Kernels; cl_program ProgramCryptonightR = nullptr; + uint64_t last_block_height = 0u; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl similarity index 88% rename from xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl rename to xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl index 9edb774adcbc2a31bb25f3d6e581960d6b71a00f..cdb5aef3edc1f59e2f6604b3a9694e8d27bded96 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl @@ -1,4 +1,5 @@ R"===( + /* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,29 +16,15 @@ R"===( * */ -#define cryptonight_r_wow 15 -#define cryptonight_r 16 - -#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) - -#if(STRIDED_INDEX==0) -# define IDX(x) (x) -#elif(STRIDED_INDEX==1) -# define IDX(x) (mul24(((uint)(x)), Threads)) -#elif(STRIDED_INDEX==2) -# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) -#elif(STRIDED_INDEX==3) -# define IDX(x) ((x) * WORKSIZE) -#endif - +#ifndef SCRATCHPAD_CHUNK // __NV_CL_C_VERSION checks if NVIDIA opencl is used -#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) -# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) -# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) -#else -# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +# if((ALGO == cryptonight_r_wow || ALGO == cryptonight_r) && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) +# else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +# endif #endif - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads) { @@ -162,7 +149,9 @@ __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *stat #endif #define ROT_BITS 32 - XMRSTAK_INCLUDE_RANDOM_MATH +XMRSTAK_INCLUDE_RANDOM_MATH + +#undef ROT_BITS #if (ALGO == cryptonight_r) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl new file mode 100644 index 0000000000000000000000000000000000000000..2c318fcbf4cb4a5ebfc8410b2413fed6437350f0 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl @@ -0,0 +1,33 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) + +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#elif(STRIDED_INDEX==1) +# define IDX(x) (mul24(((uint)(x)), Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) +#endif + +)==="