Skip to content
Snippets Groups Projects
Unverified Commit 5f84ed0d authored by fireice-uk's avatar fireice-uk Committed by GitHub
Browse files

Merge pull request #2366 from psychocrypt/fix-driverMemoryLeakAMD_block

AMD: workaround driver memory leaks
parents d3a7dc1a 920a334f
No related branches found
No related tags found
No related merge requests found
......@@ -13,6 +13,7 @@
#include <chrono>
#include <thread>
#include <iostream>
#include <regex>
namespace xmrstak
......@@ -63,15 +64,15 @@ static std::string get_code(const V4_Instruction* code, int code_size)
struct CacheEntry
{
CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) :
CacheEntry(xmrstak_algo algo, uint64_t height_offset, size_t deviceIdx, cl_program program) :
algo(algo),
height(height),
height_offset(height_offset),
deviceIdx(deviceIdx),
program(program)
{}
xmrstak_algo algo;
uint64_t height;
uint64_t height_offset;
size_t deviceIdx;
cl_program program;
};
......@@ -99,6 +100,34 @@ static std::mutex background_tasks_mutex;
static std::vector<BackgroundTaskBase*> background_tasks;
static std::thread* background_thread = nullptr;
static cl_program search_program(
const GpuContext* ctx,
xmrstak_algo algo,
uint64_t height_offset,
bool lock_cache = true
)
{
if(lock_cache)
CryptonightR_cache_mutex.ReadLock();
// Check if the cache has this program
for (const CacheEntry& entry : CryptonightR_cache)
{
if ((entry.algo == algo) && (entry.height_offset == height_offset) && (entry.deviceIdx == ctx->deviceIdx))
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu found in cache", height_offset);
auto result = entry.program;
if(lock_cache)
CryptonightR_cache_mutex.UnLock();
return result;
}
}
if(lock_cache)
CryptonightR_cache_mutex.UnLock();
return nullptr;
}
static void background_thread_proc()
{
std::vector<BackgroundTaskBase*> tasks;
......@@ -133,60 +162,48 @@ static void background_exec(T&& func)
static cl_program CryptonightR_build_program(
const GpuContext* ctx,
xmrstak_algo algo,
uint64_t height,
uint64_t height_offset,
uint64_t height_chunk_size,
uint32_t precompile_count,
std::string source_code,
std::string options)
{
std::vector<cl_program> old_programs;
old_programs.reserve(32);
{
std::vector<cl_program> old_programs;
old_programs.reserve(32);
{
CryptonightR_cache_mutex.WriteLock();
// Remove old programs from cache
for(size_t i = 0; i < CryptonightR_cache.size();)
{
const CacheEntry& entry = CryptonightR_cache[i];
if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
old_programs.push_back(entry.program);
CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
CryptonightR_cache.pop_back();
}
else
{
++i;
}
}
// Remove old programs from cache
for(size_t i = 0; i < CryptonightR_cache.size();)
{
const CacheEntry& entry = CryptonightR_cache[i];
if ((entry.algo == algo) && (entry.height_offset + (2 + precompile_count) * height_chunk_size < height_offset))
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu released (old program)", entry.height_offset);
old_programs.push_back(entry.program);
CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
CryptonightR_cache.pop_back();
}
else
{
++i;
}
}
CryptonightR_cache_mutex.UnLock();
}
}
for(cl_program p : old_programs) {
clReleaseProgram(p);
}
for(cl_program p : old_programs)
{
clReleaseProgram(p);
}
std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
cl_program program = nullptr;
{
CryptonightR_cache_mutex.ReadLock();
cl_program program = search_program(ctx, algo, height_offset);
// Check if the cache already has this program (some other thread might have added it first)
for (const CacheEntry& entry : CryptonightR_cache)
{
if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx))
{
program = entry.program;
break;
}
}
CryptonightR_cache_mutex.UnLock();
}
if (program) {
return program;
}
if(program) {
return program;
}
cl_int ret;
const char* source = source_code.c_str();
......@@ -239,54 +256,83 @@ static cl_program CryptonightR_build_program(
}
while(status == CL_BUILD_IN_PROGRESS);
CryptonightR_cache_mutex.WriteLock();
auto cached_program = search_program(ctx, algo, height_offset, false);
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
if(cached_program)
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: release already existing program %llu", height_offset);
clReleaseProgram(program);
program = cached_program;
}
else
{
CryptonightR_cache.emplace_back(algo, height_offset, ctx->deviceIdx, program);
printer::inst()->print_msg(LDEBUG, "CryptonightR: cache compiled program for height_offset %llu", height_offset);
}
CryptonightR_cache_mutex.WriteLock();
CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program);
CryptonightR_cache_mutex.UnLock();
return program;
return program;
}
cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background)
cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background)
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);
if (background)
{
background_exec([=](){ CryptonightR_get_program(ctx, algo, height_offset, height_chunk_size, precompile_count, false); });
return nullptr;
}
if (background) {
background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false); });
return nullptr;
}
auto program = search_program(ctx, algo, height_offset);
const char* source_code_template =
#include "amd_gpu/opencl/wolf-aes.cl"
#include "amd_gpu/opencl/cryptonight_r.cl"
;
const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
const char* offset = strstr(source_code_template, include_name);
if (!offset)
{
printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo);
return nullptr;
}
if(program != nullptr)
return program;
V4_Instruction code[256];
int code_size;
switch (algo.Id())
{
case cryptonight_r_wow:
code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
break;
case cryptonight_r:
code_size = v4_random_math_init<cryptonight_r>(code, height);
break;
default:
printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
return nullptr;
}
printer::inst()->print_msg(LDEBUG, "CryptonightR: create code for block %llu to %llu",height_offset, height_offset + height_chunk_size);
const char* source_code_definitions=
#include "amd_gpu/opencl/wolf-aes.cl"
#include "amd_gpu/opencl/cryptonight_r_def.rtcl"
;
const char* source_code_template =
#include "amd_gpu/opencl/cryptonight_r.rtcl"
;
const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
const char* offset = strstr(source_code_template, include_name);
if (!offset)
{
printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo);
return nullptr;
}
std::string source_code(source_code_definitions);
for(uint64_t c = 0; c < height_chunk_size; ++c)
{
V4_Instruction code[256];
int code_size;
switch (algo.Id())
{
case cryptonight_r_wow:
code_size = v4_random_math_init<cryptonight_r_wow>(code, height_offset + c);
break;
case cryptonight_r:
code_size = v4_random_math_init<cryptonight_r>(code, height_offset + c);
break;
default:
printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
return nullptr;
}
std::string source_code(source_code_template, offset);
source_code.append(get_code(code, code_size));
source_code.append(offset + sizeof(include_name) - 1);
std::string kernel_code(source_code_template, offset);
kernel_code.append(get_code(code, code_size));
kernel_code.append(offset + sizeof(include_name) - 1);
std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height_offset + c);
source_code += std::regex_replace(kernel_code, std::regex("cn1_cryptonight_r"), kernel_name);
}
// scratchpad size for the selected mining algorithm
size_t hashMemSize = algo.Mem();
......@@ -325,27 +371,12 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
options += " -cl-fp32-correctly-rounded-divide-sqrt";
const char* source = source_code.c_str();
program = search_program(ctx, algo, height_offset);
{
CryptonightR_cache_mutex.ReadLock();
// Check if the cache has this program
for (const CacheEntry& entry : CryptonightR_cache)
{
if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx))
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
auto result = entry.program;
CryptonightR_cache_mutex.UnLock();
return result;
}
}
CryptonightR_cache_mutex.UnLock();
}
if(program != nullptr)
return program;
return CryptonightR_build_program(ctx, algo, height, precompile_count, source, options);
return CryptonightR_build_program(ctx, algo, height_offset, precompile_count, height_chunk_size, source_code, options);
}
} // namespace amd
......
......@@ -20,7 +20,7 @@ namespace amd
{
cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
uint64_t height, uint32_t precompile_count, bool background = false);
uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background = false);
} // namespace amd
} // namespace xmrstak
......@@ -938,14 +938,17 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {
uint32_t PRECOMPILATION_DEPTH = 4;
uint32_t PRECOMPILATION_DEPTH = 1;
constexpr uint64_t height_chunk_size = 25;
uint64_t height_offset = (height / height_chunk_size) * height_chunk_size;
// Get new kernel
cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);
cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset, height_chunk_size, PRECOMPILATION_DEPTH);
if (program != ctx->ProgramCryptonightR) {
if (program != ctx->ProgramCryptonightR || ctx->last_block_height != height) {
cl_int ret;
cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret);
std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height);
cl_kernel kernel = clCreateKernel(program, kernel_name.c_str(), &ret);
if (ret != CL_SUCCESS) {
printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret));
......@@ -958,10 +961,12 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
Kernels[1] = kernel;
}
ctx->ProgramCryptonightR = program;
ctx->last_block_height = height;
printer::inst()->print_msg(LDEBUG, "Set height %llu", height);
// Precompile next program in background
for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true);
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset + i * height_chunk_size, height_chunk_size, PRECOMPILATION_DEPTH, true);
printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
}
......
......@@ -56,6 +56,7 @@ struct GpuContext
std::map<xmrstak_algo_id, cl_program> Program;
std::map<xmrstak_algo_id, std::array<cl_kernel,8>> Kernels;
cl_program ProgramCryptonightR = nullptr;
uint64_t last_block_height = 0u;
size_t freeMem;
size_t maxMemPerAlloc;
int computeUnits;
......
R"===(
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -15,29 +16,15 @@ R"===(
*
*/
#define cryptonight_r_wow 15
#define cryptonight_r 16
#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
#if(STRIDED_INDEX==0)
# define IDX(x) (x)
#elif(STRIDED_INDEX==1)
# define IDX(x) (mul24(((uint)(x)), Threads))
#elif(STRIDED_INDEX==2)
# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
#elif(STRIDED_INDEX==3)
# define IDX(x) ((x) * WORKSIZE)
#endif
#ifndef SCRATCHPAD_CHUNK
// __NV_CL_C_VERSION checks if NVIDIA opencl is used
#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
#else
# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)])
# if((ALGO == cryptonight_r_wow || ALGO == cryptonight_r) && defined(__NV_CL_C_VERSION))
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
# else
# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)])
# endif
#endif
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
......@@ -162,7 +149,9 @@ __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *stat
#endif
#define ROT_BITS 32
XMRSTAK_INCLUDE_RANDOM_MATH
XMRSTAK_INCLUDE_RANDOM_MATH
#undef ROT_BITS
#if (ALGO == cryptonight_r)
......
R"===(
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#define cryptonight_r_wow 15
#define cryptonight_r 16
#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
#if(STRIDED_INDEX==0)
# define IDX(x) (x)
#elif(STRIDED_INDEX==1)
# define IDX(x) (mul24(((uint)(x)), Threads))
#elif(STRIDED_INDEX==2)
# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
#elif(STRIDED_INDEX==3)
# define IDX(x) ((x) * WORKSIZE)
#endif
)==="
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment