Skip to content
Snippets Groups Projects
Unverified Commit f785481b authored by fireice-uk's avatar fireice-uk Committed by GitHub
Browse files

Merge pull request #1087 from psychocrypt/topic-blockedStride3

AMD: option `mem_chunk`and new `strided_index`
parents 84febdf6 737185ee
No related branches found
No related tags found
No related merge requests found
......@@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
char options[256];
snprintf(options, sizeof(options),
"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d",
hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d",
hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk));
ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
if(ret != CL_SUCCESS)
{
......@@ -696,6 +696,13 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
for(int i = 0; i < num_gpus; ++i)
{
if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
{
size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
ctx[i].rawIntensity = reduced_intensity;
printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity));
}
if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
{
return ret;
......
......@@ -25,6 +25,7 @@ struct GpuContext
size_t rawIntensity;
size_t workSize;
int stridedIndex;
int memChunk;
/*Output vars*/
cl_device_id DeviceID;
......
......@@ -411,12 +411,23 @@ void AESExpandKey256(uint *keybuf)
}
}
#define MEM_CHUNK (1<<4)
#if(STRIDED_INDEX==0)
# define IDX(x) (x)
#else
#elif(STRIDED_INDEX==1)
# define IDX(x) ((x) * (Threads))
#elif(STRIDED_INDEX==2)
# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
#endif
inline ulong getIdx()
{
#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
return get_global_id(0) - get_global_offset(0);
#endif
}
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
{
......@@ -425,7 +436,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
uint4 text;
const ulong gIdx = get_global_id(0) - get_global_offset(0);
const ulong gIdx = getIdx();
for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
i < 256;
......@@ -439,7 +450,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
barrier(CLK_LOCAL_MEM_FENCE);
// do not use early return here
if(gIdx < Threads)
{
......@@ -447,8 +458,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
#elif(STRIDED_INDEX==2)
Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
((ulong8 *)State)[0] = vload8(0, input);
......@@ -509,7 +522,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = get_global_id(0) - get_global_offset(0);
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
......@@ -523,15 +536,17 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
barrier(CLK_LOCAL_MEM_FENCE);
uint4 b_x;
// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
#elif(STRIDED_INDEX==2)
Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
a[0] = states[0] ^ states[4];
......@@ -582,7 +597,7 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
ulong State[25];
uint4 text;
const ulong gIdx = get_global_id(0) - get_global_offset(0);
const ulong gIdx = getIdx();
for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
i < 256;
......@@ -603,8 +618,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
#elif(STRIDED_INDEX==2)
Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
#if defined(__Tahiti__) || defined(__Pitcairn__)
......
......@@ -143,7 +143,7 @@ private:
// set 8 threads per block (this is a good value for the most gpus)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
" \"affine_to_cpu\" : false, \"strided_index\" : true\n"
" \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4\n"
" },\n";
}
else
......
......@@ -6,11 +6,16 @@ R"===(
* worksize - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
* strided_index - switch memory pattern used for the scratch pad memory
* true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
* false = use a contiguous block of memory per thread
* 2 = chunked memory, chunk size is controlled by 'mem_chunk'
* required: intensity must be a multiple of worksize
* 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
* 0 or false = use a contiguous block of memory per thread
* mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
* this value is only used if 'strided_index' == 2
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
* "gpu_threads_conf" :
* [
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4 },
* ],
* If you do not wish to mine with your AMD GPU(s) then use:
* "gpu_threads_conf" :
......
......@@ -106,14 +106,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
stridedIndex = GetObjectMember(oThdConf, "strided_index");
memChunk = GetObjectMember(oThdConf, "mem_chunk");
if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr || memChunk == nullptr)
return false;
if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
......@@ -122,13 +123,34 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!aff->IsUint64() && !aff->IsBool())
return false;
if(!stridedIndex->IsBool())
if(!stridedIndex->IsBool() && !stridedIndex->IsNumber())
{
printer::inst()->print_msg(L0, "ERROR: strided_index must be a bool or a number");
return false;
}
if(stridedIndex->IsBool())
cfg.stridedIndex = stridedIndex->GetBool() ? 1 : 0;
else
cfg.stridedIndex = (int)stridedIndex->GetInt64();
if(cfg.stridedIndex > 2)
{
printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2");
return false;
}
cfg.memChunk = (int)memChunk->GetInt64();
if(!idx->IsUint64() || cfg.memChunk > 18 )
{
printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
return false;
}
cfg.index = idx->GetUint64();
cfg.intensity = intensity->GetUint64();
cfg.w_size = w_size->GetUint64();
cfg.stridedIndex = stridedIndex->GetBool();
cfg.intensity = intensity->GetUint64();
if(aff->IsNumber())
cfg.cpu_aff = aff->GetInt64();
......
......@@ -26,7 +26,8 @@ public:
size_t intensity;
size_t w_size;
long long cpu_aff;
bool stridedIndex;
int stridedIndex;
int memChunk;
};
size_t GetThreadCount();
......
......@@ -97,6 +97,7 @@ bool minethd::init_gpus()
vGpuData[i].rawIntensity = cfg.intensity;
vGpuData[i].workSize = cfg.w_size;
vGpuData[i].stridedIndex = cfg.stridedIndex;
vGpuData[i].memChunk = cfg.memChunk;
}
return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment