diff --git a/doc/tuning.md b/doc/tuning.md index 474553b4cb2900ca20b899b70059e5cd57eb1fb0..25c50655decf94ac691c7d4bb809b39a96efd0ba 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -8,6 +8,7 @@ * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) * [Add more GPUs](#add-more-gpus) * [Increase Memory Pool](#increase-memory-pool) + * [Scratchpad Indexing](#scratchpad-indexing) ## NVIDIA Backend @@ -80,4 +81,9 @@ export GPU_MAX_ALLOC_PERCENT=99 export GPU_SINGLE_ALLOC_PERCENT=99 ``` -*Note:* Windows user must use `set` instead of `export` to define an environment variable. \ No newline at end of file +*Note:* Windows user must use `set` instead of `export` to define an environment variable. + +### Scratchpad Indexing + +The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`. +Try to change the value from the default `false` to `true`. diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 879a2e4caa9f219f6355018f3b4d24a6191a6018..42f63884729eb9ea7185b44b7bbcd63648d00f9a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[256]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize)); + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", + hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0); ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); if(ret != CL_SUCCESS) { diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index abbd08d2f7edbaeadede8fc81ec806afdb5c80ef..c17bac11b0e998faa4315d6028c279c83c7ca0a1 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -24,6 +24,7 @@ struct GpuContext size_t deviceIdx; size_t rawIntensity; size_t workSize; + int stridedIndex; /*Output vars*/ cl_device_id DeviceID; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 966199bc418bd2a65ccbaec1012508b061fca8c2..255fcbbff79becb419a605b808bddfdcb798e2f3 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf) } } -#define IDX(x) (x) +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#else +# define IDX(x) ((x) * (Threads)) +#endif __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) @@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul if(gIdx < Threads) { states += 25 * gIdx; + +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif ((ulong8 *)State)[0] = vload8(0, input); State[8] = input[8]; @@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre if(gIdx < Threads) { states += 25 * gIdx; +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif a[0] = states[0] ^ states[4]; b[0] = states[2] ^ states[6]; @@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u if(gIdx < Threads) { states += 25 * gIdx; +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif #if defined(__Tahiti__) || defined(__Pitcairn__) diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 0b91212cbf5350bc18373772086e3919201d91e8..0abf84e837bbd434cea63f847ab1bd179baa8192 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -123,7 +123,7 @@ private: // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \n" + " \"affine_to_cpu\" : false, \"strided_index\" : false\n" " },\n"; ++i; } diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index a93859cdf013a6439c99c858df0110df5a9aed33..f31a678faf281525c443727801dc009b978a370e 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -5,9 +5,12 @@ R"===( * intensity - Number of parallel GPU threads (nothing to do with CPU threads) * worksize - Number of local GPU threads (nothing to do with CPU threads) * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * strided_index - switch memory pattern used for the scratch pad memory + * false = use a contiguous block of memory per thread + * true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : false }, * ], */ diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 0617aeb2fe3c78e4164d3a632173a6f110ca6f97..07afb1964d2a943cc1b2c0e8148ff5aa578b2a9a 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + stridedIndex = GetObjectMember(oThdConf, "strided_index"); - if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr) + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!aff->IsUint64() && !aff->IsBool()) return false; + if(!stridedIndex->IsBool()) + return false; + cfg.index = idx->GetUint64(); cfg.intensity = intensity->GetUint64(); cfg.w_size = w_size->GetUint64(); + cfg.stridedIndex = stridedIndex->GetBool(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt64(); diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index da024a412246c17a4df8041b081c7fe03fa7776a..ee1882aad754ba32840d959f1c44f255f1af0380 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -26,6 +26,7 @@ public: size_t intensity; size_t w_size; long long cpu_aff; + bool stridedIndex; }; size_t GetThreadCount(); diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index c1399e0b8adef5220177168888857f3f428b3e48..103688f8a1f7327e62921c8a9b8568123a0552f5 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -96,6 +96,7 @@ bool minethd::init_gpus() vGpuData[i].deviceIdx = cfg.index; vGpuData[i].rawIntensity = cfg.intensity; vGpuData[i].workSize = cfg.w_size; + vGpuData[i].stridedIndex = cfg.stridedIndex; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;