From 5608f8df39504e69c2c1aaaa8ff5e60a83b06ee4 Mon Sep 17 00:00:00 2001 From: psychocrypt <psychocryptHPC@gmail.com> Date: Mon, 10 Sep 2018 08:30:36 +0200 Subject: [PATCH] OpenCl: cryptonight_v8 - implement cryptonight_v8 - update auto adjust to fit the special requirements of `cryptonight_v8` - add fast math integer implementation for `sqrt`, `reciprocal` and `division` Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com> --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 27 +++- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 138 ++++++++++++++---- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 136 +++++++++++++++++ xmrstak/backend/amd/autoAdjust.hpp | 20 ++- 4 files changed, 290 insertions(+), 31 deletions(-) create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 8d9b668..bb39c57 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -901,6 +901,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) //char* source_code = LoadTextFile(sSourcePath); + const char *fastIntMathV2CL = + #include "./opencl/fast_int_math_v2.cl" + ; const char *cryptonightCL = #include "./opencl/cryptonight.cl" ; @@ -921,6 +924,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ; std::string source_code(cryptonightCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -930,16 +934,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + for(int i = 0; i < num_gpus; ++i) { + const std::string backendName = xmrstak::params::inst().openCLVendor; if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; - const std::string backendName = xmrstak::params::inst().openCLVendor; printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } + if(useCryptonight_v8) + { + if(ctx[i].stridedIndex == 1) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + } + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 78cd30c..778c8d5 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif +//#include "opencl/fast_int_math_v2.cl" +XMRSTAK_INCLUDE_FAST_INT_MATH_V2 //#include "opencl/wolf-aes.cl" XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" @@ -556,6 +558,8 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_GLOBAL_MEM_FENCE); } + +#define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads @@ -565,9 +569,24 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ) { - ulong a[2], b[2]; + ulong a[2]; + +// cryptonight_monero_v8 +#if(ALGO==11) + ulong b[4]; + uint4 b_x[2]; +#else + ulong b[2]; + uint4 b_x[1]; +#endif __local uint AES0[256], AES1[256], AES2[256], AES3[256]; +// cryptonight_monero_v8 +#if(ALGO==11) + __local uint RCP[256]; + uint2 division_result; + uint sqrt_result; +#endif const ulong gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) @@ -577,6 +596,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES1[i] = rotate(tmp, 8U); AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); +// cryptonight_monero_v8 +#if(ALGO==11) + RCP[i] = RCP_C[i]; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -584,7 +607,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint2 tweak1_2; #endif - uint4 b_x; + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -604,7 +627,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] = states[1] ^ states[5]; b[1] = states[3] ^ states[7]; - b_x = ((uint4 *)b)[0]; + b_x[0] = ((uint4 *)b)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + b_x[1] = ((uint4 *)b)[1]; + division_result = as_uint2(states[12]); + sqrt_result = as_uint2(states[13]).s0; +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) tweak1_2 = as_uint2(input[4]); @@ -622,37 +655,81 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0]; + ulong idx0 = a[0] & MASK; #pragma unroll 8 for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; + ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif - b_x ^= ((uint4 *)c)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint table = 0x75310U; + b_x[0] ^= ((uint4 *)c)[0]; // cryptonight_stellite # if(ALGO == 7) - uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); # else - uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); # endif - b_x.s2 ^= ((table >> index) & 0x30U) << 24; + b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; + SCRATCHPAD_CHUNK(0) = b_x[0]; +// cryptonight_monero_v8 +#elif(ALGO==11) + SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; +#else + b_x[0] ^= ((uint4 *)c)[0]; + SCRATCHPAD_CHUNK(0) = b_x[0]; #endif - Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; - + idx0 = c[0] & MASK; uint4 tmp; - tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; - + tmp = SCRATCHPAD_CHUNK(0); +// cryptonight_monero_v8 +#if(ALGO==11) + // Use division and square root results from the _previous_ iteration to hide the latency + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + division_result = fast_div_v2(RCP, c[1], d); + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); +#endif +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); @@ -663,39 +740,42 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= tweak1_2; # endif #else - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif ((uint4 *)a)[0] ^= tmp; - idx0 = a[0]; - - b_x = ((uint4 *)c)[0]; + idx0 = a[0] & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = d ^ q; -#endif + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = (d ^ q) & MASK; // cryptonight_haven -#if (ALGO == 9) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; +#elif (ALGO == 9) + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = (~d) ^ q; + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = ((~d) ^ q) & MASK; +#endif + +// cryptonight_monero_v8 +#if (ALGO == 11) + b_x[1] = b_x[0]; #endif + b_x[0] = ((uint4 *)c)[0]; } } mem_fence(CLK_GLOBAL_MEM_FENCE); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl new file mode 100644 index 0000000..fe7cea1 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -0,0 +1,136 @@ +R"===( +/* + * @author SChernykh + */ +static const __constant uint RCP_C[256] = +{ + 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu, + 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au, + 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu, + 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u, + 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u, + 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u, + 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u, + 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u, + 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u, + 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u, + 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u, + 0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u, + 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au, + 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u, + 0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u, + 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu, + 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u, + 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u, + 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u, + 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u, + 0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu, + 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u, + 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u, + 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u, + 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u, + 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu, + 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau, + 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u, + 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u, + 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu, + 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u, + 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, +}; + +inline uint get_reciprocal(const __local uchar *RCP, uint a) +{ + const uint index1 = (a & 0x7F000000U) >> 21; + const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; + + const uint r1 = *(const __local uint*)(RCP + index1); + + uint r2_0 = *(const __local uint*)(RCP + index1 + 4); + if (index2 > 0) r2_0 >>= 16; + const int r2 = r2_0 & 0xFFFFU; + + const uint r = r1 - (uint)(mul24(r2, index2) >> 6); + + const ulong lo0 = (ulong)(r) * a; + ulong lo = lo0 + ((ulong)(a) << 32); + + a >>= 1; + const bool b = (a >= lo) || (lo >= lo0); + lo = a - lo; + + const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo; + return as_uint2(k).s1 + (b ? r : 0); +} + +inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) +{ + const uint r = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + + ulong q; + ((uint*)&q)[0] = as_uint2(k).s1;; + ((uint*)&q)[1] = (k < a) ? 1 : 0; + + const long tmp = a - q * b; + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + return (uint2)( + as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), + as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U) + ); +} + +inline void fast_div_full_q(const __local uint *RCP, ulong a, uint b, ulong *q, uint *r) +{ + const uint rcp = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, rcp) + ((ulong)(as_uint2(a).s1) * rcp) + a; + + ((uint*)q)[0] = as_uint2(k).s1; + ((uint*)q)[1] = (k < a) ? 1 : 0; + + long tmp = a - (*q) * b; + + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + if (overshoot) + { + --(*q); + tmp += b; + } + + if (undershoot) + { + ++(*q); + tmp -= b; + } + + *r = tmp; +} + +inline uint fast_sqrt_v2(const ulong n1) +{ + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + + float x1 = native_rsqrt(x); + x = native_sqrt(x); + + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (((long)(x0) * x0) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + b) > 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + + return result; +} +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index d6acec9..4a2ffdb 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -127,6 +127,24 @@ private: minFreeMem = 512u * byteToMiB; } + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // set strided index to default + ctx.stridedIndex = 1; + + // nvidia performance is very bad if the scratchpad is not contiguous + if(ctx.isNVIDIA) + ctx.stridedIndex = 0; + + // use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` + if(useCryptonight_v8) + ctx.stridedIndex = 2; + // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; @@ -153,7 +171,7 @@ private: // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" " \"comp_mode\" : true\n" + " },\n"; } -- GitLab