diff --git a/doc/compile.md b/doc/compile.md index 4987260bc6be46db82e2974c562b864e91d79a8b..08801d8a06716d2ad385b7c21264f5b2a410c95d 100644 --- a/doc/compile.md +++ b/doc/compile.md @@ -85,3 +85,44 @@ After the configuration you need to compile the miner, follow the guide for your - `XMR-STAK_THREADS` give the compiler information which value for `threads` is used at runtime - default is `0` (compile time optimization) - if the miner is compiled and used at runtime with the some value it can increase the hash rate: `cmake .. -DXMR-STAK_THREADS=32` + +#### CUDA Runtime versus CUDA SDK +nVidia packages the CUDA **runtime** with the GPU drivers, and the CUDA **SDK** should match. +While it is possible to compile with old SDK and then run on newer runtime/driver, in most cases it does not work well. + +SDK usually bundles a driver that supports the particular CUDA version, but it is always best to get (usually newer) +drivers from the official site. + +For Example: Built with 8.0 SDK running on a 9.2 driver crashes randomly on some GPUs, however worked fine on most 9.1 +drivers. Backward compatibility "should" work, but in reality there are many cases where it does not (YMMV) + +**NOTE**: The inverse case, installing CUDA 10.0 SDK on a system with older driver +does not magically add CUDA 10.0 support to the old driver. You must build with +CUDA SDK to match that driver runtime (check driver release notes PDF under 'supported technologies' list within the +first several pages) - *OR* - upgrade the driver to minimum `411.63` to have the CUDA 10.0 runtime +(unless, Fermi... they can't use CUDA 9.x or 10.0, even though newer drivers still run their *graphics* parts) + +Other gotchas based on GPU family: +* Anything less than Fermi will never work +* Fermi (arch 2x) was removed after CUDA 8.0 +* Volta (arch 7x) was added in CUDA 9.0 +* Turing (arch 75) was added in CUDA 10.0 + +Here is a rough table of driver revisions and CUDA runtime contained: + +| CUDA | Driver min | Driver max | notes +| ----:| ----------:| ----------:| ----- +| 10.0 | 411.63 | (current) | +| 9.2 | 397.93 | 399.24 | +| 9.1 | 388.71 | 397.64 | +| 9.0 | 387.92 | 388.59 | Fermi removed (must use CUDA == 8.0) +| 8.0 | 372.70 | 386.28 | except 372.95 has CUDA7.5 +| 7.5 | | | *Don't bother, won't compile anymore* + +nVidia generally uses the same version numbering on all OS, the above was however based +on Windows Driver Release Notes +nVidia always puts the runtime-included CUDA version in the release notes PDF for whatever driver, doesn't hurt to +double check your specific one. + +For better navigation of CUDA version matching, xmr-stak will display both version numbers during CUDA detection phases +such as `[9.2/10.0]` which is the compiled (SDK) version and the current (driver) runtime version. \ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 6b0d13d794e28b85a6e3af19064867fe864fed90..30f97ac5102643ae2e335894c2964783725d1a1d 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -903,6 +903,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) const char *fastIntMathV2CL = #include "./opencl/fast_int_math_v2.cl" ; + const char *fastDivHeavyCL = + #include "./opencl/fast_div_heavy.cl" + ; const char *cryptonightCL = #include "./opencl/cryptonight.cl" ; @@ -924,6 +927,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) std::string source_code(cryptonightCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 9c9bcd08e91fee9d3ac3c24fd86db2b84513c41a..be5b21107e9c239423271b1dd9c8e23f9afbcaeb 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -80,6 +80,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) //#include "opencl/fast_int_math_v2.cl" XMRSTAK_INCLUDE_FAST_INT_MATH_V2 +//#include "fast_div_heavy.cl" +XMRSTAK_INCLUDE_FAST_DIV_HEAVY //#include "opencl/wolf-aes.cl" XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" @@ -802,14 +804,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if (ALGO == 4 || ALGO == 10) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; - long q = n / (d | 0x5); + long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; idx0 = (d ^ q) & MASK; // cryptonight_haven #elif (ALGO == 9) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; - long q = n / (d | 0x5); + long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; idx0 = ((~d) ^ q) & MASK; #endif diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl new file mode 100644 index 0000000000000000000000000000000000000000..1d078b893a5429d36d9d99909fc907a98508a810 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl @@ -0,0 +1,53 @@ +R"===( +#ifndef FAST_DIV_HEAVY_CL +#define FAST_DIV_HEAVY_CL + +inline ulong get_reciprocal_heavy(uint a) +{ + const uint shift = clz(a); + a <<= shift; + + const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23)); + const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256); + + const float r = native_recip(a_hi); + + const uint tmp0 = as_uint(r); + const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23); + const float r_scaled = as_float(tmp1); + + const float h = fma(a_lo, r, fma(a_hi, r, -1.0f)); + + const float r_scaled_hi = as_float(tmp1 & ~4095U); + const float h_hi = as_float(as_uint(h) & ~4095U); + + const float r_scaled_lo = r_scaled - r_scaled_hi; + const float h_lo = h - h_hi; + + const float x1 = h_hi * r_scaled_hi; + const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo; + + const long h1 = convert_long_rte(x1); + const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2)); + + const ulong result = tmp0 & 0xFFFFFF; + return (result << (shift + 9)) - ((h1 + h2) >> 2); +} + +inline long fast_div_heavy(long _a, int _b) +{ + const ulong a = abs(_a); + const uint b = abs(_b); + ulong q = mul_hi(a, get_reciprocal_heavy(b)); + + const long tmp = a - q * b; + const int overshoot = (tmp < 0) ? 1 : 0; + const int undershoot = (tmp >= b) ? 1 : 0; + q += undershoot - overshoot; + + return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; +} + +#endif +)===" + \ No newline at end of file diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 4c2bf49767e816a182d9e702190f00ac0682c18b..d489bff82fb6519b1e561ef2503fe830cdded040 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.5.2" +#define XMR_STAK_VERSION "2.6.0" #if defined(_WIN32) #define OS_TYPE "win"