diff --git a/doc/compile.md b/doc/compile.md
index 4987260bc6be46db82e2974c562b864e91d79a8b..08801d8a06716d2ad385b7c21264f5b2a410c95d 100644
--- a/doc/compile.md
+++ b/doc/compile.md
@@ -85,3 +85,44 @@ After the configuration you need to compile the miner, follow the guide for your
 - `XMR-STAK_THREADS` give the compiler information which value for `threads` is used at runtime
   - default is `0` (compile time optimization)
   - if the miner is compiled and used at runtime with the some value it can increase the hash rate: `cmake .. -DXMR-STAK_THREADS=32`
+
+#### CUDA Runtime versus CUDA SDK
+nVidia packages the CUDA **runtime** with the GPU drivers, and the CUDA **SDK** should match.
+While it is possible to compile with old SDK and then run on newer runtime/driver, in most cases it does not work well.
+
+SDK usually bundles a driver that supports the particular CUDA version, but it is always best to get (usually newer)
+drivers from the official site.
+
+For Example: Built with 8.0 SDK running on a 9.2 driver crashes randomly on some GPUs, however worked fine on most 9.1
+drivers.  Backward compatibility "should" work, but in reality there are many cases where it does not (YMMV)
+
+**NOTE**: The inverse case, installing CUDA 10.0 SDK on a system with older driver
+does not magically add CUDA 10.0 support to the old driver. You must build with
+CUDA SDK to match that driver runtime (check driver release notes PDF under 'supported technologies' list within the
+first several pages) - *OR* - upgrade the driver to minimum `411.63` to have the CUDA 10.0 runtime
+(unless, Fermi... they can't use CUDA 9.x or 10.0, even though newer drivers still run their *graphics* parts)
+
+Other gotchas based on GPU family:
+* Anything less than Fermi will never work
+* Fermi (arch 2x) was removed after CUDA 8.0
+* Volta (arch 7x) was added in CUDA 9.0
+* Turing (arch 75) was added in CUDA 10.0
+
+Here is a rough table of driver revisions and CUDA runtime contained:
+
+| CUDA | Driver min | Driver max | notes
+| ----:| ----------:| ----------:| -----
+| 10.0 | 411.63     | (current)  |
+|  9.2 | 397.93     | 399.24     |
+|  9.1 | 388.71     | 397.64     |
+|  9.0 | 387.92     | 388.59     | Fermi removed (must use CUDA == 8.0)
+|  8.0 | 372.70     | 386.28     | except 372.95 has CUDA7.5 
+|  7.5 |            |            | *Don't bother, won't compile anymore*
+
+nVidia generally uses the same version numbering on all OS, the above was however based
+on Windows Driver Release Notes
+nVidia always puts the runtime-included CUDA version in the release notes PDF for whatever driver, doesn't hurt to
+double check your specific one.
+
+For better navigation of CUDA version matching, xmr-stak will display both version numbers during CUDA detection phases
+such as `[9.2/10.0]` which is the compiled (SDK) version and the current (driver) runtime version.
\ No newline at end of file
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 6b0d13d794e28b85a6e3af19064867fe864fed90..30f97ac5102643ae2e335894c2964783725d1a1d 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -903,6 +903,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	const char *fastIntMathV2CL =
 			#include "./opencl/fast_int_math_v2.cl"
 	;
+    const char *fastDivHeavyCL =
+        #include "./opencl/fast_div_heavy.cl"
+    ;
 	const char *cryptonightCL =
 			#include "./opencl/cryptonight.cl"
 	;
@@ -924,6 +927,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	std::string source_code(cryptonightCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
+    source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 9c9bcd08e91fee9d3ac3c24fd86db2b84513c41a..be5b21107e9c239423271b1dd9c8e23f9afbcaeb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -80,6 +80,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 
 //#include "opencl/fast_int_math_v2.cl"
 XMRSTAK_INCLUDE_FAST_INT_MATH_V2
+//#include "fast_div_heavy.cl"
+XMRSTAK_INCLUDE_FAST_DIV_HEAVY
 //#include "opencl/wolf-aes.cl"
 XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
@@ -802,14 +804,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if (ALGO == 4 || ALGO == 10)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = n / (d | 0x5);
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = (d ^ q) & MASK;
 // cryptonight_haven
 #elif (ALGO == 9)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = n / (d | 0x5);
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = ((~d) ^ q) & MASK;
 #endif
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1d078b893a5429d36d9d99909fc907a98508a810
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -0,0 +1,53 @@
+R"===(
+#ifndef FAST_DIV_HEAVY_CL
+#define FAST_DIV_HEAVY_CL
+
+inline ulong get_reciprocal_heavy(uint a)
+{
+	const uint shift = clz(a);
+	a <<= shift;
+
+	const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
+	const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);
+
+	const float r = native_recip(a_hi);
+
+	const uint tmp0 = as_uint(r);
+	const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
+	const float r_scaled = as_float(tmp1);
+
+	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
+
+	const float r_scaled_hi = as_float(tmp1 & ~4095U);
+	const float h_hi = as_float(as_uint(h) & ~4095U);
+
+	const float r_scaled_lo = r_scaled - r_scaled_hi;
+	const float h_lo = h - h_hi;
+
+	const float x1 = h_hi * r_scaled_hi;
+	const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;
+
+	const long h1 = convert_long_rte(x1);
+	const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));
+
+	const ulong result = tmp0 & 0xFFFFFF;
+	return (result << (shift + 9)) - ((h1 + h2) >> 2);
+}
+
+inline long fast_div_heavy(long _a, int _b)
+{
+	const ulong a = abs(_a);
+	const uint b = abs(_b);
+	ulong q = mul_hi(a, get_reciprocal_heavy(b));
+
+	const long tmp = a - q * b;
+	const int overshoot = (tmp < 0) ? 1 : 0;
+	const int undershoot = (tmp >= b) ? 1 : 0;
+	q += undershoot - overshoot;
+
+	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
+}
+
+#endif
+)==="
+        
\ No newline at end of file
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 4c2bf49767e816a182d9e702190f00ac0682c18b..d489bff82fb6519b1e561ef2503fe830cdded040 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.5.2"
+#define XMR_STAK_VERSION "2.6.0"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"