diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 286bc39b6f841cc4c1b863442cdb9a4acff5904b..e65f0ed05d09dd70c5138f8c13149e9d7bddc976 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -748,19 +748,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
  			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
 #endif
+			ulong2 result_mul;
+			result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
+			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
 // cryptonight_monero_v8
 #if(ALGO==11)
 			{
-				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
 				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				result_mul ^= chunk2;
 				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
 				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			}
 #endif
-			a[1] += c[0] * as_ulong2(tmp).s0;
-			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+			a[1] += result_mul.s1;
+			a[0] += result_mul.s0;
 
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
index 21f1f48c34ecde27a014c57a8ab3e8061f013d11..bc4a82f8668e6a5811807d835c84cb19342fc2f4 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret:
 	mov	 r9, r10
 	mov	 rax, rdi
 	mul	 rbp
+        movq xmm0, rax
+ 	movq xmm1, rdx
+ 	punpcklqdq xmm1, xmm0
 
 	xor	 r9, 16
 	mov	 rcx, r10
 	xor	 rcx, 32
 	xor	 r10, 48
-	add	 r8, rdx
-	add	 r11, rax
-	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+        pxor xmm2, xmm1
+ 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm0, xmm5
 	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+        xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
 	paddq	 xmm2, xmm4
 	paddq	 xmm1, xmm7
 	movdqa	 xmm5, xmm4
@@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret:
 	movdqa	 xmm4, xmm6
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+        add	 r8, rdx
+ 	add	 r11, rax
 	mov	 QWORD PTR [r14], r8
 	xor	 r8, rdi
 	mov	 r10, r8
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
index ee7f3171633834ec1684a0908586da8381abbbb7..3687d999b9fedd0f6b137cc5a48d8d4f70d29a93 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
@@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret:
 	mov	 r9, r10
 	mov	 rax, rdi
 	mul	 rbp
+        movq xmm0, rax
+ 	movq xmm1, rdx
+ 	punpcklqdq xmm1, xmm0
 
 	xor	 r9, 16
 	mov	 rcx, r10
 	xor	 rcx, 32
 	xor	 r10, 48
-	add	 r8, rdx
-	add	 r11, rax
-	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+        pxor xmm2, xmm1
+ 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm0, xmm5
 	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+        xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
 	paddq	 xmm2, xmm4
 	paddq	 xmm1, xmm7
 	movdqa	 xmm5, xmm4
@@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret:
 	movdqa	 xmm4, xmm6
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+        add	 r8, rdx
+ 	add	 r11, rax
 	mov	 QWORD PTR [r14], r8
 	xor	 r8, rdi
 	mov	 r10, r8
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
index 9c177b85aeef77cc50a356b187b536dcbae4f27c..a375a661fbc811f81980f5253dc91d5151157cc1 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -109,14 +109,20 @@ main_loop_ryzen:
 sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
+        movq xmm1, rax
+ 	movq xmm0, rdx
+ 	punpcklqdq xmm0, xmm1
 
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
-	movdqa	xmm0, XMMWORD PTR [r10+rbx]
-	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
+ 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+ 	pxor xmm2, xmm0
+ 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
 	paddq	xmm0, xmm4
 	paddq	xmm2, xmm3
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
index f70dccef80732cd97bf0e31967f0db50a015ed78..a55004e426da66d105ae636e5ee2535b6aa70b92 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
@@ -109,14 +109,20 @@ main_loop_ryzen:
 sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
+        movq xmm1, rax
+ 	movq xmm0, rdx
+ 	punpcklqdq xmm0, xmm1
 
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
-	movdqa	xmm0, XMMWORD PTR [r10+rbx]
-	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
+ 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+ 	pxor xmm2, xmm0
+ 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
 	paddq	xmm0, xmm4
 	paddq	xmm2, xmm3
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 6edae905ee12a1f8060ba08108a15dd06a036def..c0f122fd6f94bc658a746e85af170c4835a54837 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -543,7 +543,7 @@ inline void set_float_rounding_mode()
 #endif
 }
 
-#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
 	if(ALGO == cryptonight_monero_v8) \
 	{ \
@@ -556,6 +556,21 @@ inline void set_float_rounding_mode()
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
 	}
 
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		hi ^= ((uint64_t*)&chunk2)[0]; \
+		lo ^= ((uint64_t*)&chunk2)[1]; \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
+
 #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
 	if(ALGO == cryptonight_monero_v8) \
 	{ \
@@ -637,7 +652,7 @@ inline void set_float_rounding_mode()
 		else \
 			cx = _mm_aesenc_si128(cx, ax0); \
 	} \
-	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1)
+	CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1)
 
 #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -659,18 +674,18 @@ inline void set_float_rounding_mode()
 	cl = ((uint64_t*)ptr0)[0]; \
 	ch = ((uint64_t*)ptr0)[1]; \
 	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
-	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \
-	if(ALGO == cryptonight_monero_v8) \
-	{ \
-		bx1 = bx0; \
-		bx0 = cx; \
-	} \
 	{ \
 		uint64_t hi; \
 		lo = _umul128(idx0, cl, &hi); \
+		CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
 		ah0 += lo; \
 		al0 += hi; \
 	} \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		bx1 = bx0; \
+		bx0 = cx; \
+	} \
 	((uint64_t*)ptr0)[0] = al0; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 05743ae922134649e7ffec347ef88fb71010e7fa..a344a9ffe74a17f584313ae7e28940c0d6567359 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -310,11 +310,11 @@ bool minethd::self_test()
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
 			hashf("This is a test This is a test This is a test", 44, out, ctx);
-			bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+			bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
 			hashf("This is a test This is a test This is a test", 44, out, ctx);
-			bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+			bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
 		}
 		else if(algo == cryptonight_aeon)
 		{