diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 3e6279288714c0b6eb6445d7a30f2129e3d2b642..1273f89e919f8f0fd602b650ab185443475b8dba 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -252,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
-	uint32_t a, a1, d[2], idx0;
+	uint32_t a, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
 	uint32_t tweak1_2[2];
@@ -296,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		for ( int x = 0; x < 2; ++x )
 		{
 			j = ( ( idx0 & MASK ) >> 2 ) + sub;
 
 			if(ALGO == cryptonight_bittube2)
 			{
 				uint32_t k[4];
@@ -327,50 +327,69 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					}
 				}
 			}
+			else if(ALGO == cryptonight_monero_v8)
+			{
+
+				const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
+				uint4 chunk0{};
+				chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0);
+				chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0);
+				chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0);
+				chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0);
+
+				const uint32_t x_0 = ((uint32_t*)&chunk0)[sub];
+				const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4];
+				const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4];
+				const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4];
+				d[x] = a ^
+					t_fn0( x_0 & 0xff ) ^
+					t_fn1( (x_1 >> 8) & 0xff ) ^
+					t_fn2( (x_2 >> 16) & 0xff ) ^
+					t_fn3( ( x_3 >> 24 ) );
+
+				uint4 value;
+				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
+				if(sub == 1)
+					((uint64_t*)&value)[0] = tmp10;
+				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
+				if(sub == 1)
+					((uint64_t*)&value)[1] = tmp20;
+				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
+				if(sub == 2)
+					((uint64_t*)&value)[0] = tmp11;
+				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
+				if(sub == 2)
+					((uint64_t*)&value)[1] = tmp21;
+				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
+				if(sub == 3)
+					((uint64_t*)&value)[0] = tmp12;
+				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
+				if(sub == 3)
+					((uint64_t*)&value)[1] = tmp22;
+
+				if(sub > 0)
+				{
+					uint4 store{};
+					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
+					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
+
+					const int dest = sub + 1;
+					const int dest2 = dest == 4 ? 1 : dest;
+					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
+				}
+			}
 			else
 			{
 				const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
 				const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
 				const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
 				const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
-				if(ALGO == cryptonight_monero_v8)
-				{
-					a1 = a;
-				}
 				d[x] = a ^
 					t_fn0( x_0 & 0xff ) ^
 					t_fn1( (x_1 >> 8) & 0xff ) ^
 					t_fn2( (x_2 >> 16) & 0xff ) ^
 					t_fn3( ( x_3 >> 24 ) );
 			}
-
-			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
-			if(ALGO == cryptonight_monero_v8)
-			{
-				// Shuffle constants here were chosen carefully
-				// to maximize permutation cycle length
-				// and have no 2-byte elements stay in their places
-				const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10)  );
-				const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20)  );
-				const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30)  );
-
-				uint32_t src = sub & 2;
-				const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
-				const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
-				const uint64_t cc3 = bx1_64 + chunk3_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
-
-				const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
-				const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
-				const uint64_t cc1 = bx0_64 + chunk1_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
-
-				const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
-				const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
-				const uint64_t cc2 = ax0_64 + chunk2_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
-				
-			}
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
 			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
 
@@ -443,30 +462,36 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
 			if(ALGO == cryptonight_monero_v8)
 			{
-				// Shuffle constants here were chosen carefully
-				// to maximize permutation cycle length
-				// and have no 2-byte elements stay in their places
-				const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10)  );
-				const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20)  );
-				const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30)  );
-
-				uint32_t src = sub & 2;
-				const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
-				const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
-				const uint64_t cc3 = bx1_64 + chunk3_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
-
-
-
-				const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
-				const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
-				const uint64_t cc1 = bx0_64 + chunk1_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
-
-				const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
-				const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
-				const uint64_t cc2 = ax0_64 + chunk2_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
+				uint4 value;
+				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
+				if(sub == 1)
+					((uint64_t*)&value)[0] = tmp10;
+				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
+				if(sub == 1)
+					((uint64_t*)&value)[1] = tmp20;
+				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
+				if(sub == 2)
+					((uint64_t*)&value)[0] = tmp11;
+				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
+				if(sub == 2)
+					((uint64_t*)&value)[1] = tmp21;
+				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
+				if(sub == 3)
+					((uint64_t*)&value)[0] = tmp12;
+				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
+				if(sub == 3)
+					((uint64_t*)&value)[1] = tmp22;
+				if(sub > 0)
+				{
+					const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
+					uint4 store{};
+					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
+					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
+
+					const int dest = sub + 1;
+					const int dest2 = dest == 4 ? 1 : dest;
+					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
+				}
 			}
 			
 			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);