diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 7aa44e89eaa1c9615655ee8dd5853d5b253963a4..43740d295650296e085f247144a5ed97fc068e4b 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -352,7 +352,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	int sub = subv << 2;
 
 	const int batchsize = MEMORY >> bfactor;
-	const int start = partidx * batchsize;
+	const int start = (partidx % (1 << bfactor)) * batchsize;
 	const int end = start + batchsize;
 
 	if ( thread >= threads )
@@ -365,15 +365,15 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	__syncthreads( );
 	
 #if( __CUDA_ARCH__ < 300 )
-        extern __shared__ uint32_t shuffleMem[];
-        volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+	extern __shared__ uint32_t shuffleMem[];
+	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8));
 #else
-        volatile uint32_t* sPtr = NULL;
+	volatile uint32_t* sPtr = NULL;
 #endif
 
 	for ( int i = start; i < end; i += 32 )
 	{
-#pragma unroll
+		#pragma unroll
 		for ( int j = 0; j < 4; ++j )
 			text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
 
@@ -381,25 +381,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 		
 		if(ALGO == cryptonight_heavy)
 		{
-#pragma unroll
-			for ( int j = 0; j < 4; ++j )
-				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
-		}
-	}
-
-	if(ALGO == cryptonight_heavy)
-	{
-		__syncthreads( );
-		
-		for ( int i = start; i < end; i += 32 )
-		{
-#pragma unroll
-			for ( int j = 0; j < 4; ++j )
-				text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
-
-			cn_aes_pseudo_round_mut( sharedMemory, text, key );
-
-#pragma unroll
+			#pragma unroll
 			for ( int j = 0; j < 4; ++j )
 				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
 		}
@@ -466,9 +448,21 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
 	}
 
-	for ( int i = 0; i < partcountOneThree; i++ )
+	int roundsPhase3 = partcountOneThree;
+
+	if(ALGO == cryptonight_heavy)
+	{
+		// cryptonight_heavy used two full rounds over the scratchpad memory
+		roundsPhase3 *= 2;
+	}
+
+	for ( int i = 0; i < roundsPhase3; i++ )
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< 
+			grid,
+			block8,
+			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+		>>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
 			ctx->d_ctx_state, ctx->d_ctx_key2 ));
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 04948d1cb7908fb2878cfd8b1e325dfc34e14c09..579c27ac673b8ca98518dd20eeccff717d89f24b 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.4.1"
+#define XMR_STAK_VERSION "2.4.2"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"