diff --git a/doc/tuning.md b/doc/tuning.md
index 474553b4cb2900ca20b899b70059e5cd57eb1fb0..25c50655decf94ac691c7d4bb809b39a96efd0ba 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -8,6 +8,7 @@
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
   * [Increase Memory Pool](#increase-memory-pool)
+  * [Scratchpad Indexing](#scratchpad-indexing)
 
 ## NVIDIA Backend
 
@@ -80,4 +81,9 @@ export GPU_MAX_ALLOC_PERCENT=99
 export GPU_SINGLE_ALLOC_PERCENT=99
 ```
 
-*Note:* Windows user must use `set` instead of `export` to define an environment variable.
\ No newline at end of file
+*Note:* Windows users must use `set` instead of `export` to define an environment variable.
+
+### Scratchpad Indexing
+
+The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`.
+Try changing the value from the default `false` to `true`; depending on the GPU, one indexing scheme can yield a better hash rate.
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 879a2e4caa9f219f6355018f3b4d24a6191a6018..42f63884729eb9ea7185b44b7bbcd63648d00f9a 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	char options[256];
 	snprintf(options, sizeof(options), 
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize));
+		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", 
+		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
 	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
 	if(ret != CL_SUCCESS)
 	{
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index abbd08d2f7edbaeadede8fc81ec806afdb5c80ef..c17bac11b0e998faa4315d6028c279c83c7ca0a1 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -24,6 +24,7 @@ struct GpuContext
 	size_t deviceIdx;
 	size_t rawIntensity;
 	size_t workSize;
+	int stridedIndex;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 966199bc418bd2a65ccbaec1012508b061fca8c2..255fcbbff79becb419a605b808bddfdcb798e2f3 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
-#define IDX(x)	(x)
+#if(STRIDED_INDEX==0)
+#   define IDX(x)	(x)
+#else
+#   define IDX(x)	((x) * (Threads))
+#endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
@@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		((ulong8 *)State)[0] = vload8(0, input);
 		State[8] = input[8];
@@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		a[0] = states[0] ^ states[4];
 		b[0] = states[2] ^ states[6];
@@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)
 
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0b91212cbf5350bc18373772086e3919201d91e8..0abf84e837bbd434cea63f847ab1bd179baa8192 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -123,7 +123,7 @@ private:
 			// set 8 threads per block (this is a good value for the most gpus)
 			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \n"
+				"    \"affine_to_cpu\" : false, \"strided_index\" : false\n"
 				"  },\n";
 			++i;
 		}
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index a93859cdf013a6439c99c858df0110df5a9aed33..f31a678faf281525c443727801dc009b978a370e 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -5,9 +5,12 @@ R"===(
  *  intensity    - Number of parallel GPU threads (nothing to do with CPU threads)
  *   worksize    - Number of local GPU threads (nothing to do with CPU threads)
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ * strided_index - switch the memory pattern used for the scratchpad memory
+ *                 false = each thread uses one contiguous block of memory
+ *                 true  = each thread uses contiguous 16-byte chunks; a thread's consecutive chunks are strided apart by `intensity` chunks
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : false },
  * ],
  */
 
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0617aeb2fe3c78e4164d3a632173a6f110ca6f97..07afb1964d2a943cc1b2c0e8148ff5aa578b2a9a 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	stridedIndex = GetObjectMember(oThdConf, "strided_index");
 
-	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr)
+	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!aff->IsUint64() && !aff->IsBool())
 		return false;
 
+	if(!stridedIndex->IsBool())
+		return false;
+
 	cfg.index = idx->GetUint64();
 	cfg.intensity = intensity->GetUint64();
 	cfg.w_size = w_size->GetUint64();
+	cfg.stridedIndex = stridedIndex->GetBool();
 
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index da024a412246c17a4df8041b081c7fe03fa7776a..ee1882aad754ba32840d959f1c44f255f1af0380 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,6 +26,7 @@ public:
 		size_t intensity;
 		size_t w_size;
 		long long cpu_aff;
+		bool stridedIndex;
 	};
 
 	size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index c1399e0b8adef5220177168888857f3f428b3e48..103688f8a1f7327e62921c8a9b8568123a0552f5 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -96,6 +96,7 @@ bool minethd::init_gpus()
 		vGpuData[i].deviceIdx = cfg.index;
 		vGpuData[i].rawIntensity = cfg.intensity;
 		vGpuData[i].workSize = cfg.w_size;
+		vGpuData[i].stridedIndex = cfg.stridedIndex;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;