From f6f4070cf11e9c84e3926a0df23d699706842cb2 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 8 Jul 2018 20:13:02 +0200
Subject: [PATCH] optimize cn-heavy AMD

- explicit loop unrolling based on changes in @imperdin fork
  https://github.com/imperdin/xmr-stak/blob/master/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 7bbc386..ed4bebb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -520,9 +520,10 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	/* Also left over threads perform this loop.
 	 * The left over thread results will be ignored
 	 */
+	#pragma unroll 16
 	for(size_t i=0; i < 16; i++)
 	{
-		#pragma unroll
+		#pragma unroll 10
 		for(int j = 0; j < 10; ++j)
 			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
 		barrier(CLK_LOCAL_MEM_FENCE);
@@ -805,9 +806,10 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	/* Also left over threads perform this loop.
 	 * The left over thread results will be ignored
 	 */
+	#pragma unroll 16
 	for(size_t i=0; i < 16; i++)
 	{
-		#pragma unroll
+		#pragma unroll 10
 		for(int j = 0; j < 10; ++j)
 			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 		barrier(CLK_LOCAL_MEM_FENCE);
-- 
GitLab