Skip to content
Snippets Groups Projects
Commit a5797643 authored by psychocrypt's avatar psychocrypt
Browse files

amd simplify kernel for different algorithms

- remove version numbers within the kernel
- create seperate program context for each mining algorithm
- remove kernel `cn1_monero` is now integrated in `cn1`
- remname `cnX` kernel in `cnX + algorithmNumber`
parent 100b0da7
No related branches found
No related tags found
No related merge requests found
This diff is collapsed.
...@@ -35,8 +35,8 @@ struct GpuContext ...@@ -35,8 +35,8 @@ struct GpuContext
cl_mem InputBuffer; cl_mem InputBuffer;
cl_mem OutputBuffer; cl_mem OutputBuffer;
cl_mem ExtraBuffers[6]; cl_mem ExtraBuffers[6];
cl_program Program; cl_program Program[2];
cl_kernel Kernels[8]; cl_kernel Kernels[2][8];
size_t freeMem; size_t freeMem;
int computeUnits; int computeUnits;
std::string name; std::string name;
...@@ -50,7 +50,7 @@ int getAMDPlatformIdx(); ...@@ -50,7 +50,7 @@ int getAMDPlatformIdx();
std::vector<GpuContext> getAMDDevices(int index); std::vector<GpuContext> getAMDDevices(int index);
size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version); size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version); size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
...@@ -433,15 +433,13 @@ inline ulong getIdx() ...@@ -433,15 +433,13 @@ inline ulong getIdx()
#endif #endif
} }
#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
#define JOIN_DO(x,y) x##y
#define JOIN(x,y) JOIN_DO(x,y)
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
// cryptonight_heavy
#if (ALGO == 4)
, uint version
#endif
)
{ {
ulong State[25]; ulong State[25];
uint ExpandedKey1[40]; uint ExpandedKey1[40];
...@@ -517,23 +515,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul ...@@ -517,23 +515,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
// cryptonight_heavy // cryptonight_heavy
#if (ALGO == 4) #if (ALGO == 4)
if(version >= 3) __local uint4 xin[8][WORKSIZE];
{
__local uint4 xin[8][WORKSIZE];
/* Also left over threads performe this loop. /* Also left over threads performe this loop.
* The left over thread results will be ignored * The left over thread results will be ignored
*/ */
for(size_t i=0; i < 16; i++) for(size_t i=0; i < 16; i++)
{ {
#pragma unroll #pragma unroll
for(int j = 0; j < 10; ++j) for(int j = 0; j < 10; ++j)
text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
xin[get_local_id(1)][get_local_id(0)] = text; xin[get_local_id(1)][get_local_id(0)] = text;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
text = mix_and_propagate(xin); text = mix_and_propagate(xin);
}
} }
#endif #endif
...@@ -542,13 +537,9 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul ...@@ -542,13 +537,9 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
if(gIdx < Threads) if(gIdx < Threads)
#endif #endif
{ {
int iterations = MEMORY >> 7;
#if (ALGO == 4)
if(version < 3)
iterations >>= 1;
#endif
#pragma unroll 2 #pragma unroll 2
for(int i = 0; i < iterations; ++i) for(int i = 0; i < (MEMORY >> 7); ++i)
{ {
#pragma unroll #pragma unroll
for(int j = 0; j < 10; ++j) for(int j = 0; j < 10; ++j)
...@@ -560,22 +551,13 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul ...@@ -560,22 +551,13 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
} }
#define VARIANT1_1(p) \
uint table = 0x75310U; \
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2
#define VARIANT1_INIT() \
tweak1_2 = as_uint2(input[4]); \
tweak1_2.s0 >>= 24; \
tweak1_2.s0 |= tweak1_2.s1 << 8; \
tweak1_2.s1 = get_global_id(0); \
tweak1_2 ^= as_uint2(states[24])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
// cryptonight_monero || cryptonight_aeon
#if(ALGO == 3 || ALGO == 5)
, __global ulong *input
#endif
)
{ {
ulong a[2], b[2]; ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256]; __local uint AES0[256], AES1[256], AES2[256], AES3[256];
...@@ -592,8 +574,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo ...@@ -592,8 +574,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#if(ALGO == 3 || ALGO == 5)
uint2 tweak1_2; uint2 tweak1_2;
#endif
uint4 b_x; uint4 b_x;
#if(COMP_MODE==1) #if(COMP_MODE==1)
// do not use early return here // do not use early return here
...@@ -615,7 +598,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo ...@@ -615,7 +598,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
b[1] = states[3] ^ states[7]; b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0]; b_x = ((uint4 *)b)[0];
VARIANT1_INIT(); #if(ALGO == 3 || ALGO == 5)
tweak1_2 = as_uint2(input[4]);
tweak1_2.s0 >>= 24;
tweak1_2.s0 |= tweak1_2.s1 << 8;
tweak1_2.s1 = get_global_id(0);
tweak1_2 ^= as_uint2(states[24]);
#endif
} }
mem_fence(CLK_LOCAL_MEM_FENCE); mem_fence(CLK_LOCAL_MEM_FENCE);
...@@ -625,17 +614,23 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo ...@@ -625,17 +614,23 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
if(gIdx < Threads) if(gIdx < Threads)
#endif #endif
{ {
ulong idx0 = a[0];
#pragma unroll 8 #pragma unroll 8
for(int i = 0; i < ITERATIONS; ++i) for(int i = 0; i < ITERATIONS; ++i)
{ {
ulong c[2]; ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)]; ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0]; b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x); #if(ALGO == 3 || ALGO == 5)
Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x; uint table = 0x75310U;
uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
b_x.s2 ^= ((table >> index) & 0x30U) << 24;
#endif
Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
uint4 tmp; uint4 tmp;
tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
...@@ -643,101 +638,14 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo ...@@ -643,101 +638,14 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
a[1] += c[0] * as_ulong2(tmp).s0; a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0); a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
VARIANT1_2(a[1]);
Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
b_x = ((uint4 *)c)[0];
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads
// cryptonight_heavy
#if (ALGO == 4)
, uint version
#endif
)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}
barrier(CLK_LOCAL_MEM_FENCE);
uint4 b_x;
#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
#endif
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (MEMORY >> 4);
#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
#elif(STRIDED_INDEX==2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
}
mem_fence(CLK_LOCAL_MEM_FENCE);
#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
#endif
{
ulong idx0 = a[0];
ulong mask = MASK;
int iterations = ITERATIONS; #if(ALGO == 3 || ALGO == 5)
#if (ALGO == 4) ((uint2 *)&(a[1]))[0] ^= tweak1_2;
if(version < 3) Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
{ ((uint2 *)&(a[1]))[0] ^= tweak1_2;
iterations <<= 1; #else
mask -= 0x200000; Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
}
#endif #endif
#pragma unroll 8
for(int i = 0; i < iterations; ++i)
{
ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)];
((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
//b_x ^= ((uint4 *)c)[0];
Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0];
uint4 tmp;
tmp = Scratchpad[IDX((c[0] & mask) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0];
((uint4 *)a)[0] ^= tmp; ((uint4 *)a)[0] ^= tmp;
idx0 = a[0]; idx0 = a[0];
...@@ -745,14 +653,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre ...@@ -745,14 +653,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
b_x = ((uint4 *)c)[0]; b_x = ((uint4 *)c)[0];
// cryptonight_heavy // cryptonight_heavy
#if (ALGO == 4) #if (ALGO == 4)
if(version >= 3) long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
{ int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))); long q = n / (d | 0x5);
int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2]; *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
long q = n / (d | 0x5); idx0 = d ^ q;
*((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q;
idx0 = d ^ q;
}
#endif #endif
} }
} }
...@@ -760,12 +665,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre ...@@ -760,12 +665,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
} }
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
// cryptonight_heavy
#if (ALGO == 4)
, uint version
#endif
)
{ {
__local uint AES0[256], AES1[256], AES2[256], AES3[256]; __local uint AES0[256], AES1[256], AES2[256], AES3[256];
uint ExpandedKey2[40]; uint ExpandedKey2[40];
...@@ -827,58 +727,42 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u ...@@ -827,58 +727,42 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
if(gIdx < Threads) if(gIdx < Threads)
#endif #endif
{ {
int iterations = MEMORY >> 7;
#if (ALGO == 4) #if (ALGO == 4)
if(version < 3) #pragma unroll 2
{ for(int i = 0; i < (MEMORY >> 7); ++i)
iterations >>= 1;
#pragma unroll 2
for(int i = 0; i < iterations; ++i)
{
text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
#pragma unroll 10
for(int j = 0; j < 10; ++j)
text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
}
}
else
{ {
#pragma unroll 2 text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
for(int i = 0; i < iterations; ++i)
{
text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
#pragma unroll 10 #pragma unroll 10
for(int j = 0; j < 10; ++j) for(int j = 0; j < 10; ++j)
text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
xin[get_local_id(1)][get_local_id(0)] = text; xin[get_local_id(1)][get_local_id(0)] = text;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
text = mix_and_propagate(xin); text = mix_and_propagate(xin);
} }
#pragma unroll 2 #pragma unroll 2
for(int i = 0; i < iterations; ++i) for(int i = 0; i < (MEMORY >> 7); ++i)
{ {
text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
#pragma unroll 10 #pragma unroll 10
for(int j = 0; j < 10; ++j) for(int j = 0; j < 10; ++j)
text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
xin[get_local_id(1)][get_local_id(0)] = text; xin[get_local_id(1)][get_local_id(0)] = text;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
text = mix_and_propagate(xin); text = mix_and_propagate(xin);
}
} }
#else #else
#pragma unroll 2 #pragma unroll 2
for(int i = 0; i < iterations; ++i) for(int i = 0; i < (MEMORY >> 7); ++i)
{ {
text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
...@@ -891,21 +775,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u ...@@ -891,21 +775,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
// cryptonight_heavy // cryptonight_heavy
#if (ALGO == 4) #if (ALGO == 4)
if(version >= 3) /* Also left over threads perform this loop.
* The left over thread results will be ignored
*/
for(size_t i=0; i < 16; i++)
{ {
/* Also left over threads performe this loop. #pragma unroll
* The left over thread results will be ignored for(int j = 0; j < 10; ++j)
*/ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
for(size_t i=0; i < 16; i++) barrier(CLK_LOCAL_MEM_FENCE);
{ xin[get_local_id(1)][get_local_id(0)] = text;
#pragma unroll barrier(CLK_LOCAL_MEM_FENCE);
for(int j = 0; j < 10; ++j) text = mix_and_propagate(xin);
text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
barrier(CLK_LOCAL_MEM_FENCE);
xin[get_local_id(1)][get_local_id(0)] = text;
barrier(CLK_LOCAL_MEM_FENCE);
text = mix_and_propagate(xin);
}
} }
#endif #endif
......
...@@ -233,7 +233,7 @@ void minethd::work_main() ...@@ -233,7 +233,7 @@ void minethd::work_main()
assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
uint64_t target = oWork.iTarget; uint64_t target = oWork.iTarget;
XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, version); XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
if(oWork.bNiceHash) if(oWork.bNiceHash)
pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39); pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39);
...@@ -249,7 +249,7 @@ void minethd::work_main() ...@@ -249,7 +249,7 @@ void minethd::work_main()
cl_uint results[0x100]; cl_uint results[0x100];
memset(results,0,sizeof(cl_uint)*(0x100)); memset(results,0,sizeof(cl_uint)*(0x100));
XMRRunJob(pGpuCtx, results, miner_algo, version); XMRRunJob(pGpuCtx, results, miner_algo);
for(size_t i = 0; i < results[0xFF]; i++) for(size_t i = 0; i < results[0xFF]; i++)
{ {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment