Skip to content
Snippets Groups Projects
Commit 522ff6a6 authored by psychocrypt's avatar psychocrypt
Browse files

NVIDIA: optimize shuffle

- use shared memory to exchange
parent d035dbc1
No related branches found
No related tags found
No related merge requests found
...@@ -252,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -252,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
const int start = partidx * batchsize; const int start = partidx * batchsize;
const int end = start + batchsize; const int end = start + batchsize;
uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY]; uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
uint32_t a, a1, d[2], idx0; uint32_t a, d[2], idx0;
uint32_t t1[2], t2[2], res; uint32_t t1[2], t2[2], res;
uint32_t tweak1_2[2]; uint32_t tweak1_2[2];
...@@ -296,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -296,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
for ( int x = 0; x < 2; ++x ) for ( int x = 0; x < 2; ++x )
{ {
j = ( ( idx0 & MASK ) >> 2 ) + sub; j = ( ( idx0 & MASK ) >> 2 ) + sub;
if(ALGO == cryptonight_bittube2) if(ALGO == cryptonight_bittube2)
{ {
uint32_t k[4]; uint32_t k[4];
...@@ -327,50 +327,69 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -327,50 +327,69 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
} }
} }
} }
else if(ALGO == cryptonight_monero_v8)
{
const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
uint4 chunk0{};
chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0);
chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0);
chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0);
chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0);
const uint32_t x_0 = ((uint32_t*)&chunk0)[sub];
const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4];
const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4];
const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4];
d[x] = a ^
t_fn0( x_0 & 0xff ) ^
t_fn1( (x_1 >> 8) & 0xff ) ^
t_fn2( (x_2 >> 16) & 0xff ) ^
t_fn3( ( x_3 >> 24 ) );
uint4 value;
const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
if(sub == 1)
((uint64_t*)&value)[0] = tmp10;
const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
if(sub == 1)
((uint64_t*)&value)[1] = tmp20;
const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
if(sub == 2)
((uint64_t*)&value)[0] = tmp11;
const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
if(sub == 2)
((uint64_t*)&value)[1] = tmp21;
const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
if(sub == 3)
((uint64_t*)&value)[0] = tmp12;
const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
if(sub == 3)
((uint64_t*)&value)[1] = tmp22;
if(sub > 0)
{
uint4 store{};
((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
const int dest = sub + 1;
const int dest2 = dest == 4 ? 1 : dest;
*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
}
}
else else
{ {
const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j ); const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3); const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
if(ALGO == cryptonight_monero_v8)
{
a1 = a;
}
d[x] = a ^ d[x] = a ^
t_fn0( x_0 & 0xff ) ^ t_fn0( x_0 & 0xff ) ^
t_fn1( (x_1 >> 8) & 0xff ) ^ t_fn1( (x_1 >> 8) & 0xff ) ^
t_fn2( (x_2 >> 16) & 0xff ) ^ t_fn2( (x_2 >> 16) & 0xff ) ^
t_fn3( ( x_3 >> 24 ) ); t_fn3( ( x_3 >> 24 ) );
} }
// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
if(ALGO == cryptonight_monero_v8)
{
// Shuffle constants here were chosen carefully
// to maximize permutation cycle length
// and have no 2-byte elements stay in their places
const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) );
const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) );
const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) );
uint32_t src = sub & 2;
const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
const uint64_t cc3 = bx1_64 + chunk3_64;
storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
const uint64_t cc1 = bx0_64 + chunk1_64;
storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
const uint64_t cc2 = ax0_64 + chunk2_64;
storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
}
//XOR_BLOCKS_DST(c, b, &long_state[j]); //XOR_BLOCKS_DST(c, b, &long_state[j]);
t1[0] = shuffle<4>(sPtr,sub, d[x], 0); t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
...@@ -443,30 +462,36 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti ...@@ -443,30 +462,36 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
// Shuffle the other 3x16 byte chunks in the current 64-byte cache line // Shuffle the other 3x16 byte chunks in the current 64-byte cache line
if(ALGO == cryptonight_monero_v8) if(ALGO == cryptonight_monero_v8)
{ {
// Shuffle constants here were chosen carefully uint4 value;
// to maximize permutation cycle length const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
// and have no 2-byte elements stay in their places if(sub == 1)
const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) ); ((uint64_t*)&value)[0] = tmp10;
const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) ); const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) ); if(sub == 1)
((uint64_t*)&value)[1] = tmp20;
uint32_t src = sub & 2; const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1); if(sub == 2)
const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1); ((uint64_t*)&value)[0] = tmp11;
const uint64_t cc3 = bx1_64 + chunk3_64; const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]); if(sub == 2)
((uint64_t*)&value)[1] = tmp21;
const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
if(sub == 3)
const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1); ((uint64_t*)&value)[0] = tmp12;
const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1); const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
const uint64_t cc1 = bx0_64 + chunk1_64; if(sub == 3)
storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]); ((uint64_t*)&value)[1] = tmp22;
if(sub > 0)
const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1); {
const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1); const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
const uint64_t cc2 = ax0_64 + chunk2_64; uint4 store{};
storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]); ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
const int dest = sub + 1;
const int dest2 = dest == 4 ? 1 : dest;
*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
}
} }
t1[1] = shuffle<4>(sPtr,sub, d[x], 1); t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment