diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 286bc39b6f841cc4c1b863442cdb9a4acff5904b..e65f0ed05d09dd70c5138f8c13149e9d7bddc976 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -748,19 +748,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states // Use division_result as an input for the square root to prevent parallel implementation in hardware sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); #endif + ulong2 result_mul; + result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0); + result_mul.s1 = c[0] * as_ulong2(tmp).s0; // cryptonight_monero_v8 #if(ALGO==11) { - ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + result_mul ^= chunk2; ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); } #endif - a[1] += c[0] * as_ulong2(tmp).s0; - a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + a[1] += result_mul.s1; + a[0] += result_mul.s0; // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc index 21f1f48c34ecde27a014c57a8ab3e8061f013d11..bc4a82f8668e6a5811807d835c84cb19342fc2f4 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc @@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret: mov r9, r10 mov rax, rdi mul rbp + movq xmm0, rax + movq xmm1, rdx + punpcklqdq xmm1, xmm0 xor r9, 16 mov rcx, r10 xor rcx, 32 xor r10, 48 - add r8, rdx - add r11, rax - movdqu xmm0, XMMWORD PTR [r10+rbx] movdqu xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm1 + movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm0, xmm5 movdqu xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] paddq xmm2, xmm4 paddq xmm1, xmm7 movdqa xmm5, xmm4 @@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret: movdqa xmm4, xmm6 movdqu XMMWORD PTR [rcx+rbx], xmm2 movdqu XMMWORD PTR [r10+rbx], xmm1 + add r8, rdx + add r11, rax mov QWORD PTR [r14], r8 xor r8, rdi mov r10, r8 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc index ee7f3171633834ec1684a0908586da8381abbbb7..3687d999b9fedd0f6b137cc5a48d8d4f70d29a93 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc @@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret: mov r9, r10 mov rax, rdi mul rbp + movq xmm0, rax + movq xmm1, rdx + punpcklqdq xmm1, xmm0 xor r9, 16 mov rcx, r10 xor rcx, 32 xor r10, 48 - add r8, rdx - add r11, rax - movdqu xmm0, XMMWORD PTR [r10+rbx] movdqu xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm1 + movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm0, xmm5 movdqu xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] paddq xmm2, xmm4 paddq xmm1, xmm7 movdqa xmm5, xmm4 @@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret: movdqa xmm4, xmm6 movdqu XMMWORD PTR [rcx+rbx], xmm2 movdqu XMMWORD PTR [r10+rbx], xmm1 + add r8, rdx + add r11, rax mov QWORD PTR [r14], r8 xor r8, rdi mov r10, r8 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc index 9c177b85aeef77cc50a356b187b536dcbae4f27c..a375a661fbc811f81980f5253dc91d5151157cc1 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc @@ -109,14 +109,20 @@ main_loop_ryzen: sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 mov r9d, r10d mov ecx, r10d xor r9d, 16 xor ecx, 32 xor r10d, 48 - movdqa xmm0, XMMWORD PTR [r10+rbx] - movdqa xmm2, XMMWORD PTR [r9+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + movdqa xmm0, XMMWORD PTR [r10+rbx] movdqa xmm1, XMMWORD PTR [rcx+rbx] paddq xmm0, xmm4 paddq xmm2, xmm3 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc index f70dccef80732cd97bf0e31967f0db50a015ed78..a55004e426da66d105ae636e5ee2535b6aa70b92 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc @@ -109,14 +109,20 @@ main_loop_ryzen: sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 mov r9d, r10d mov ecx, r10d xor r9d, 16 xor ecx, 32 xor r10d, 48 - movdqa xmm0, XMMWORD PTR [r10+rbx] - movdqa xmm2, XMMWORD PTR [r9+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + movdqa xmm0, XMMWORD PTR [r10+rbx] movdqa xmm1, XMMWORD PTR [rcx+rbx] paddq xmm0, xmm4 paddq xmm2, xmm3 diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 6edae905ee12a1f8060ba08108a15dd06a036def..c0f122fd6f94bc658a746e85af170c4835a54837 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -543,7 +543,7 @@ inline void set_float_rounding_mode() #endif } -#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \ +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ if(ALGO == cryptonight_monero_v8) \ { \ @@ -556,6 +556,21 @@ inline void set_float_rounding_mode() _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } +#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } + #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ if(ALGO == cryptonight_monero_v8) \ { \ @@ -637,7 +652,7 @@ inline void set_float_rounding_mode() else \ cx = _mm_aesenc_si128(cx, ax0); \ } \ - CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -659,18 +674,18 @@ inline void set_float_rounding_mode() cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ - CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ - if(ALGO == cryptonight_monero_v8) \ - { \ - bx1 = bx0; \ - bx0 = cx; \ - } \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ ah0 += lo; \ al0 += hi; \ } \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ ((uint64_t*)ptr0)[0] = al0; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0) diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 05743ae922134649e7ffec347ef88fb71010e7fa..a344a9ffe74a17f584313ae7e28940c0d6567359 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -310,11 +310,11 @@ bool minethd::self_test() { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; } else if(algo == cryptonight_aeon) {