From 69f550cb72c4fe22aa6ea6ca8f477559e1899a14 Mon Sep 17 00:00:00 2001 From: psychocrypt <psychocryptHPC@gmail.com> Date: Mon, 10 Sep 2018 08:05:59 +0200 Subject: [PATCH] CPU: cryptonight_v8 Add support for single hash cryptonight_v8. Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com> --- .../backend/cpu/crypto/cryptonight_aesni.h | 135 ++++++++++++++---- xmrstak/backend/cpu/minethd.cpp | 3 + 2 files changed, 114 insertions(+), 24 deletions(-) diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 89c5089..2734760 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -19,6 +19,7 @@ #include "xmrstak/backend/cryptonight.hpp" #include <memory.h> #include <stdio.h> +#include <cfenv> #ifdef __GNUC__ #include <x86intrin.h> @@ -422,6 +423,27 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) +{ + __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); + x = _mm_sqrt_sd(_mm_setzero_pd(), x); + uint64_t r = static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + +#if defined _MSC_VER || (__GNUC__ >= 7) + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif + return _mm_cvtsi64_si128(r); +} + inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) { alignas(16) uint32_t k[4]; @@ -467,6 +489,51 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } +inline void set_float_rounding_mode() +{ +#ifdef _MSC_VER + _control87(RC_DOWN, MCW_RC); +#else + std::fesetround(FE_DOWNWARD); +#endif +} + +#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } + +#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm)); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \ + const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \ + /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ + sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \ + } + #define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ { \ @@ -474,7 +541,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -489,16 +556,27 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) uint64_t idx0; \ __m128i bx0; \ uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + __m128i sqrt_result_xmm; \ + /* END cryptonight_monero_v8 variables */ \ { \ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ idx0 = h0[0] ^ h0[4]; \ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \ + set_float_rounding_mode(); \ + } \ } \ __m128i *ptr0 - -#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ __m128i cx; \ ptr0 = (__m128i *)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ @@ -512,7 +590,8 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) cx = soft_aesenc(cx, ax0); \ else \ cx = _mm_aesenc_si128(cx, ax0); \ - } + } \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -524,15 +603,22 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - bx0 = cx; \ + if(ALGO != cryptonight_monero_v8) \ + bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ - \ + CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ @@ -542,7 +628,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ((uint64_t*)ptr0)[0] = al0; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - #define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -622,6 +707,9 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) #define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n #define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n #define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n /** repeat a macro call multiple times * @@ -657,15 +745,14 @@ struct Cryptonight_hash<1> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - - REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -687,14 +774,14 @@ struct Cryptonight_hash<2> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -716,14 +803,14 @@ struct Cryptonight_hash<3> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -745,14 +832,14 @@ struct Cryptonight_hash<4> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -774,14 +861,14 @@ struct Cryptonight_hash<5> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index e11c820..87f4d32 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -489,6 +489,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_bittube2: algv = 9; break; + case cryptonight_monero_v8: + algv = 10; + break; default: algv = 2; break; -- GitLab