diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 0ab47e390d8571e50fbf4a19f5b549ebd4c531af..7c409d187727f77e5541e128db27a8439c0e4aab 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -20,6 +20,7 @@ #include <memory.h> #include <stdio.h> #include <cfenv> +#include <utility> #ifdef __GNUC__ #include <x86intrin.h> @@ -423,7 +424,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } -inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) +inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) { __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); x = _mm_sqrt_sd(_mm_setzero_pd(), x); @@ -441,7 +442,7 @@ inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) // Fallback to simpler code if (x2 < n0) ++r; #endif - return _mm_cvtsi64_si128(r); + return r; } inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) @@ -489,6 +490,48 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } +/** optimal type for sqrt + * + * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected. + * + * @tparam N number of hashes per thread + */ +template<size_t N> +struct GetOptimalSqrtType +{ + using type = __m128i; +}; + +template<> +struct GetOptimalSqrtType<1u> +{ + using type = uint64_t; +}; +template<size_t N> +using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type; + +/** assign a value and convert if necessary + * + * @param output output type + * @param input value which is assigned to output + * @{ + */ +inline void assign(__m128i& output, const uint64_t input) +{ + output = _mm_cvtsi64_si128(input); +} + +inline void assign(uint64_t& output, const uint64_t input) +{ + output = input; +} + +inline void assign(uint64_t& output, const __m128i& input) +{ + output = _mm_cvtsi128_si64(input); +} +/** @} */ + inline void set_float_rounding_mode() { #ifdef _MSC_VER @@ -511,14 +554,15 @@ inline void set_float_rounding_mode() _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } -#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \ +#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ if(ALGO == cryptonight_monero_v8) \ { \ - const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm)); \ + uint64_t sqrt_result_tmp; \ + assign(sqrt_result_tmp, sqrt_result); \ /* Use division and square root results from the _previous_ iteration to hide the latency */ \ const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ - cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \ - const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \ + cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ + const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ /* Most and least significant bits in the divisor are set to 1 \ * to make sure we don't divide by a small or even number, \ * so there are no shortcuts for such cases \ @@ -531,7 +575,7 @@ inline void set_float_rounding_mode() const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \ division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \ /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ - sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \ + assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ } #define CN_INIT_SINGLE \ @@ -541,7 +585,7 @@ inline void set_float_rounding_mode() return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \ +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -559,7 +603,7 @@ inline void set_float_rounding_mode() /* BEGIN cryptonight_monero_v8 variables */ \ __m128i bx1; \ __m128i division_result_xmm; \ - __m128i sqrt_result_xmm; \ + GetOptimalSqrtType_t<N> sqrt_result; \ /* END cryptonight_monero_v8 variables */ \ { \ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ @@ -570,7 +614,7 @@ inline void set_float_rounding_mode() { \ bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ - sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \ + assign(sqrt_result, h0[13]); \ set_float_rounding_mode(); \ } \ } \ @@ -606,13 +650,13 @@ inline void set_float_rounding_mode() if(ALGO != cryptonight_monero_v8) \ bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ - CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \ + CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ if(ALGO == cryptonight_monero_v8) \ { \ @@ -745,14 +789,14 @@ struct Cryptonight_hash<1> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -774,14 +818,14 @@ struct Cryptonight_hash<2> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -803,14 +847,14 @@ struct Cryptonight_hash<3> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -832,14 +876,14 @@ struct Cryptonight_hash<4> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -861,14 +905,14 @@ struct Cryptonight_hash<5> constexpr size_t MEM = cn_select_memory<ALGO>(); CN_INIT_SINGLE; - REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); }