Skip to content
Snippets Groups Projects
Commit 02545538 authored by psychocrypt's avatar psychocrypt
Browse files

optimize single hash cryptonight_v8

If single hash is used, the type of the variable that holds the intermediate sqrt value is
changed from `__m128i` to `uint64_t`, as suggested by @SChernykh
parent 0a9a9aba
No related branches found
No related tags found
No related merge requests found
......@@ -20,6 +20,7 @@
#include <memory.h>
#include <stdio.h>
#include <cfenv>
#include <utility>
#ifdef __GNUC__
#include <x86intrin.h>
......@@ -423,7 +424,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7);
}
inline __m128i int_sqrt33_1_double_precision(const uint64_t n0)
inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
{
__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
x = _mm_sqrt_sd(_mm_setzero_pd(), x);
......@@ -441,7 +442,7 @@ inline __m128i int_sqrt33_1_double_precision(const uint64_t n0)
// Fallback to simpler code
if (x2 < n0) ++r;
#endif
return _mm_cvtsi64_si128(r);
return r;
}
inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
......@@ -489,6 +490,48 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
}
/** Storage type for the intermediate square-root value
 *
 * Maps the number of hashes a thread computes to the type used to hold the
 * sqrt result: the generic case keeps the value in an SSE register type.
 *
 * @tparam N number of hashes per thread
 */
template <size_t N>
struct GetOptimalSqrtType
{
	using type = __m128i;
};

/** Single-hash case: the sqrt value is held as a plain 64-bit integer. */
template <>
struct GetOptimalSqrtType<1>
{
	using type = uint64_t;
};

/** Shorthand for `GetOptimalSqrtType<N>::type`. */
template <size_t N>
using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type;
/** Store a value into `output`, converting between scalar and SSE form if the types differ
 *
 * The overload set lets the same call site work no matter which type was
 * selected for the destination variable.
 *
 * @param output destination that receives the value
 * @param input value which is stored in output
 * @{
 */
inline void assign(__m128i& output, const uint64_t input)
{
	// scalar -> vector: the value is placed in the low 64-bit lane
	output = _mm_cvtsi64_si128(static_cast<int64_t>(input));
}

inline void assign(uint64_t& output, const uint64_t input)
{
	// same type on both sides, plain copy
	output = input;
}

inline void assign(uint64_t& output, const __m128i& input)
{
	// vector -> scalar: only the low 64-bit lane is read
	output = static_cast<uint64_t>(_mm_cvtsi128_si64(input));
}
/** @} */
inline void set_float_rounding_mode()
{
#ifdef _MSC_VER
......@@ -511,14 +554,15 @@ inline void set_float_rounding_mode()
_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
}
#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \
#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
if(ALGO == cryptonight_monero_v8) \
{ \
const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm)); \
uint64_t sqrt_result_tmp; \
assign(sqrt_result_tmp, sqrt_result); \
/* Use division and square root results from the _previous_ iteration to hide the latency */ \
const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \
const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \
cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
/* Most and least significant bits in the divisor are set to 1 \
* to make sure we don't divide by a small or even number, \
* so there are no shortcuts for such cases \
......@@ -531,7 +575,7 @@ inline void set_float_rounding_mode()
const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \
assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
}
#define CN_INIT_SINGLE \
......@@ -541,7 +585,7 @@ inline void set_float_rounding_mode()
return; \
}
#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \
#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \
keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
uint64_t monero_const; \
if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
......@@ -559,7 +603,7 @@ inline void set_float_rounding_mode()
/* BEGIN cryptonight_monero_v8 variables */ \
__m128i bx1; \
__m128i division_result_xmm; \
__m128i sqrt_result_xmm; \
GetOptimalSqrtType_t<N> sqrt_result; \
/* END cryptonight_monero_v8 variables */ \
{ \
uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
......@@ -570,7 +614,7 @@ inline void set_float_rounding_mode()
{ \
bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \
assign(sqrt_result, h0[13]); \
set_float_rounding_mode(); \
} \
} \
......@@ -606,13 +650,13 @@ inline void set_float_rounding_mode()
if(ALGO != cryptonight_monero_v8) \
bx0 = cx
#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \
#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
uint64_t lo, cl, ch; \
uint64_t al0 = _mm_cvtsi128_si64(ax0); \
uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
cl = ((uint64_t*)ptr0)[0]; \
ch = ((uint64_t*)ptr0)[1]; \
CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \
CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \
if(ALGO == cryptonight_monero_v8) \
{ \
......@@ -745,14 +789,14 @@ struct Cryptonight_hash<1>
constexpr size_t MEM = cn_select_memory<ALGO>();
CN_INIT_SINGLE;
REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
// Optim - 90% time boundary
for(size_t i = 0; i < ITERATIONS; i++)
{
REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
......@@ -774,14 +818,14 @@ struct Cryptonight_hash<2>
constexpr size_t MEM = cn_select_memory<ALGO>();
CN_INIT_SINGLE;
REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
// Optim - 90% time boundary
for(size_t i = 0; i < ITERATIONS; i++)
{
REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
......@@ -803,14 +847,14 @@ struct Cryptonight_hash<3>
constexpr size_t MEM = cn_select_memory<ALGO>();
CN_INIT_SINGLE;
REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
// Optim - 90% time boundary
for(size_t i = 0; i < ITERATIONS; i++)
{
REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
......@@ -832,14 +876,14 @@ struct Cryptonight_hash<4>
constexpr size_t MEM = cn_select_memory<ALGO>();
CN_INIT_SINGLE;
REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
// Optim - 90% time boundary
for(size_t i = 0; i < ITERATIONS; i++)
{
REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
......@@ -861,14 +905,14 @@ struct Cryptonight_hash<5>
constexpr size_t MEM = cn_select_memory<ALGO>();
CN_INIT_SINGLE;
REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
// Optim - 90% time boundary
for(size_t i = 0; i < ITERATIONS; i++)
{
REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment