Skip to content
Snippets Groups Projects
Commit 9cab66c6 authored by fireice-uk's avatar fireice-uk
Browse files

Software AES support (mostly for GPU hashing)

parent 1da5402d
No related branches found
No related tags found
No related merge requests found
...@@ -25,6 +25,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ...@@ -25,6 +25,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
void cryptonight_free_ctx(cryptonight_ctx* ctx); void cryptonight_free_ctx(cryptonight_ctx* ctx);
void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx); void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx); void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1); void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1);
......
...@@ -42,6 +42,9 @@ extern "C" ...@@ -42,6 +42,9 @@ extern "C"
void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
void keccakf(uint64_t st[25], int rounds); void keccakf(uint64_t st[25], int rounds);
extern void(*const extra_hashes[4])(const void *, size_t, char *); extern void(*const extra_hashes[4])(const void *, size_t, char *);
__m128i soft_aesenc(__m128i in, __m128i key);
__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
} }
// This will shift and xor tmp1 into itself as 4 32-bit vals such as // This will shift and xor tmp1 into itself as 4 32-bit vals such as
...@@ -58,61 +61,66 @@ static inline __m128i sl_xor(__m128i tmp1) ...@@ -58,61 +61,66 @@ static inline __m128i sl_xor(__m128i tmp1)
return tmp1; return tmp1;
} }
// Derives the next pair of round keys in place using the hardware key
// generation assist instruction.
//   xout0 - in: previous even round key, out: next even round key
//   xout2 - in: previous odd round key,  out: next odd round key
//   rcon  - round constant forwarded to the first key-generation assist
// NOTE(review): _mm_aeskeygenassist_si128 takes its round constant as an
// immediate operand; passing `rcon` through presumably relies on this helper
// being inlined with a constant argument - confirm for non-optimised builds.
static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
{
    // see PSHUFD, 0xFF sets all elems to the 4th elem of the assist result
    const __m128i assist_hi = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(*xout2, rcon), 0xFF);
    const __m128i even_key = _mm_xor_si128(sl_xor(*xout0), assist_hi);
    *xout0 = even_key;

    // see PSHUFD, 0xAA sets all elems to the 3rd elem of the assist result
    const __m128i assist_lo = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(even_key, 0x00), 0xAA);
    *xout2 = _mm_xor_si128(sl_xor(*xout2), assist_lo);
}
// Software counterpart of the hardware key-expansion step: derives the next
// pair of round keys in place, calling soft_aeskeygenassist() instead of the
// AES-NI key generation intrinsic.
//   xout0 - in: previous even round key, out: next even round key
//   xout2 - in: previous odd round key,  out: next odd round key
//   rcon  - round constant forwarded to the first key-generation assist
static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
{
    // see PSHUFD, 0xFF sets all elems to the 4th elem of the assist result
    const __m128i assist_hi = _mm_shuffle_epi32(soft_aeskeygenassist(*xout2, rcon), 0xFF);
    const __m128i even_key = _mm_xor_si128(sl_xor(*xout0), assist_hi);
    *xout0 = even_key;

    // see PSHUFD, 0xAA sets all elems to the 3rd elem of the assist result
    const __m128i assist_lo = _mm_shuffle_epi32(soft_aeskeygenassist(even_key, 0x00), 0xAA);
    *xout2 = _mm_xor_si128(sl_xor(*xout2), assist_lo);
}
template<bool SOFT_AES>
static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3,
__m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
{ {
__m128i xout0, xout1, xout2; __m128i xout0, xout2;
xout0 = _mm_load_si128(memory); xout0 = _mm_load_si128(memory);
xout2 = _mm_load_si128(memory+1); xout2 = _mm_load_si128(memory+1);
*k0 = xout0; *k0 = xout0;
*k1 = xout2; *k1 = xout2;
xout1 = _mm_aeskeygenassist_si128(xout2, 0x01); if(SOFT_AES)
xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem soft_aes_genkey_sub(&xout0, &xout2, 0x01);
xout0 = sl_xor(xout0); else
xout0 = _mm_xor_si128(xout0, xout1); aes_genkey_sub(&xout0, &xout2, 0x01);
xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
xout2 = sl_xor(xout2);
xout2 = _mm_xor_si128(xout2, xout1);
*k2 = xout0; *k2 = xout0;
*k3 = xout2; *k3 = xout2;
xout1 = _mm_aeskeygenassist_si128(xout2, 0x02); if(SOFT_AES)
xout1 = _mm_shuffle_epi32(xout1, 0xFF); soft_aes_genkey_sub(&xout0, &xout2, 0x02);
xout0 = sl_xor(xout0); else
xout0 = _mm_xor_si128(xout0, xout1); aes_genkey_sub(&xout0, &xout2, 0x02);
xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
xout1 = _mm_shuffle_epi32(xout1, 0xAA);
xout2 = sl_xor(xout2);
xout2 = _mm_xor_si128(xout2, xout1);
*k4 = xout0; *k4 = xout0;
*k5 = xout2; *k5 = xout2;
xout1 = _mm_aeskeygenassist_si128(xout2, 0x04); if(SOFT_AES)
xout1 = _mm_shuffle_epi32(xout1, 0xFF); soft_aes_genkey_sub(&xout0, &xout2, 0x04);
xout0 = sl_xor(xout0); else
xout0 = _mm_xor_si128(xout0, xout1); aes_genkey_sub(&xout0, &xout2, 0x04);
xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
xout1 = _mm_shuffle_epi32(xout1, 0xAA);
xout2 = sl_xor(xout2);
xout2 = _mm_xor_si128(xout2, xout1);
*k6 = xout0; *k6 = xout0;
*k7 = xout2; *k7 = xout2;
xout1 = _mm_aeskeygenassist_si128(xout2, 0x08); if(SOFT_AES)
xout1 = _mm_shuffle_epi32(xout1, 0xFF); soft_aes_genkey_sub(&xout0, &xout2, 0x08);
xout0 = sl_xor(xout0); else
xout0 = _mm_xor_si128(xout0, xout1); aes_genkey_sub(&xout0, &xout2, 0x08);
xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
xout1 = _mm_shuffle_epi32(xout1, 0xAA);
xout2 = sl_xor(xout2);
xout2 = _mm_xor_si128(xout2, xout1);
*k8 = xout0; *k8 = xout0;
*k9 = xout2; *k9 = xout2;
} }
...@@ -129,14 +137,26 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, ...@@ -129,14 +137,26 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2,
*x7 = _mm_aesenc_si128(*x7, key); *x7 = _mm_aesenc_si128(*x7, key);
} }
template<size_t MEM> static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{
*x0 = soft_aesenc(*x0, key);
*x1 = soft_aesenc(*x1, key);
*x2 = soft_aesenc(*x2, key);
*x3 = soft_aesenc(*x3, key);
*x4 = soft_aesenc(*x4, key);
*x5 = soft_aesenc(*x5, key);
*x6 = soft_aesenc(*x6, key);
*x7 = soft_aesenc(*x7, key);
}
template<size_t MEM, bool SOFT_AES>
void cn_explode_scratchpad(const __m128i* input, __m128i* output) void cn_explode_scratchpad(const __m128i* input, __m128i* output)
{ {
// This is more than we have registers, compiler will assign 2 keys on the stack // This is more than we have registers, compiler will assign 2 keys on the stack
__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xin0 = _mm_load_si128(input + 4); xin0 = _mm_load_si128(input + 4);
xin1 = _mm_load_si128(input + 5); xin1 = _mm_load_si128(input + 5);
...@@ -149,16 +169,32 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) ...@@ -149,16 +169,32 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
{ {
aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); if(SOFT_AES)
aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); {
aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
}
else
{
aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
}
_mm_store_si128(output + i + 0, xin0); _mm_store_si128(output + i + 0, xin0);
_mm_store_si128(output + i + 1, xin1); _mm_store_si128(output + i + 1, xin1);
...@@ -173,14 +209,14 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) ...@@ -173,14 +209,14 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
} }
} }
template<size_t MEM> template<size_t MEM, bool SOFT_AES>
void cn_implode_scratchpad(const __m128i* input, __m128i* output) void cn_implode_scratchpad(const __m128i* input, __m128i* output)
{ {
// This is more than we have registers, compiler will assign 2 keys on the stack // This is more than we have registers, compiler will assign 2 keys on the stack
__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xout0 = _mm_load_si128(output + 4); xout0 = _mm_load_si128(output + 4);
xout1 = _mm_load_si128(output + 5); xout1 = _mm_load_si128(output + 5);
...@@ -204,16 +240,32 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) ...@@ -204,16 +240,32 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); if(SOFT_AES)
aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); {
aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
}
else
{
aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
}
} }
_mm_store_si128(output + 4, xout0); _mm_store_si128(output + 4, xout0);
...@@ -226,13 +278,13 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) ...@@ -226,13 +278,13 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7); _mm_store_si128(output + 11, xout7);
} }
template<size_t ITERATIONS, size_t MEM, bool PREFETCH> template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES>
void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
{ {
keccak((const uint8_t *)input, len, ctx0->hash_state, 200); keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
// Optim - 99% time boundary // Optim - 99% time boundary
cn_explode_scratchpad<MEM>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
uint8_t* l0 = ctx0->long_state; uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state; uint64_t* h0 = (uint64_t*)ctx0->hash_state;
...@@ -248,7 +300,10 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c ...@@ -248,7 +300,10 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
{ {
__m128i cx; __m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); if(SOFT_AES)
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx); idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx; bx0 = cx;
...@@ -271,7 +326,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c ...@@ -271,7 +326,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
} }
// Optim - 90% time boundary // Optim - 90% time boundary
cn_implode_scratchpad<MEM>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
// Optim - 99% time boundary // Optim - 99% time boundary
...@@ -282,15 +337,15 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c ...@@ -282,15 +337,15 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon // This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
template<size_t ITERATIONS, size_t MEM, bool PREFETCH> template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES>
void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
{ {
keccak((const uint8_t *)input, len, ctx0->hash_state, 200); keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200); keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200);
// Optim - 99% time boundary // Optim - 99% time boundary
cn_explode_scratchpad<MEM>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
cn_explode_scratchpad<MEM>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
uint8_t* l0 = ctx0->long_state; uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state; uint64_t* h0 = (uint64_t*)ctx0->hash_state;
...@@ -310,7 +365,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto ...@@ -310,7 +365,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
{ {
__m128i cx; __m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, ax0); if(SOFT_AES)
cx = soft_aesenc(cx, ax0);
else
cx = _mm_aesenc_si128(cx, ax0);
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx); idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx; bx0 = cx;
...@@ -318,7 +376,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto ...@@ -318,7 +376,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
_mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0);
cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]); cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, ax1); if(SOFT_AES)
cx = soft_aesenc(cx, ax1);
else
cx = _mm_aesenc_si128(cx, ax1);
_mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx)); _mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx));
idx1 = _mm_cvtsi128_si64(cx); idx1 = _mm_cvtsi128_si64(cx);
bx1 = cx; bx1 = cx;
...@@ -346,8 +407,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto ...@@ -346,8 +407,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
} }
// Optim - 90% time boundary // Optim - 90% time boundary
cn_implode_scratchpad<MEM>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
cn_implode_scratchpad<MEM>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
// Optim - 99% time boundary // Optim - 99% time boundary
...@@ -355,4 +416,4 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto ...@@ -355,4 +416,4 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
keccakf((uint64_t*)ctx1->hash_state, 24); keccakf((uint64_t*)ctx1->hash_state, 24);
extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32); extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32);
} }
\ No newline at end of file
...@@ -174,15 +174,20 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx) ...@@ -174,15 +174,20 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx)
void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx) void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
{ {
cryptonight_hash<0x80000, MEMORY, true>(input, len, output, ctx); cryptonight_hash<0x80000, MEMORY, true, false>(input, len, output, ctx);
}
void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
{
cryptonight_hash<0x80000, MEMORY, true, true>(input, len, output, ctx);
} }
void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx) void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
{ {
cryptonight_hash<0x80000, MEMORY, false>(input, len, output, ctx); cryptonight_hash<0x80000, MEMORY, false, false>(input, len, output, ctx);
} }
void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
{ {
cryptonight_double_hash<0x80000, MEMORY, false>(input, len, output, ctx0, ctx1); cryptonight_double_hash<0x80000, MEMORY, false, false>(input, len, output, ctx0, ctx1);
} }
\ No newline at end of file
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* The original author of this AES implementation is Karl Malbrain.
*/
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif // __GNUC__
#include <inttypes.h>
// AES forward substitution table: Sbox[x] is the substituted value of byte x.
// Used directly by sub_word() and, for the unscaled terms, by MixSubColumns().
uint8_t Sbox[256] = { // forward s-box
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
// Combined table Xtimes2[Sbox[x]]: the S-box output for byte x multiplied by 2
// in the AES finite field (e.g. Sbox[0] = 0x63 -> 0xc6). Lets MixSubColumns()
// do the substitution and the "times 2" column-mix term in a single lookup.
uint8_t Xtime2Sbox[256] = {
0xc6, 0xf8, 0xee, 0xf6, 0xff, 0xd6, 0xde, 0x91, 0x60, 0x02, 0xce, 0x56, 0xe7, 0xb5, 0x4d, 0xec,
0x8f, 0x1f, 0x89, 0xfa, 0xef, 0xb2, 0x8e, 0xfb, 0x41, 0xb3, 0x5f, 0x45, 0x23, 0x53, 0xe4, 0x9b,
0x75, 0xe1, 0x3d, 0x4c, 0x6c, 0x7e, 0xf5, 0x83, 0x68, 0x51, 0xd1, 0xf9, 0xe2, 0xab, 0x62, 0x2a,
0x08, 0x95, 0x46, 0x9d, 0x30, 0x37, 0x0a, 0x2f, 0x0e, 0x24, 0x1b, 0xdf, 0xcd, 0x4e, 0x7f, 0xea,
0x12, 0x1d, 0x58, 0x34, 0x36, 0xdc, 0xb4, 0x5b, 0xa4, 0x76, 0xb7, 0x7d, 0x52, 0xdd, 0x5e, 0x13,
0xa6, 0xb9, 0x00, 0xc1, 0x40, 0xe3, 0x79, 0xb6, 0xd4, 0x8d, 0x67, 0x72, 0x94, 0x98, 0xb0, 0x85,
0xbb, 0xc5, 0x4f, 0xed, 0x86, 0x9a, 0x66, 0x11, 0x8a, 0xe9, 0x04, 0xfe, 0xa0, 0x78, 0x25, 0x4b,
0xa2, 0x5d, 0x80, 0x05, 0x3f, 0x21, 0x70, 0xf1, 0x63, 0x77, 0xaf, 0x42, 0x20, 0xe5, 0xfd, 0xbf,
0x81, 0x18, 0x26, 0xc3, 0xbe, 0x35, 0x88, 0x2e, 0x93, 0x55, 0xfc, 0x7a, 0xc8, 0xba, 0x32, 0xe6,
0xc0, 0x19, 0x9e, 0xa3, 0x44, 0x54, 0x3b, 0x0b, 0x8c, 0xc7, 0x6b, 0x28, 0xa7, 0xbc, 0x16, 0xad,
0xdb, 0x64, 0x74, 0x14, 0x92, 0x0c, 0x48, 0xb8, 0x9f, 0xbd, 0x43, 0xc4, 0x39, 0x31, 0xd3, 0xf2,
0xd5, 0x8b, 0x6e, 0xda, 0x01, 0xb1, 0x9c, 0x49, 0xd8, 0xac, 0xf3, 0xcf, 0xca, 0xf4, 0x47, 0x10,
0x6f, 0xf0, 0x4a, 0x5c, 0x38, 0x57, 0x73, 0x97, 0xcb, 0xa1, 0xe8, 0x3e, 0x96, 0x61, 0x0d, 0x0f,
0xe0, 0x7c, 0x71, 0xcc, 0x90, 0x06, 0xf7, 0x1c, 0xc2, 0x6a, 0xae, 0x69, 0x17, 0x99, 0x3a, 0x27,
0xd9, 0xeb, 0x2b, 0x22, 0xd2, 0xa9, 0x07, 0x33, 0x2d, 0x3c, 0x15, 0xc9, 0x87, 0xaa, 0x50, 0xa5,
0x03, 0x59, 0x09, 0x1a, 0x65, 0xd7, 0x84, 0xd0, 0x82, 0x29, 0x5a, 0x1e, 0x7b, 0xa8, 0x6d, 0x2c
};
// Combined table Xtimes3[Sbox[x]]: the S-box output for byte x multiplied by 3
// in the AES finite field (e.g. Sbox[0] = 0x63 -> 0xa5). Lets MixSubColumns()
// do the substitution and the "times 3" column-mix term in a single lookup.
uint8_t Xtime3Sbox[256] = {
0xa5, 0x84, 0x99, 0x8d, 0x0d, 0xbd, 0xb1, 0x54, 0x50, 0x03, 0xa9, 0x7d, 0x19, 0x62, 0xe6, 0x9a,
0x45, 0x9d, 0x40, 0x87, 0x15, 0xeb, 0xc9, 0x0b, 0xec, 0x67, 0xfd, 0xea, 0xbf, 0xf7, 0x96, 0x5b,
0xc2, 0x1c, 0xae, 0x6a, 0x5a, 0x41, 0x02, 0x4f, 0x5c, 0xf4, 0x34, 0x08, 0x93, 0x73, 0x53, 0x3f,
0x0c, 0x52, 0x65, 0x5e, 0x28, 0xa1, 0x0f, 0xb5, 0x09, 0x36, 0x9b, 0x3d, 0x26, 0x69, 0xcd, 0x9f,
0x1b, 0x9e, 0x74, 0x2e, 0x2d, 0xb2, 0xee, 0xfb, 0xf6, 0x4d, 0x61, 0xce, 0x7b, 0x3e, 0x71, 0x97,
0xf5, 0x68, 0x00, 0x2c, 0x60, 0x1f, 0xc8, 0xed, 0xbe, 0x46, 0xd9, 0x4b, 0xde, 0xd4, 0xe8, 0x4a,
0x6b, 0x2a, 0xe5, 0x16, 0xc5, 0xd7, 0x55, 0x94, 0xcf, 0x10, 0x06, 0x81, 0xf0, 0x44, 0xba, 0xe3,
0xf3, 0xfe, 0xc0, 0x8a, 0xad, 0xbc, 0x48, 0x04, 0xdf, 0xc1, 0x75, 0x63, 0x30, 0x1a, 0x0e, 0x6d,
0x4c, 0x14, 0x35, 0x2f, 0xe1, 0xa2, 0xcc, 0x39, 0x57, 0xf2, 0x82, 0x47, 0xac, 0xe7, 0x2b, 0x95,
0xa0, 0x98, 0xd1, 0x7f, 0x66, 0x7e, 0xab, 0x83, 0xca, 0x29, 0xd3, 0x3c, 0x79, 0xe2, 0x1d, 0x76,
0x3b, 0x56, 0x4e, 0x1e, 0xdb, 0x0a, 0x6c, 0xe4, 0x5d, 0x6e, 0xef, 0xa6, 0xa8, 0xa4, 0x37, 0x8b,
0x32, 0x43, 0x59, 0xb7, 0x8c, 0x64, 0xd2, 0xe0, 0xb4, 0xfa, 0x07, 0x25, 0xaf, 0x8e, 0xe9, 0x18,
0xd5, 0x88, 0x6f, 0x72, 0x24, 0xf1, 0xc7, 0x51, 0x23, 0x7c, 0x9c, 0x21, 0xdd, 0xdc, 0x86, 0x85,
0x90, 0x42, 0xc4, 0xaa, 0xd8, 0x05, 0x01, 0x12, 0xa3, 0x5f, 0xf9, 0xd0, 0x91, 0x58, 0x27, 0xb9,
0x38, 0x13, 0xb3, 0x33, 0xbb, 0x70, 0x89, 0xa7, 0xb6, 0x22, 0x92, 0x20, 0x49, 0xff, 0x78, 0x7a,
0x8f, 0xf8, 0x80, 0x17, 0xda, 0x31, 0xc6, 0xb8, 0xc3, 0xb0, 0x77, 0x11, 0xcb, 0xfc, 0xd6, 0x3a
};
// Recombine and mix each row in a column.
//
// Applies byte substitution, the row rotation (visible in the source indices
// 0,5,10,15 / 4,9,14,3 / 8,13,2,7 / 12,1,6,11) and the column mix to a 16-byte
// state in one pass, using the combined Sbox / Xtime2Sbox / Xtime3Sbox tables.
//
//   state   - 16 input bytes; read only, not modified.
//   returns - the 16 mixed bytes packed into an __m128i (byte 0 lowest).
static inline __m128i MixSubColumns (uint8_t *state)
{
uint8_t tmp[16];
// mixing column 0
tmp[0] = Xtime2Sbox[state[0]] ^ Xtime3Sbox[state[5]] ^ Sbox[state[10]] ^ Sbox[state[15]];
tmp[1] = Sbox[state[0]] ^ Xtime2Sbox[state[5]] ^ Xtime3Sbox[state[10]] ^ Sbox[state[15]];
tmp[2] = Sbox[state[0]] ^ Sbox[state[5]] ^ Xtime2Sbox[state[10]] ^ Xtime3Sbox[state[15]];
tmp[3] = Xtime3Sbox[state[0]] ^ Sbox[state[5]] ^ Sbox[state[10]] ^ Xtime2Sbox[state[15]];
// mixing column 1
tmp[4] = Xtime2Sbox[state[4]] ^ Xtime3Sbox[state[9]] ^ Sbox[state[14]] ^ Sbox[state[3]];
tmp[5] = Sbox[state[4]] ^ Xtime2Sbox[state[9]] ^ Xtime3Sbox[state[14]] ^ Sbox[state[3]];
tmp[6] = Sbox[state[4]] ^ Sbox[state[9]] ^ Xtime2Sbox[state[14]] ^ Xtime3Sbox[state[3]];
tmp[7] = Xtime3Sbox[state[4]] ^ Sbox[state[9]] ^ Sbox[state[14]] ^ Xtime2Sbox[state[3]];
// mixing column 2
tmp[8] = Xtime2Sbox[state[8]] ^ Xtime3Sbox[state[13]] ^ Sbox[state[2]] ^ Sbox[state[7]];
tmp[9] = Sbox[state[8]] ^ Xtime2Sbox[state[13]] ^ Xtime3Sbox[state[2]] ^ Sbox[state[7]];
tmp[10] = Sbox[state[8]] ^ Sbox[state[13]] ^ Xtime2Sbox[state[2]] ^ Xtime3Sbox[state[7]];
tmp[11] = Xtime3Sbox[state[8]] ^ Sbox[state[13]] ^ Sbox[state[2]] ^ Xtime2Sbox[state[7]];
// mixing column 3
tmp[12] = Xtime2Sbox[state[12]] ^ Xtime3Sbox[state[1]] ^ Sbox[state[6]] ^ Sbox[state[11]];
tmp[13] = Sbox[state[12]] ^ Xtime2Sbox[state[1]] ^ Xtime3Sbox[state[6]] ^ Sbox[state[11]];
tmp[14] = Sbox[state[12]] ^ Sbox[state[1]] ^ Xtime2Sbox[state[6]] ^ Xtime3Sbox[state[11]];
tmp[15] = Xtime3Sbox[state[12]] ^ Sbox[state[1]] ^ Sbox[state[6]] ^ Xtime2Sbox[state[11]];
// tmp is a plain byte array with no guaranteed 16-byte alignment, so it must
// be read with the unaligned load; the aligned form may fault on ABIs that do
// not over-align local arrays.
return _mm_loadu_si128((const __m128i*)tmp);
}
// Software replacement for the hardware AES encryption-round intrinsic: runs
// MixSubColumns() over the 16 bytes of `in` and XORs the result with `key`.
//
//   in      - 128-bit state to transform.
//   key     - 128-bit round key XORed into the transformed state.
//   returns - the transformed state.
__m128i soft_aesenc(__m128i in, __m128i key)
{
uint8_t state[16];
// state is a plain byte array with no guaranteed 16-byte alignment, so spill
// the vector with the unaligned store instead of the aligned one.
_mm_storeu_si128((__m128i*)state, in);
__m128i out = MixSubColumns(state);
return _mm_xor_si128(out, key);
}
// Replaces each of the four bytes at `key` with its forward S-box value,
// in place.
static inline void sub_word(uint8_t* key)
{
    int i;
    for (i = 0; i < 4; ++i)
        key[i] = Sbox[key[i]];
}
// Software replacement for the hardware key-generation assist intrinsic.
//
// Takes 32-bit words 1 and 3 of `key`, runs each through sub_word(), and
// returns (lowest word first):
//   { sub(w1), rotr(sub(w1), 8) ^ rcon, sub(w3), rotr(sub(w3), 8) ^ rcon }
__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
{
    // 0x55 / 0xFF broadcast word 1 / word 3 so the low lane can be extracted
    uint32_t word1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t word3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));

    sub_word((uint8_t*)&word1);
    sub_word((uint8_t*)&word3);

    const uint32_t rotated1 = _rotr(word1, 8) ^ rcon;
    const uint32_t rotated3 = _rotr(word3, 8) ^ rcon;

    // _mm_set_epi32 takes its arguments highest lane first
    return _mm_set_epi32(rotated3, word3, rotated1, word1);
}
...@@ -22,6 +22,9 @@ ...@@ -22,6 +22,9 @@
#ifdef _WIN32 #ifdef _WIN32
#define strcasecmp _stricmp #define strcasecmp _stricmp
#include <intrin.h>
#else
#include <cpuid.h>
#endif #endif
#include "rapidjson/document.h" #include "rapidjson/document.h"
...@@ -120,6 +123,12 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) ...@@ -120,6 +123,12 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
cfg.bDoubleMode = mode->GetBool(); cfg.bDoubleMode = mode->GetBool();
cfg.bNoPrefetch = no_prefetch->GetBool(); cfg.bNoPrefetch = no_prefetch->GetBool();
if(!bHaveAes && (cfg.bDoubleMode || cfg.bNoPrefetch))
{
printer::inst()->print_msg(L0, "Invalid thread confg - low_power_mode and no_prefetch are unsupported on CPUs without AES-NI.");
return false;
}
if(aff->IsNumber()) if(aff->IsNumber())
cfg.iCpuAff = aff->GetInt64(); cfg.iCpuAff = aff->GetInt64();
else else
...@@ -194,12 +203,38 @@ uint16_t jconf::GetHttpdPort() ...@@ -194,12 +203,38 @@ uint16_t jconf::GetHttpdPort()
return prv->configValues[iHttpdPort]->GetUint(); return prv->configValues[iHttpdPort]->GetUint();
} }
// Queries CPUID leaf 1, records in bHaveAes whether the AES-NI flag (ECX bit 25)
// is set, and prints a warning when it is not.
// Returns true when the SSE2 flag (EDX bit 26) is set.
bool jconf::check_cpu_features()
{
    constexpr int aesni_mask = 1 << 25;
    constexpr int sse2_mask = 1 << 26;

    // regs[0..3] receive EAX, EBX, ECX, EDX
    int regs[4];
#ifdef _WIN32
    __cpuid(regs, 1);
#else
    __cpuid(1, regs[0], regs[1], regs[2], regs[3]);
#endif

    const bool sse2_present = (regs[3] & sse2_mask) != 0;
    bHaveAes = (regs[2] & aesni_mask) != 0;

    if(bHaveAes == false)
        printer::inst()->print_msg(L0, "Your CPU doesn't support hardware AES. Don't expect high hashrates.");

    return sse2_present;
}
bool jconf::parse_config(const char* sFilename) bool jconf::parse_config(const char* sFilename)
{ {
FILE * pFile; FILE * pFile;
char * buffer; char * buffer;
size_t flen; size_t flen;
if(!check_cpu_features())
{
printer::inst()->print_msg(L0, "CPU support of SSE2 is required.");
return false;
}
pFile = fopen(sFilename, "rb"); pFile = fopen(sFilename, "rb");
if (pFile == NULL) if (pFile == NULL)
{ {
......
...@@ -46,10 +46,15 @@ public: ...@@ -46,10 +46,15 @@ public:
bool PreferIpv4(); bool PreferIpv4();
// Reports the AES-NI flag stored in bHaveAes (set by check_cpu_features()).
inline bool HaveHardwareAes()
{
	return bHaveAes;
}
private: private:
jconf(); jconf();
static jconf* oInst; static jconf* oInst;
bool check_cpu_features();
struct opaque_private; struct opaque_private;
opaque_private* prv; opaque_private* prv;
bool bHaveAes;
}; };
...@@ -21,16 +21,13 @@ ...@@ -21,16 +21,13 @@
#ifdef _WIN32 #ifdef _WIN32
#include <windows.h> #include <windows.h>
#include <intrin.h>
void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
{ {
SetThreadAffinityMask(h, 1 << cpu_id); SetThreadAffinityMask(h, 1 << cpu_id);
} }
#else #else
#include <pthread.h> #include <pthread.h>
#include <cpuid.h>
void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
{ {
...@@ -175,29 +172,8 @@ cryptonight_ctx* minethd_alloc_ctx() ...@@ -175,29 +172,8 @@ cryptonight_ctx* minethd_alloc_ctx()
return nullptr; //Should never happen return nullptr; //Should never happen
} }
// Reads CPUID leaf 1 and returns true only when both the AES-NI bit
// (ECX bit 25) and the SSE2 bit (EDX bit 26) are set.
static bool check_cpu_features()
{
	constexpr int AESNI_BIT = 1 << 25;
	constexpr int SSE2_BIT = 1 << 26;

	// regs[0..3] receive EAX, EBX, ECX, EDX respectively.
	int regs[4];
#ifdef _WIN32
	__cpuid(regs, 1);
#else
	__cpuid(1, regs[0], regs[1], regs[2], regs[3]);
#endif

	const bool has_aesni = (regs[2] & AESNI_BIT) != 0;
	const bool has_sse2 = (regs[3] & SSE2_BIT) != 0;
	return has_aesni && has_sse2;
}
bool minethd::self_test() bool minethd::self_test()
{ {
if (!check_cpu_features())
{
printer::inst()->print_msg(L0, "This application requires CPU support of AES-NI and SSE2 instructions.");
return false;
}
alloc_msg msg = { 0 }; alloc_msg msg = { 0 };
size_t res; size_t res;
bool fatal = false; bool fatal = false;
...@@ -350,6 +326,7 @@ void minethd::work_main() ...@@ -350,6 +326,7 @@ void minethd::work_main()
piNonce = (uint32_t*)(oWork.bWorkBlob + 39); piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
iConsumeCnt++; iConsumeCnt++;
bool bHaveAes = jconf::inst()->HaveHardwareAes();
while (bQuit == 0) while (bQuit == 0)
{ {
if (oWork.bStall) if (oWork.bStall)
...@@ -383,10 +360,15 @@ void minethd::work_main() ...@@ -383,10 +360,15 @@ void minethd::work_main()
*piNonce = ++result.iNonce; *piNonce = ++result.iNonce;
if(bNoPrefetch) if(bHaveAes)
cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); {
if(bNoPrefetch)
cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
else
cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
}
else else
cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); cryptonight_hash_ctx_soft(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if (*piHashVal < oWork.iTarget) if (*piHashVal < oWork.iTarget)
executor::inst()->push_event(ex_event(result, oWork.iPoolId)); executor::inst()->push_event(ex_event(result, oWork.iPoolId));
......
...@@ -92,11 +92,15 @@ ...@@ -92,11 +92,15 @@
</Unit> </Unit>
<Unit filename="crypto/c_skein.h" /> <Unit filename="crypto/c_skein.h" />
<Unit filename="crypto/cryptonight.h" /> <Unit filename="crypto/cryptonight.h" />
<Unit filename="crypto/cryptonight_aesni.h" />
<Unit filename="crypto/cryptonight_common.cpp" /> <Unit filename="crypto/cryptonight_common.cpp" />
<Unit filename="crypto/groestl_tables.h" /> <Unit filename="crypto/groestl_tables.h" />
<Unit filename="crypto/hash.h" /> <Unit filename="crypto/hash.h" />
<Unit filename="crypto/int-util.h" /> <Unit filename="crypto/int-util.h" />
<Unit filename="crypto/skein_port.h" /> <Unit filename="crypto/skein_port.h" />
<Unit filename="crypto/soft_aes.c">
<Option compilerVar="CC" />
</Unit>
<Unit filename="donate-level.h" /> <Unit filename="donate-level.h" />
<Unit filename="executor.cpp" /> <Unit filename="executor.cpp" />
<Unit filename="executor.h" /> <Unit filename="executor.h" />
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment