Skip to content
Snippets Groups Projects
Commit f59c4d17 authored by fireice-uk's avatar fireice-uk
Browse files

No prefetch option

parent e3bda576
No related branches found
No related tags found
No related merge requests found
...@@ -7,16 +7,23 @@ ...@@ -7,16 +7,23 @@
/* /*
* Thread configuration for each thread. Make sure it matches the number above. * Thread configuration for each thread. Make sure it matches the number above.
* low_power_mode will double the cache usage, and double the single thread performance. It will consume much * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will
* less power (as less cores are working), but will max out at around 80-85% of the maximum performance. * consume much less power (as less cores are working), but will max out at around 80-85% of
* affine_to_cpu can be either false (no affinity), or the CPU core number. Note that on hyperthreading systems * the maximum performance.
* it is better to assign threads to physical cores. On Windows this usually means selecting even or odd numbered *
* cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 physical core CPU you should select * no_prefetch - This mode is meant for large pages only. It will generate an error if running on slow memory.
* cpu numbers 0-3. * Some systems can gain up to an extra 5% here, but sometimes it will have no difference or make
* things slower.
*
* affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
* systems it is better to assign threads to physical cores. On Windows this usually means selecting
* even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
* physical core CPU you should select cpu numbers 0-3.
*
*/ */
"cpu_threads_conf" : [ "cpu_threads_conf" : [
{ "low_power_mode" : false, "affine_to_cpu" : 0 }, { "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 0 },
{ "low_power_mode" : false, "affine_to_cpu" : 1 }, { "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 1 },
], ],
/* /*
......
...@@ -23,7 +23,9 @@ typedef struct { ...@@ -23,7 +23,9 @@ typedef struct {
size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
void cryptonight_free_ctx(cryptonight_ctx* ctx); void cryptonight_free_ctx(cryptonight_ctx* ctx);
void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx); void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1); void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1);
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -270,6 +270,54 @@ void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonig ...@@ -270,6 +270,54 @@ void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonig
extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output); extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output);
} }
// Single-hash variant used when the "no_prefetch" thread option is set.
// The body issues no explicit prefetch instructions; the self-test in this
// change expects it to produce the same digest as cryptonight_hash_ctx for
// the same input.
//
//   input  - message bytes to hash
//   len    - number of bytes read from input
//   output - receives the digest written by the selected extra_hashes entry
//   ctx0   - working context; both hash_state and long_state are overwritten
//
// NOTE(review): long_state is accessed with aligned 128-bit loads/stores, so
// it presumably must be 16-byte aligned, and at least 0x200000 bytes long
// given the index mask used below - confirm against the allocator.
void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
{
// Hash the input into hash_state (200 is passed as the state size, presumably in bytes).
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
// Optim - 99% time boundary
// Fill the scratchpad (long_state) from the freshly computed hash_state.
cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
// Loop registers are seeded by XOR-ing pairs of 64-bit words of the state:
// (ah0:al0) is the running 128-bit "a" value, bx0 the running "b" value.
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
// _mm_set_epi64x takes the high 64 bits first, then the low 64 bits.
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
// First scratchpad index starts out equal to al0.
uint64_t idx0 = h0[0] ^ h0[4];
// Optim - 90% time boundary
// 0x80000 (524288) iterations. Every scratchpad access masks the index with
// 0x1FFFF0, which keeps the byte offset below 0x200000 and a multiple of 16.
for(size_t i = 0; i < 0x80000; i++)
{
__m128i cx;
// First half: load a 16-byte block, run one AES encryption round on it
// with (ah0:al0) as the round key, and write back b XOR result.
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
// The low 64 bits of the AES result select the next block; the result also
// becomes the new "b" value.
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
// Second half: read the 16-byte block at the new index as two 64-bit words.
cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1];
// 64x64 -> 128-bit multiply: the low half is returned, the high half goes to hi.
lo = _umul128(idx0, cl, &hi);
// Note the cross-over: the high product half is added to al0, the low half to ah0.
al0 += hi;
ah0 += lo;
// Store the updated "a" value into the block that was just read...
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
// ...then XOR the block's previous contents into "a".
ah0 ^= ch;
al0 ^= cl;
// The low word of "a" selects the block for the next iteration.
idx0 = al0;
}
// Optim - 90% time boundary
// Fold the scratchpad back into hash_state.
cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
// Optim - 99% time boundary
// Final permutation of the state; 24 is presumably the round count.
keccakf((uint64_t*)ctx0->hash_state, 24);
// The low two bits of the first state byte pick one of four final hash
// functions, which is run over the state (size argument 200) into output.
extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output);
}
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon // This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
......
...@@ -99,14 +99,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) ...@@ -99,14 +99,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject()) if(!oThdConf.IsObject())
return false; return false;
const Value *mode, *aff; const Value *mode, *no_prefetch, *aff;
mode = GetObjectMember(oThdConf, "low_power_mode"); mode = GetObjectMember(oThdConf, "low_power_mode");
no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
aff = GetObjectMember(oThdConf, "affine_to_cpu"); aff = GetObjectMember(oThdConf, "affine_to_cpu");
if(mode == nullptr || aff == nullptr) if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
return false; return false;
if(!mode->IsBool()) if(!mode->IsBool() || !no_prefetch->IsBool())
return false; return false;
if(!aff->IsNumber() && !aff->IsBool()) if(!aff->IsNumber() && !aff->IsBool())
...@@ -116,6 +117,8 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) ...@@ -116,6 +117,8 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
return false; return false;
cfg.bDoubleMode = mode->GetBool(); cfg.bDoubleMode = mode->GetBool();
cfg.bNoPrefetch = no_prefetch->GetBool();
if(aff->IsNumber()) if(aff->IsNumber())
cfg.iCpuAff = aff->GetInt64(); cfg.iCpuAff = aff->GetInt64();
else else
......
...@@ -15,6 +15,7 @@ public: ...@@ -15,6 +15,7 @@ public:
struct thd_cfg { struct thd_cfg {
bool bDoubleMode; bool bDoubleMode;
bool bNoPrefetch;
long long iCpuAff; long long iCpuAff;
}; };
......
...@@ -117,7 +117,7 @@ void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTime ...@@ -117,7 +117,7 @@ void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTime
iBucketTop[iThd] = (iTop + 1) & iBucketMask; iBucketTop[iThd] = (iTop + 1) & iBucketMask;
} }
minethd::minethd(miner_work& pWork, size_t iNo, bool double_work) minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch)
{ {
oWork = pWork; oWork = pWork;
bQuit = 0; bQuit = 0;
...@@ -125,6 +125,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work) ...@@ -125,6 +125,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work)
iJobNo = 0; iJobNo = 0;
iHashCount = 0; iHashCount = 0;
iTimestamp = 0; iTimestamp = 0;
bNoPrefetch = no_prefetch;
if(double_work) if(double_work)
oWorkThd = std::thread(&minethd::double_work_main, this); oWorkThd = std::thread(&minethd::double_work_main, this);
...@@ -242,12 +243,31 @@ bool minethd::self_test() ...@@ -242,12 +243,31 @@ bool minethd::self_test()
return false; return false;
} }
bool bHasLp = ctx0->ctx_info[0] == 1 && ctx1->ctx_info[1];
size_t n = jconf::inst()->GetThreadCount();
jconf::thd_cfg cfg;
for (size_t i = 0; i < n; i++)
{
jconf::inst()->GetThreadConfig(i, cfg);
if(!bHasLp && cfg.bNoPrefetch)
{
printer::inst()->print_msg(L0, "Wrong config. You are running in slow memory mode with no_prefetch.");
cryptonight_free_ctx(ctx0);
cryptonight_free_ctx(ctx1);
return false;
}
}
unsigned char out[64]; unsigned char out[64];
bool bResult; bool bResult;
cryptonight_hash_ctx("This is a test", 14, out, ctx0); cryptonight_hash_ctx("This is a test", 14, out, ctx0);
bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
cryptonight_hash_ctx_np("This is a test", 14, out, ctx0);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
...@@ -278,7 +298,7 @@ std::vector<minethd*>* minethd::thread_starter(miner_work& pWork) ...@@ -278,7 +298,7 @@ std::vector<minethd*>* minethd::thread_starter(miner_work& pWork)
{ {
jconf::inst()->GetThreadConfig(i, cfg); jconf::inst()->GetThreadConfig(i, cfg);
minethd* thd = new minethd(pWork, i, cfg.bDoubleMode); minethd* thd = new minethd(pWork, i, cfg.bDoubleMode, cfg.bNoPrefetch);
if(cfg.iCpuAff >= 0) if(cfg.iCpuAff >= 0)
thd_setaffinity(thd->oWorkThd.native_handle(), cfg.iCpuAff); thd_setaffinity(thd->oWorkThd.native_handle(), cfg.iCpuAff);
...@@ -362,7 +382,11 @@ void minethd::work_main() ...@@ -362,7 +382,11 @@ void minethd::work_main()
iCount++; iCount++;
*piNonce = ++result.iNonce; *piNonce = ++result.iNonce;
cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if(bNoPrefetch)
cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
else
cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if (*piHashVal < oWork.iTarget) if (*piHashVal < oWork.iTarget)
executor::inst()->push_event(ex_event(result, oWork.iPoolId)); executor::inst()->push_event(ex_event(result, oWork.iPoolId));
......
...@@ -94,7 +94,7 @@ public: ...@@ -94,7 +94,7 @@ public:
std::atomic<uint64_t> iTimestamp; std::atomic<uint64_t> iTimestamp;
private: private:
minethd(miner_work& pWork, size_t iNo, bool double_work); minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch);
// We use the top 10 bits of the nonce for thread and resume // We use the top 10 bits of the nonce for thread and resume
// This allows us to resume up to 128 threads 4 times before // This allows us to resume up to 128 threads 4 times before
...@@ -119,5 +119,6 @@ private: ...@@ -119,5 +119,6 @@ private:
uint8_t iThreadNo; uint8_t iThreadNo;
bool bQuit; bool bQuit;
bool bNoPrefetch;
}; };
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment