From 0a9a9abaf7afceeb923292630b93b0fe4830efea Mon Sep 17 00:00:00 2001 From: psychocrypt <psychocryptHPC@gmail.com> Date: Sun, 16 Sep 2018 22:24:56 +0200 Subject: [PATCH] infrastructure to load asm code - add new option to `cpu.txt` named `asm` to select the asm code version - extent function selection method to choose assembler code for `cryptonight_v8` - update auto adjustment to add default value for option `asm` --- CMakeLists.txt | 11 +++++- xmrstak/backend/cpu/autoAdjust.hpp | 2 +- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 2 +- xmrstak/backend/cpu/config.tpl | 13 ++++--- .../backend/cpu/crypto/cryptonight_aesni.h | 21 +++++++++++ xmrstak/backend/cpu/jconf.cpp | 7 +++- xmrstak/backend/cpu/jconf.hpp | 1 + xmrstak/backend/cpu/minethd.cpp | 35 +++++++++++++++---- xmrstak/backend/cpu/minethd.hpp | 5 +-- 9 files changed, 81 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a642b38..067bbd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE) endif() target_link_libraries(xmr-stak-c ${LIBS}) +enable_language(ASM) +# asm optimized monero v8 code +add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" +) +set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C) + + # compile generic backend files file(GLOB BACKEND_CPP "xmrstak/*.cpp" @@ -472,7 +481,7 @@ add_library(xmr-stak-backend STATIC ${BACKEND_CPP} ) -target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS}) +target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm) # compile CUDA backend if(CUDA_FOUND) diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 57dbef0..8588fea 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -82,7 +82,7 @@ public: conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 01d2280..a73de86 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ public: { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 2fc9a47..bfffc85 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -7,10 +7,15 @@ R"===( * the maximum performance. When set to a number N greater than 1, this mode will increase the * cache usage and single thread performance by N times. * - * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make + * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. * - * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, intel, ryzen] + * - auto: used the default implementation (no assembler version) + * - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + * - ryzen: AMD Ryzen (1xxx and 2xxx series) + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 * physical core CPU you should select cpu numbers 0-3. @@ -21,8 +26,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 2734760..0ab47e3 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -876,3 +876,24 @@ struct Cryptonight_hash<5> REPEAT_5(0, CN_FINALIZE); } }; + +extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); + +template<xmrstak_algo ALGO, int asm_version> +void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MEM = cn_select_memory<ALGO>(); + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + + if (asm_version == 1) + cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]); + else + cryptonigh_v8_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); +} diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 49da7ae..1f9501c 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *mode, *no_prefetch, *aff; + const Value *mode, *no_prefetch, *aff, *asm_version; mode = GetObjectMember(oThdConf, "low_power_mode"); no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + asm_version = GetObjectMember(oThdConf, "asm"); if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) return false; @@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) else cfg.iCpuAff = -1; + if(!asm_version->IsString()) + return false; + cfg.asm_version_str = asm_version->GetString(); + return true; } diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index be85503..4ec9165 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -24,6 +24,7 @@ public: struct thd_cfg { int iMultiway; bool bNoPrefetch; + std::string asm_version_str; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 87f4d32..f07c714 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version) { this->backendType = iBackend::CPU; oWork = pWork; @@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, iJobNo = 0; bNoPrefetch = no_prefetch; this->affinity = affinity; + asm_version_str = asm_version; std::unique_lock<std::mutex> lck(thd_aff_set); std::future<void> order_guard = order_fix.get_future(); @@ -441,7 +442,7 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work else printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str); pvThreads.push_back(thd); } @@ -449,9 +450,31 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work } template<size_t N> -minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); + + // check for asm optimized version for cryptonight_v8 + if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) + { + if(asm_version_str != "auto") + { + if(asm_version_str == "intel") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>; + } + if(asm_version_str == "ryzen") + { + // AMD Ryzen (1xxx and 2xxx series) + return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>; + } + else + { + printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); + } + } + } // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -636,7 +659,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); uint8_t version = 0; size_t lastPoolId = 0; @@ -671,12 +694,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 2647854..53ff93c 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -32,9 +32,9 @@ public: private: template<size_t N> - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto"); - minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template<uint32_t N> void multiway_work_main(); @@ -60,6 +60,7 @@ private: bool bQuit; bool bNoPrefetch; + std::string asm_version_str = "auto"; }; } // namespace cpu -- GitLab