diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d0b8aed1f77101d6d749e9082dfd9691..067bbd0a2f7fd85fff613cd5767da2ec471624e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE)
 endif()
 target_link_libraries(xmr-stak-c ${LIBS})
 
+enable_language(ASM)
+# asm optimized monero v8 code
+add_library(xmr-stak-asm
+    STATIC
+    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+)
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
+
 # compile generic backend files
 file(GLOB BACKEND_CPP
     "xmrstak/*.cpp"
@@ -472,7 +481,7 @@ add_library(xmr-stak-backend
     STATIC
     ${BACKEND_CPP}
 )
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
 
 # compile CUDA backend
 if(CUDA_FOUND)
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 57dbef05358c301d90af5504f072ff23cd2035f5..8588fea8c062357d2f4f557bc139a2def3067c9e 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -82,7 +82,7 @@ public:
 
 				conf += std::string(" { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 01d2280d8f580fd7950bd2973b3743fd7ebb3f3a..a73de8618730271b90e1e66a7344a9125b3d7c24 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ public:
 			{
 				conf += std::string(" { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 2fc9a47ec5a92f3c90d3ee89cc0c9aedc4326958..bfffc851e76ba3d16935b20c5124af1d6c8fe1d4 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -7,10 +7,15 @@ R"===(
  *                  the maximum performance. When set to a number N greater than 1, this mode will increase the
  *                  cache usage and single thread performance by N times.
  *
- * no_prefetch -    Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * asm            - Switch to an assembler version of cryptonight_v8; allowed values [auto, intel, ryzen]
+ *                  - auto: use the default implementation (no assembler version); also used when asm is omitted
+ *                  - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+ *                  - ryzen: AMD Ryzen (1xxx and 2xxx series)
+ *
+ * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
  *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
  *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
  *                  physical core CPU you should select cpu numbers 0-3.
@@ -21,8 +26,8 @@ R"===(
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 27347609625dfb2f873fe578ad9dd9a1adeda3c4..0ab47e390d8571e50fbf4a19f5b549ebd4c531af 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -876,3 +876,26 @@ struct Cryptonight_hash<5>
 		REPEAT_5(0, CN_FINALIZE);
 	}
 };
+
+extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+
+// Single-hash variant (only ctx[0] is used) whose main loop is an external assembler routine.
+// asm_version == 1 calls the Ivy Bridge main loop; any other value calls the Ryzen main loop.
+template<xmrstak_algo ALGO, int asm_version>
+void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+	constexpr size_t MEM = cn_select_memory<ALGO>();
+
+	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+
+	if (asm_version == 1)
+		cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]);
+	else
+		cryptonigh_v8_mainloop_ryzen_asm(ctx[0]);
+
+	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+	keccakf((uint64_t*)ctx[0]->hash_state, 24);
+	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 49da7ae2d2b575421bd227af5663b49f2799c924..1f9501c40da4a383b249e6a8dc5f6f1bd9ea52a5 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *no_prefetch, *aff;
+	const Value *mode, *no_prefetch, *aff, *asm_version;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
 	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	asm_version = GetObjectMember(oThdConf, "asm");
 
 	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
 		return false;
@@ -140,6 +141,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.iCpuAff = -1;
 
+	// "asm" is optional: a config without this member yields a null pointer here,
+	// so fall back to "auto" instead of dereferencing it.
+	if(asm_version == nullptr)
+		cfg.asm_version_str = "auto";
+	else if(asm_version->IsString())
+		cfg.asm_version_str = asm_version->GetString();
+	else
+		return false;
+
 	return true;
 }
 
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index be855036eceaeb32f1e7fb8094dd1b244fb29cc7..4ec9165d59ec11b0a3ad5ed929d08218e95ca7ed 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ public:
 	struct thd_cfg {
 		int iMultiway;
 		bool bNoPrefetch;
+		std::string asm_version_str;
 		long long iCpuAff;
 	};
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 87f4d3285f77e7ee74da31f312e20cbbf894afe9..f07c71481b61a7bd4690bab568c65767590da9e6 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 #endif
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
 {
 	this->backendType = iBackend::CPU;
 	oWork = pWork;
@@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	iJobNo = 0;
 	bNoPrefetch = no_prefetch;
 	this->affinity = affinity;
+	asm_version_str = asm_version;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -441,7 +442,7 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		else
 			printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
 
-		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
 		pvThreads.push_back(thd);
 	}
 
@@ -449,9 +450,31 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 }
 
 template<size_t N>
-minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
+
+	// check for asm optimized version for cryptonight_v8
+	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
+	{
+		if(asm_version_str != "auto")
+		{
+			if(asm_version_str == "intel")
+			{
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>;
+			}
+			else if(asm_version_str == "ryzen")
+			{
+				// AMD Ryzen (1xxx and 2xxx series)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>;
+			}
+			else
+			{
+				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
+			}
+		}
+	}
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -636,7 +659,7 @@ void minethd::multiway_work_main()
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
@@ -671,12 +694,12 @@ void minethd::multiway_work_main()
 				if(new_version >= coinDesc.GetMiningForkVersion())
 				{
 					miner_algo = coinDesc.GetMiningAlgo();
-					hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+					hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 				}
 				else
 				{
 					miner_algo = coinDesc.GetMiningAlgoRoot();
-					hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+					hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 				}
 				lastPoolId = oWork.iPoolId;
 				version = new_version;
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 26478542cddad4d7cda9725450d0621c44cd73c7..53ff93c1549b3c92bfe4e720e084c2222412ff92 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -32,9 +32,9 @@ public:
 
 private:
 	template<size_t N>
-	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto");
 
-	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
+	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
 	template<uint32_t N>
 	void multiway_work_main();
@@ -60,6 +60,7 @@ private:
 
 	bool bQuit;
 	bool bNoPrefetch;
+	std::string asm_version_str = "auto";
 };
 
 } // namespace cpu