diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d0b8aed1f77101d6d749e9082dfd9691..067bbd0a2f7fd85fff613cd5767da2ec471624e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE)
 endif()
 target_link_libraries(xmr-stak-c ${LIBS})
 
+enable_language(ASM)
+# asm optimized monero v8 code
+add_library(xmr-stak-asm
+    STATIC
+    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+)
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
+
 # compile generic backend files
 file(GLOB BACKEND_CPP
     "xmrstak/*.cpp"
@@ -472,7 +481,7 @@ add_library(xmr-stak-backend
     STATIC
     ${BACKEND_CPP}
 )
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
 
 # compile CUDA backend
 if(CUDA_FOUND)
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 57dbef05358c301d90af5504f072ff23cd2035f5..8588fea8c062357d2f4f557bc139a2def3067c9e 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -82,7 +82,7 @@ public:
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 01d2280d8f580fd7950bd2973b3743fd7ebb3f3a..a73de8618730271b90e1e66a7344a9125b3d7c24 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ public:
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true,  \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 2fc9a47ec5a92f3c90d3ee89cc0c9aedc4326958..bfffc851e76ba3d16935b20c5124af1d6c8fe1d4 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -7,10 +7,15 @@ R"===(
  *                  the maximum performance. When set to a number N greater than 1, this mode will increase the
  *                  cache usage and single thread performance by N times.
  *
- * no_prefetch -    Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * asm            - Switch to an assembler version of cryptonight_v8; allowed values [auto, intel, ryzen]
+ *                    - auto: use the default implementation (no assembler version)
+ *                    - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+ *                    - ryzen: AMD Ryzen (1xxx and 2xxx series)
+ *
+ * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
  *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
  *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
  *                  physical core CPU you should select cpu numbers 0-3.
@@ -21,8 +26,8 @@ R"===(
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 27347609625dfb2f873fe578ad9dd9a1adeda3c4..0ab47e390d8571e50fbf4a19f5b549ebd4c531af 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -876,3 +876,24 @@ struct Cryptonight_hash<5>
 		REPEAT_5(0, CN_FINALIZE);
 	}
 };
+
+extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+// Single-hash path whose main loop runs in the external asm routines above; asm_version 1 = ivybridge, 2 = ryzen.
+template<xmrstak_algo ALGO, int asm_version>
+void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static_assert(asm_version == 1 || asm_version == 2, "asm_version must be 1 (ivybridge) or 2 (ryzen)");
+	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+
+	if (asm_version == 1)
+		cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]);
+	else
+		cryptonigh_v8_mainloop_ryzen_asm(ctx[0]);
+
+	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+	keccakf((uint64_t*)ctx[0]->hash_state, 24);
+	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 49da7ae2d2b575421bd227af5663b49f2799c924..1f9501c40da4a383b249e6a8dc5f6f1bd9ea52a5 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *no_prefetch, *aff;
+	const Value *mode, *no_prefetch, *aff, *asm_version;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
 	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	asm_version = GetObjectMember(oThdConf, "asm");
 
 	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
 		return false;
@@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.iCpuAff = -1;
 
+	if(asm_version == nullptr || !asm_version->IsString()) // "asm" is not part of the nullptr check above; a config without it must not be dereferenced
+		return false;
+	cfg.asm_version_str = asm_version->GetString();
+
 	return true;
 }
 
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index be855036eceaeb32f1e7fb8094dd1b244fb29cc7..4ec9165d59ec11b0a3ad5ed929d08218e95ca7ed 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ public:
 	struct thd_cfg {
 		int iMultiway;
 		bool bNoPrefetch;
+		std::string asm_version_str;
 		long long iCpuAff;
 	};
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 87f4d3285f77e7ee74da31f312e20cbbf894afe9..f07c71481b61a7bd4690bab568c65767590da9e6 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 #endif
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
 {
 	this->backendType = iBackend::CPU;
 	oWork = pWork;
@@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	iJobNo = 0;
 	bNoPrefetch = no_prefetch;
 	this->affinity = affinity;
+	asm_version_str = asm_version;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -441,7 +442,7 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		else
 			printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
 
-		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
 		pvThreads.push_back(thd);
 	}
 
@@ -449,9 +450,31 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 }
 
 template<size_t N>
-minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
+	
+	// check for asm optimized version for cryptonight_v8
+	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
+	{
+		if(asm_version_str != "auto")
+		{
+			if(asm_version_str == "intel")
+			{
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>;
+			}
+			if(asm_version_str == "ryzen")
+			{
+				// AMD Ryzen (1xxx and 2xxx series)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>;
+			}
+			else
+			{
+				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
+			}
+		}
+	}
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -636,7 +659,7 @@ void minethd::multiway_work_main()
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
@@ -671,12 +694,12 @@ void minethd::multiway_work_main()
 			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
 				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			else
 			{
 				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			lastPoolId = oWork.iPoolId;
 			version = new_version;
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 26478542cddad4d7cda9725450d0621c44cd73c7..53ff93c1549b3c92bfe4e720e084c2222412ff92 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -32,9 +32,9 @@ public:
 private:
 
 	template<size_t N>
-	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto");
 
-	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
+	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
 	template<uint32_t N>
 	void multiway_work_main();
@@ -60,6 +60,7 @@ private:
 
 	bool bQuit;
 	bool bNoPrefetch;
+	std::string asm_version_str = "auto";
 };
 
 } // namespace cpu