CUDA: reduce startup time

- reduce startup time on multi-GPU systems by initializing the GPU memory sequentially instead of concurrently

CUDA: reduce startup time
- reduce startup time on multi-GPU systems by initializing the GPU memory sequentially instead of concurrently
6488a026 · psychocrypt · 7b850646 · 6488a026 · 6488a026
Commit 6488a026 authored 7 years ago by psychocrypt
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -80,14 +80,22 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.syncMode = cfg.syncMode;
 	this->affinity = cfg.cpu_aff;
-	std::unique_lock<std::mutex> lck(thd_aff_set);
+	std::future<void> numa_guard = numa_promise.get_future();
-	std::future<void> order_guard = order_fix.get_future();
+	thread_work_guard = thread_work_promise.get_future();
 	oWorkThd = std::thread(&minethd::work_main, this);
-	order_guard.wait();
+	/* Wait until the gpu memory is initialized and numa cpu memory is pinned.
+	 * The startup time is reduced if the memory is initialized in sequential order
+	 * without concurrent threads (CUDA driver is less occupied).
+	 */
+	numa_guard.wait();
+}
-	if(affinity >= 0) //-1 means no affinity
+void minethd::start_mining()
+{
+	thread_work_promise.set_value();
+	if(this->affinity >= 0) //-1 means no affinity
 		if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
 }
@@ -179,6 +187,11 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	}
+	for (i = 0; i < n; i++)
+	{
+		static_cast<minethd*>((*pvThreads)[i])->start_mining();
+	}
 	return pvThreads;
 }
@@ -208,10 +221,18 @@ void minethd::work_main()
 	if(affinity >= 0) //-1 means no affinity
 		bindMemoryToNUMANode(affinity);
-	order_fix.set_value();
+	if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
-	std::unique_lock<std::mutex> lck(thd_aff_set);
+	{
-	lck.release();
+		printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo);
+		std::exit(0);
+	}
+	// numa memory bind and gpu memory is initialized
+	numa_promise.set_value();
 	std::this_thread::yield();
+	// wait until all NVIDIA devices are initialized
+	thread_work_guard.wait();
 	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
@@ -221,12 +242,6 @@ void minethd::work_main()
 	globalStates::inst().iConsumeCnt++;
-	if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
-	{
-		printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo);
-		std::exit(0);
-	}
 	bool mineMonero = strcmp_i(::jconf::inst()->GetCurrency(), "monero");
 	while (bQuit == 0)

--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -32,7 +32,8 @@ private:
 	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
+	void start_mining();
 	void work_main();
 	void consume_work();
@@ -44,8 +45,11 @@ private:
 	static miner_work oGlobalWork;
 	miner_work oWork;
-	std::promise<void> order_fix;
+	std::promise<void> numa_promise;
-	std::mutex thd_aff_set;
+	std::promise<void> thread_work_promise;
+	// block thread until all NVIDIA GPUs are initialized
+	std::future<void> thread_work_guard;
 	std::thread oWorkThd;
 	int64_t affinity;