diff --git a/backend/nvidia/autoAdjust.hpp b/backend/nvidia/autoAdjust.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be9b53d735daa38160db4bd28aa97c6dd5f36676
--- /dev/null
+++ b/backend/nvidia/autoAdjust.hpp
@@ -0,0 +1,112 @@
+
+#pragma once
+
+#include "nvcc_code/cryptonight.h"
+#include "jconf.h"
+#include "../../console.h"
+#include "../../ConfigEditor.hpp"
+
+#include <vector>
+#include <cstdio>
+#include <sstream>
+#include <string>
+
+
+namespace xmrstak
+{
+namespace nvidia
+{
+
+class autoAdjust
+{    
+public:
+
+    autoAdjust()
+    {
+
+    }
+
+    /** print the adjusted values if needed
+     *
+     * Queries all CUDA devices, auto adjusts the thread and block values and
+     * writes the resulting configuration file. Exits the application if a
+     * device cannot be queried.
+     */
+    bool printConfig()
+    {
+        int deviceCount = 0;
+        if(cuda_get_devicecount(&deviceCount) == 0)
+            std::exit(0);
+        // query every device and auto adjust all options that are set to auto (-1)
+        for(int i = 0; i < deviceCount; i++)
+        {
+            nvid_ctx ctx;
+
+            ctx.device_id = i;
+            // -1 trigger auto adjustment
+            ctx.device_blocks = -1;
+            ctx.device_threads = -1;
+
+            // set all device options marked as auto (-1) to a valid value
+#ifndef _WIN32
+            ctx.device_bfactor = 0;
+            ctx.device_bsleep = 0;
+#else
+            // Windows: add a pause so that Windows does not kill the miner if the GPU is blocked for more than 2 seconds
+            ctx.device_bfactor = 6;
+            ctx.device_bsleep = 25;
+#endif
+            if( cuda_get_deviceinfo(&ctx) != 1 )
+            {
+                printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exiting.\n", ctx.device_id);
+                std::exit(0);
+            }
+            nvidCtxVec.push_back(ctx);
+
+        }
+
+        generateThreadConfig();
+        return true;
+
+    }
+
+private:
+    
+    void generateThreadConfig()
+    {
+		// load the template of the backend config into a char variable
+		const char *tpl =
+			#include "./config.tpl"
+		;
+
+		ConfigEditor configTpl{};
+		configTpl.set( std::string(tpl) );
+
+		std::string conf;
+		conf += std::string("\"gpu_threads_conf\" :\n[\n");
+		for(auto& ctx : nvidCtxVec)
+		{
+			conf += std::string("  { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
+				"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
+				"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
+				"    \"affine_to_cpu\" : false,\n" +
+				"  },\n";
+		}
+		conf += std::string("],\n\n");
+
+		configTpl.replace("GPUCONFIG",conf);
+		configTpl.write("nvidia.txt");
+		printer::inst()->print_msg(L0, "NVIDIA GPU configuration stored in file '%s'", "nvidia.txt");
+    }
+
+    std::vector<nvid_ctx> nvidCtxVec;
+};
+
+} // namespace nvidia
+} // namespace xmrstak
diff --git a/backend/nvidia/config.tpl b/backend/nvidia/config.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..bf7a614ab191455aeedb019f62629a278f65450e
--- /dev/null
+++ b/backend/nvidia/config.tpl
@@ -0,0 +1,25 @@
+R"===(
+/*
+ * GPU configuration. You should play around with threads and blocks as the fastest settings will vary.
+ * index         - GPU index number usually starts from 0.
+ * threads       - Number of GPU threads (nothing to do with CPU threads).
+ * blocks        - Number of GPU blocks (nothing to do with CPU threads).
+ * bfactor       - Enables running the Cryptonight kernel in smaller pieces.
+ *                 Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 8
+ * bsleep        - Insert a delay of X microseconds between kernel launches.
+ *                 Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 100
+ * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ *
+ * On the first run the miner will look at your system and suggest a basic configuration that will work,
+ * you can try to tweak it from there to get the best performance.
+ *
+ * A filled out configuration should look like this:
+ * "gpu_threads_conf" :
+ * [
+ *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0, "affine_to_cpu" : false},
+ * ],
+ */
+
+GPUCONFIG
+
+)==="
diff --git a/backend/nvidia/jconf.cpp b/backend/nvidia/jconf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2184acd5d18c80ab4cbd8559d6caf8b01464e2ab
--- /dev/null
+++ b/backend/nvidia/jconf.cpp
@@ -0,0 +1,274 @@
+/*
+  * This program is free software: you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation, either version 3 of the License, or
+  * any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  *
+  * Additional permission under GNU GPL version 3 section 7
+  *
+  * If you modify this Program, or any covered work, by linking or combining
+  * it with OpenSSL (or a modified version of that library), containing parts
+  * covered by the terms of OpenSSL License and SSLeay License, the licensors
+  * of this Program grant you additional permission to convey the resulting work.
+  *
+  */
+
+#include "jconf.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#define strcasecmp _stricmp
+#include <intrin.h>
+#else
+#include <cpuid.h>
+#endif
+
+#include "../../rapidjson/document.h"
+#include "../../rapidjson/error/en.h"
+#include "../../jext.h"
+#include "../../console.h"
+
+namespace xmrstak
+{
+namespace nvidia
+{
+
+using namespace rapidjson;
+
+/*
+ * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
+ */
+enum configEnum { aGpuThreadsConf };
+
+struct configVal {
+	configEnum iName;
+	const char* sName;
+	Type iType;
+};
+
+// Same order as in configEnum, as per comment above
+// kNullType means any type
+configVal oConfigValues[] = {
+	{ aGpuThreadsConf, "gpu_threads_conf", kNullType }
+};
+
+inline bool checkType(Type have, Type want)
+{
+	if(want == have)
+		return true;
+	else if(want == kNullType)
+		return true;
+	else if(want == kTrueType && have == kFalseType)
+		return true;
+	else if(want == kFalseType && have == kTrueType)
+		return true;
+	else
+		return false;
+}
+
+constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+
+
+
+struct jconf::opaque_private
+{
+	Document jsonDoc;
+	const Value* configValues[iConfigCnt]; //Compile time constant
+
+	opaque_private()
+	{
+	}
+};
+
+
+bool jconf::NeedsAutoconf()
+{
+	return !prv->configValues[aGpuThreadsConf]->IsArray();
+}
+
+jconf* jconf::oInst = nullptr;
+
+jconf::jconf()
+{
+	prv = new opaque_private();
+}
+
+size_t jconf::GetGPUThreadCount()
+{
+	if(prv->configValues[aGpuThreadsConf]->IsArray())
+		return prv->configValues[aGpuThreadsConf]->Size();
+	else
+		return 0;
+}
+
+bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
+{
+	if(!prv->configValues[aGpuThreadsConf]->IsArray())
+		return false;
+
+	if(id >= prv->configValues[aGpuThreadsConf]->Size())
+		return false;
+
+	const Value& oThdConf = prv->configValues[aGpuThreadsConf]->GetArray()[id];
+
+	if(!oThdConf.IsObject())
+		return false;
+
+	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff;
+	gid = GetObjectMember(oThdConf, "index");
+	blocks = GetObjectMember(oThdConf, "blocks");
+	threads = GetObjectMember(oThdConf, "threads");
+	bfactor = GetObjectMember(oThdConf, "bfactor");
+	bsleep = GetObjectMember(oThdConf, "bsleep");
+	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+
+	if(gid == nullptr || blocks == nullptr || threads == nullptr ||
+		bfactor == nullptr || bsleep == nullptr || aff == nullptr)
+	{
+		return false;
+	}
+
+	if(!gid->IsNumber() || gid->GetInt() < 0)
+		return false;
+
+	if(!blocks->IsNumber() || blocks->GetInt() < 0)
+		return false;
+
+	if(!threads->IsNumber() || threads->GetInt() < 0)
+		return false;
+
+	if(!bfactor->IsNumber() || bfactor->GetInt() < 0)
+		return false;
+
+	if(!bsleep->IsNumber() || bsleep->GetInt() < 0)
+		return false;
+
+	if(!aff->IsUint64() && !aff->IsBool())
+		return false;
+
+	cfg.id = gid->GetInt();
+	cfg.blocks = blocks->GetInt();
+	cfg.threads = threads->GetInt();
+	cfg.bfactor = bfactor->GetInt();
+	cfg.bsleep = bsleep->GetInt();
+
+	if(aff->IsNumber())
+		cfg.cpu_aff = aff->GetInt();
+	else
+		cfg.cpu_aff = -1;
+
+	return true;
+}
+
+bool jconf::parse_config(const char* sFilename)
+{
+	FILE * pFile;
+	char * buffer;
+	size_t flen;
+
+	pFile = fopen(sFilename, "rb");
+	if (pFile == NULL)
+	{
+		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
+		return false;
+	}
+
+	fseek(pFile,0,SEEK_END);
+	flen = ftell(pFile);
+	rewind(pFile);
+
+	if(flen >= 64*1024)
+	{
+		fclose(pFile);
+		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
+		return false;
+	}
+
+	if(flen <= 16)
+	{
+		fclose(pFile);
+		printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename);
+		return false;
+	}
+
+	buffer = (char*)malloc(flen + 3);
+	if(fread(buffer+1, flen, 1, pFile) != 1)
+	{
+		free(buffer);
+		fclose(pFile);
+		printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename);
+		return false;
+	}
+	fclose(pFile);
+
+	//Replace Unicode BOM with spaces - we always use UTF-8
+	unsigned char* ubuffer = (unsigned char*)buffer;
+	if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] == 0xBF)
+	{
+		buffer[1] = ' ';
+		buffer[2] = ' ';
+		buffer[3] = ' ';
+	}
+
+	buffer[0] = '{';
+	buffer[flen] = '}';
+	buffer[flen + 1] = '\0';
+
+	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	free(buffer);
+
+	if(prv->jsonDoc.HasParseError())
+	{
+		printer::inst()->print_msg(L0, "JSON config parse error (offset %llu): %s",
+			int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
+		return false;
+	}
+
+
+	if(!prv->jsonDoc.IsObject())
+	{ //This should never happen as we created the root ourselves
+		printer::inst()->print_msg(L0, "Invalid config file. No root?\n");
+		return false;
+	}
+
+	for(size_t i = 0; i < iConfigCnt; i++)
+	{
+		if(oConfigValues[i].iName != i)
+		{
+			printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName);
+			return false;
+		}
+
+		prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName);
+
+		if(prv->configValues[i] == nullptr)
+		{
+			printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName);
+			return false;
+		}
+
+		if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
+		{
+			printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName);
+			return false;
+		}
+	}
+	return true;
+}
+
+} // namespace nvidia
+} // namespace xmrstak
\ No newline at end of file
diff --git a/backend/nvidia/jconf.h b/backend/nvidia/jconf.h
new file mode 100644
index 0000000000000000000000000000000000000000..e398f30be51fdbd53614160941a9b060a8af21af
--- /dev/null
+++ b/backend/nvidia/jconf.h
@@ -0,0 +1,50 @@
+#pragma once
+#include <stdlib.h>
+#include <string>
+
+namespace xmrstak
+{
+namespace nvidia
+{
+
+class jconf
+{
+public:
+	static jconf* inst()
+	{
+		if (oInst == nullptr) oInst = new jconf;
+		return oInst;
+	};
+
+	bool parse_config(const char* sFilename = "nvidia.txt");
+
+	struct thd_cfg {
+		uint32_t id;
+		uint32_t blocks;
+		uint32_t threads;
+		uint32_t bfactor;
+		uint32_t bsleep;
+		bool bDoubleMode;
+		bool bNoPrefetch;
+		int32_t cpu_aff;
+
+		long long iCpuAff;
+	};
+
+	size_t GetGPUThreadCount();
+
+	bool GetGPUThreadConfig(size_t id, thd_cfg &cfg);
+
+	bool NeedsAutoconf();
+
+private:
+	jconf();
+	static jconf* oInst;
+
+	struct opaque_private;
+	opaque_private* prv;
+
+};
+
+} // namespace nvidia
+} // namespace xmrstak
diff --git a/backend/nvidia/minethd.cpp b/backend/nvidia/minethd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9a331f302702ff43992e31f12be030f5885e5445
--- /dev/null
+++ b/backend/nvidia/minethd.cpp
@@ -0,0 +1,268 @@
+/*
+  * This program is free software: you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation, either version 3 of the License, or
+  * any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  *
+  * Additional permission under GNU GPL version 3 section 7
+  *
+  * If you modify this Program, or any covered work, by linking or combining
+  * it with OpenSSL (or a modified version of that library), containing parts
+  * covered by the terms of OpenSSL License and SSLeay License, the licensors
+  * of this Program grant you additional permission to convey the resulting work.
+  *
+  */
+
+#include <assert.h>
+#include <cmath>
+#include <chrono>
+#include <thread>
+#include <bitset>
+#include <vector>
+#include "../../console.h"
+#include "../../crypto/cryptonight_aesni.h"
+#include "../cpu/minethd.h"
+#include "../cpu/jconf.h"
+
+#include "../../executor.h"
+#include "minethd.h"
+#include "../../jconf.h"
+#include "../../crypto/cryptonight.h"
+#include "autoAdjust.hpp"
+
+
+#ifndef USE_PRECOMPILED_HEADERS
+#ifdef WIN32
+#include <direct.h>
+#include <windows.h>
+#else
+#include <sys/types.h>
+#include <dlfcn.h>
+#endif
+#include <iostream>
+#endif
+
+namespace xmrstak
+{
+namespace nvidia
+{
+
+#ifdef WIN32
+    HINSTANCE lib_handle;
+#else
+    void *lib_handle;
+#endif
+	
+minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
+{
+	oWork = pWork;
+	bQuit = 0;
+	iThreadNo = (uint8_t)iNo;
+	iJobNo = 0;
+
+	ctx.device_id = (int)cfg.id;
+	ctx.device_blocks = (int)cfg.blocks;
+	ctx.device_threads = (int)cfg.threads;
+	ctx.device_bfactor = (int)cfg.bfactor;
+	ctx.device_bsleep = (int)cfg.bsleep;
+	
+	oWorkThd = std::thread(&minethd::work_main, this);
+}
+
+
+bool minethd::self_test()
+{
+	cryptonight_ctx* ctx0;
+	unsigned char out[32];
+	bool bResult = true;
+
+	ctx0 = new cryptonight_ctx;
+	// Note: the GPU self test is not implemented yet. The reference hash calls below
+	// are disabled, so `out` is never filled and no comparison against the known test
+	// vector can be made; the function therefore always reports success.
+	if(cpu::jconf::inst()->HaveHardwareAes())
+	{
+		//cryptonight_hash_ctx("This is a test", 14, out, ctx0);
+		//bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+	}
+	else
+	{
+		//cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0);
+		//bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+	}
+	delete ctx0;
+
+	//if(!bResult)
+	//	printer::inst()->print_msg(L0,
+	//	    "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
+
+	return bResult;
+}
+
+
+extern "C" std::vector<IBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork)
+{
+	return nvidia::minethd::thread_starter(threadOffset, pWork);
+}
+
+std::vector<IBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork)
+{
+	std::vector<IBackend*>* pvThreads = new std::vector<IBackend*>();
+
+	if(!ConfigEditor::file_exist("nvidia.txt"))
+	{
+		autoAdjust adjust;
+		if(!adjust.printConfig())
+			return pvThreads;
+	}
+
+	if(!jconf::inst()->parse_config("nvidia.txt"))
+	{
+		win_exit();
+	}
+
+	int deviceCount = 0;
+	if(cuda_get_devicecount(&deviceCount) != 1)
+	{
+		printer::inst()->print_msg(L0, "WARNING: no NVIDIA device found.");
+		return pvThreads;
+	}
+
+	size_t i, n = jconf::inst()->GetGPUThreadCount();
+	pvThreads->reserve(n);
+
+	jconf::thd_cfg cfg;
+	for (i = 0; i < n; i++)
+	{
+		jconf::inst()->GetGPUThreadConfig(i, cfg);
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg);
+
+		if(cfg.cpu_aff >= 0)
+		{
+#if defined(__APPLE__)
+			printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+#endif
+			cpu::minethd::thd_setaffinity(thd->oWorkThd.native_handle(), cfg.cpu_aff);
+		}
+
+		pvThreads->push_back(thd);
+
+		if(cfg.cpu_aff >= 0)
+			printer::inst()->print_msg(L1, "Starting GPU thread, affinity: %d.", (int)cfg.cpu_aff);
+		else
+			printer::inst()->print_msg(L1, "Starting GPU thread, no affinity.");
+	}
+
+	return pvThreads;
+}
+
+void minethd::switch_work(miner_work& pWork)
+{
+	// iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work
+	// faster than threads can consume them. This should never happen in real life.
+	// Pool cant physically send jobs faster than every 250ms or so due to net latency.
+
+	while (GlobalStates::iConsumeCnt.load(std::memory_order_seq_cst) < GlobalStates::iThreadCount)
+		std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+	GlobalStates::oGlobalWork = pWork;
+	GlobalStates::iConsumeCnt.store(0, std::memory_order_seq_cst);
+	GlobalStates::iGlobalJobNo++;
+}
+
+void minethd::consume_work()
+{
+	memcpy(&oWork, &GlobalStates::oGlobalWork, sizeof(miner_work));
+	iJobNo++;
+	GlobalStates::iConsumeCnt++;
+}
+
+void minethd::work_main()
+{
+	uint64_t iCount = 0;
+	uint32_t iNonce;
+	cryptonight_ctx* cpu_ctx;
+	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
+	cn_hash_fun hash_fun = cpu::minethd::func_selector(cpu::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/);
+
+	GlobalStates::iConsumeCnt++;
+
+	if(/*cuda_get_deviceinfo(&ctx) != 1 ||*/ cryptonight_extra_cpu_init(&ctx) != 1)
+	{
+		printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exiting.\n", (int)iThreadNo);
+		std::exit(0);
+	}
+	
+	while (bQuit == 0)
+	{
+		if (oWork.bStall)
+		{
+			/*  We are stalled here because the executor didn't find a job for us yet,
+			    either because of network latency or a socket problem. Since we are the
+			    raison d'etre of this software it is sensible to just wait until we have something. */
+
+			while (GlobalStates::iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+				std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+			consume_work();
+			continue;
+		}
+
+		cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize);
+		iNonce = calc_start_nonce(oWork.iResumeCnt);
+
+		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
+
+		while(GlobalStates::iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		{
+
+			uint32_t foundNonce[10];
+			uint32_t foundCount;
+
+			cryptonight_extra_cpu_prepare(&ctx, iNonce);
+			cryptonight_core_cpu_hash(&ctx);
+			cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce);
+
+			for(size_t i = 0; i < foundCount; i++)
+			{
+
+				uint8_t	bWorkBlob[112];
+				uint8_t	bResult[32];
+
+				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
+				memset(bResult, 0, sizeof(job_result::bResult));
+
+				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
+
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
+					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult), oWork.iPoolId));
+				else
+					printer::inst()->print_msg(L0, "GPU %d produced an invalid result, CPU verification failed.", (int)iThreadNo);
+			}
+
+			iCount += ctx.device_blocks * ctx.device_threads;
+			iNonce += ctx.device_blocks * ctx.device_threads;
+
+			using namespace std::chrono;
+			uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count();
+			iHashCount.store(iCount, std::memory_order_relaxed);
+			iTimestamp.store(iStamp, std::memory_order_relaxed);
+			std::this_thread::yield();
+		}
+
+		consume_work();
+	}
+}
+
+} // namespace nvidia
+
+} // namespace xmrstak
diff --git a/backend/nvidia/minethd.h b/backend/nvidia/minethd.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4b095e958546f73498f99b0731a68db53bd49ba
--- /dev/null
+++ b/backend/nvidia/minethd.h
@@ -0,0 +1,65 @@
+#pragma once
+#include <thread>
+#include <atomic>
+#include <vector>
+#include "nvcc_code/cryptonight.h"
+#include "../../crypto/cryptonight.h"
+#include "../../jconf.h"
+#include "./jconf.h"
+#include "../IBackend.hpp"
+#include <iostream>
+
+namespace xmrstak
+{
+namespace nvidia
+{
+
+class minethd : public IBackend
+{
+public:
+
+	static void switch_work(miner_work& pWork);
+	static std::vector<IBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
+	static bool self_test();
+
+private:
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	
+	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
+
+	// We use the top 10 bits of the nonce for thread and resume
+	// This allows us to resume up to 128 threads 4 times before
+	// we get nonce collisions
+	// Bottom 22 bits allow for an hour of work at 1000 H/s
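+	// Example (assuming reverseBits reverses the full 32 bit pattern): with 8 total
+	// threads, thread 3 and resume 1 give reverseBits<uint32_t>(3 + 8 * 1) =
+	// reverseBits<uint32_t>(11) = 0xD0000000, i.e. the small per-thread counter ends
+	// up in the top bits of the nonce.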
+	inline uint32_t calc_start_nonce(uint32_t resume)
+	{
+		return reverseBits<uint32_t>(iThreadNo + GlobalStates::iThreadCount * resume);
+	}
+
+	// Limited version of the nonce calc above
+	inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume)
+	{
+		return start | ( ( reverseBits(iThreadNo + GlobalStates::iThreadCount * resume) >> 4u ) );
+	}
+
+	void work_main();
+	void consume_work();
+
+	static std::atomic<uint64_t> iGlobalJobNo;
+	static std::atomic<uint64_t> iConsumeCnt;
+	static uint64_t iThreadCount;
+	uint64_t iJobNo;
+
+	static miner_work oGlobalWork;
+	miner_work oWork;
+
+	std::thread oWorkThd;
+	uint8_t iThreadNo;
+
+	nvid_ctx ctx;
+
+	bool bQuit;
+};
+
+} // namespace nvidia
+} // namespace xmrstak
diff --git a/backend/nvidia/nvcc_code/cryptonight.h b/backend/nvidia/nvcc_code/cryptonight.h
new file mode 100644
index 0000000000000000000000000000000000000000..bec99973ad492ae9f279f1c0a4c97e97391d97a5
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cryptonight.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <stdint.h>
+
+typedef struct {
+	int device_id;
+	const char *device_name;
+	int device_arch[2];
+	int device_mpcount;
+	int device_blocks;
+	int device_threads;
+	int device_bfactor;
+	int device_bsleep;
+	
+	uint32_t *d_input;
+	uint32_t inputlen;
+	uint32_t *d_result_count;
+	uint32_t *d_result_nonce;
+	uint32_t *d_long_state;
+	uint32_t *d_ctx_state;
+	uint32_t *d_ctx_a;
+	uint32_t *d_ctx_b;
+	uint32_t *d_ctx_key1;
+	uint32_t *d_ctx_key2;
+	uint32_t *d_ctx_text;
+} nvid_ctx;
+
+extern "C" {
+
+/** get device count
+ *
+ * @param[out] deviceCount number of CUDA devices
+ * @return error code: 0 == an error occurred, 1 == no error
+ */
+int cuda_get_devicecount( int* deviceCount);
+int cuda_get_deviceinfo(nvid_ctx *ctx);
+int cryptonight_extra_cpu_init(nvid_ctx *ctx);
+void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
+void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce);
+void cryptonight_core_cpu_hash(nvid_ctx* ctx);
+void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce);
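+
+/* Typical call order (as used by xmrstak::nvidia::minethd::work_main):
+ *   cryptonight_extra_cpu_init()      - once per GPU thread
+ *   cryptonight_extra_cpu_set_data()  - once per job
+ *   cryptonight_extra_cpu_prepare(),
+ *   cryptonight_core_cpu_hash(),
+ *   cryptonight_extra_cpu_final()     - per nonce batch; final() returns the candidate nonces
+ */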
+
+}
+
diff --git a/backend/nvidia/nvcc_code/cuda_aes.hpp b/backend/nvidia/nvcc_code/cuda_aes.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e478600e32a30cc3f5fb3b88b53789fe23464ad2
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -0,0 +1,305 @@
+
+#pragma once
+
+#include <stdint.h>
+
+#define N_COLS          4
+#define WPOLY           0x011b
+
+static __constant__ uint32_t d_t_fn[1024] =
+{
+	0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U,
+	0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U,
+	0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U,
+	0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU,
+	0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU,
+	0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU,
+	0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U,
+	0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU,
+	0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU,
+	0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U,
+	0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U,
+	0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU,
+	0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU,
+	0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU,
+	0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU,
+	0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU,
+	0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U,
+	0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU,
+	0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU,
+	0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U,
+	0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U,
+	0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U,
+	0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U,
+	0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U,
+	0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU,
+	0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U,
+	0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU,
+	0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU,
+	0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U,
+	0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U,
+	0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U,
+	0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU,
+	0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U,
+	0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU,
+	0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU,
+	0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U,
+	0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U,
+	0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU,
+	0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U,
+	0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU,
+	0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U,
+	0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U,
+	0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U,
+	0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U,
+	0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU,
+	0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U,
+	0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU,
+	0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U,
+	0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU,
+	0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U,
+	0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU,
+	0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU,
+	0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU,
+	0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU,
+	0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U,
+	0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U,
+	0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U,
+	0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U,
+	0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U,
+	0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U,
+	0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU,
+	0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U,
+	0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU,
+	0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU,
+	0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU,
+	0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U,
+	0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU,
+	0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU,
+	0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U,
+	0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU,
+	0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU,
+	0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU,
+	0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU,
+	0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU,
+	0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U,
+	0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU,
+	0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU,
+	0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U,
+	0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU,
+	0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU,
+	0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU,
+	0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU,
+	0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU,
+	0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U,
+	0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU,
+	0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU,
+	0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU,
+	0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU,
+	0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U,
+	0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U,
+	0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U,
+	0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U,
+	0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU,
+	0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U,
+	0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U,
+	0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU,
+	0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU,
+	0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U,
+	0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U,
+	0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U,
+	0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU,
+	0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U,
+	0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU,
+	0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U,
+	0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU,
+	0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U,
+	0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U,
+	0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU,
+	0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U,
+	0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U,
+	0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U,
+	0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U,
+	0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U,
+	0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U,
+	0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U,
+	0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U,
+	0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU,
+	0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U,
+	0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U,
+	0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U,
+	0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U,
+	0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U,
+	0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U,
+	0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU,
+	0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U,
+	0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U,
+	0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U,
+	0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU,
+	0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU,
+	0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U,
+	0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU,
+	0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U,
+	0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU,
+	0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U,
+	0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU,
+	0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U,
+	0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U,
+	0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU,
+	0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U,
+	0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U,
+	0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U,
+	0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU,
+	0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U,
+	0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U,
+	0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU,
+	0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U,
+	0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U,
+	0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U,
+	0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU,
+	0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU,
+	0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U,
+	0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU,
+	0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU,
+	0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U,
+	0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU,
+	0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U,
+	0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU,
+	0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U,
+	0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U,
+	0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U,
+	0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU,
+	0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U,
+	0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU,
+	0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U,
+	0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU,
+	0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U,
+	0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U,
+	0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU,
+	0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU,
+	0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU,
+	0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U,
+	0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U,
+	0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU,
+	0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U,
+	0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU,
+	0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U,
+	0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU,
+	0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U,
+	0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU,
+	0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU,
+	0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U,
+	0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU,
+	0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U,
+	0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU,
+	0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U,
+	0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U,
+	0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U,
+	0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU,
+	0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU,
+	0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U,
+	0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU,
+	0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U,
+	0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU,
+	0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U,
+	0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU,
+	0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U,
+	0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU,
+	0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U,
+	0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU,
+	0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U,
+	0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U,
+	0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU,
+	0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U,
+	0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U,
+	0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U,
+	0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU,
+	0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U,
+	0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U,
+	0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU,
+	0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U,
+	0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U,
+	0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U,
+	0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU,
+	0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU,
+	0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U,
+	0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU,
+	0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU,
+	0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U,
+	0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU,
+	0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U,
+	0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU,
+	0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U,
+	0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U,
+	0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U,
+	0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU,
+	0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U,
+	0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU,
+	0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U,
+	0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU,
+	0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U,
+	0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U,
+	0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU,
+	0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU,
+	0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU,
+	0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U,
+	0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U,
+	0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU,
+	0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U,
+	0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU,
+	0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U,
+	0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU,
+	0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U,
+	0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU,
+	0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU,
+	0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U,
+	0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU,
+	0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U,
+	0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU,
+	0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U,
+	0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U,
+	0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U,
+	0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU,
+	0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU,
+	0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U,
+	0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU,
+	0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U
+};
+
+#define t_fn0(x) (sharedMemory[      (x)])
+#define t_fn1(x) (sharedMemory[256 + (x)])
+#define t_fn2(x) (sharedMemory[512 + (x)])
+#define t_fn3(x) (sharedMemory[768 + (x)])
+
+
+#define round(dummy,y,x,k) \
+	y[0] = (k)[0]  ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \
+	y[1] = (k)[1]  ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \
+	y[2] = (k)[2]  ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \
+	y[3] = (k)[3]  ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) ));
+
+__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey)
+{
+	round(sharedMemory, out, in, expandedKey);
+}
+
+__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey)
+{
+	uint32_t b1[4];
+	round(sharedMemory, b1, val, expandedKey);
+	round(sharedMemory, val, b1, expandedKey + 1 * N_COLS);
+	round(sharedMemory, b1, val, expandedKey + 2 * N_COLS);
+	round(sharedMemory, val, b1, expandedKey + 3 * N_COLS);
+	round(sharedMemory, b1, val, expandedKey + 4 * N_COLS);
+	round(sharedMemory, val, b1, expandedKey + 5 * N_COLS);
+	round(sharedMemory, b1, val, expandedKey + 6 * N_COLS);
+	round(sharedMemory, val, b1, expandedKey + 7 * N_COLS);
+	round(sharedMemory, b1, val, expandedKey + 8 * N_COLS);
+	round(sharedMemory, val, b1, expandedKey + 9 * N_COLS);
+}
+
+__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
+{
+	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
+}
diff --git a/backend/nvidia/nvcc_code/cuda_blake.hpp b/backend/nvidia/nvcc_code/cuda_blake.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..07ae16969fb969ceb87497b6ca68f041724665b8
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_blake.hpp
@@ -0,0 +1,193 @@
+#pragma once
+
+typedef struct {
+	uint32_t h[8], s[4], t[2];
+	int buflen, nullt;
+	uint8_t buf[64];
+} blake_state;
+
+#define U8TO32(p) \
+	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
+	((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
+
+#define U32TO8(p, v) \
+	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+
+#define BLAKE_ROT(x,n) ROTR32(x, n)
+#define BLAKE_G(a,b,c,d,e) \
+	v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \
+	v[d] = BLAKE_ROT(v[d] ^ v[a],16); \
+	v[c] += v[d];                     \
+	v[b] = BLAKE_ROT(v[b] ^ v[c],12); \
+	v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \
+	v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \
+	v[c] += v[d];                     \
+	v[b] = BLAKE_ROT(v[b] ^ v[c], 7);
+
+__constant__ uint8_t d_blake_sigma[14][16] =
+{
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}
+};
+__constant__ uint32_t d_blake_cst[16]
+= {
+	0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
+	0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
+	0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
+	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
+};
+
+__device__ void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block)
+{
+	uint32_t v[16], m[16], i;
+
+	for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
+	for (i = 0; i < 8;  ++i) v[i] = S->h[i];
+	v[ 8] = S->s[0] ^ 0x243F6A88;
+	v[ 9] = S->s[1] ^ 0x85A308D3;
+	v[10] = S->s[2] ^ 0x13198A2E;
+	v[11] = S->s[3] ^ 0x03707344;
+	v[12] = 0xA4093822;
+	v[13] = 0x299F31D0;
+	v[14] = 0x082EFA98;
+	v[15] = 0xEC4E6C89;
+
+	if (S->nullt == 0)
+	{
+		v[12] ^= S->t[0];
+		v[13] ^= S->t[0];
+		v[14] ^= S->t[1];
+		v[15] ^= S->t[1];
+	}
+
+	for (i = 0; i < 14; ++i)
+	{
+		BLAKE_G(0, 4,  8, 12,  0);
+		BLAKE_G(1, 5,  9, 13,  2);
+		BLAKE_G(2, 6, 10, 14,  4);
+		BLAKE_G(3, 7, 11, 15,  6);
+		BLAKE_G(3, 4,  9, 14, 14);
+		BLAKE_G(2, 7,  8, 13, 12);
+		BLAKE_G(0, 5, 10, 15,  8);
+		BLAKE_G(1, 6, 11, 12, 10);
+	}
+
+	for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
+	for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];
+}
+
+__device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen)
+{
+	int left = S->buflen >> 3;
+	int fill = 64 - left;
+
+	if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) 
+	{
+		memcpy((void *) (S->buf + left), (void *) data, fill);
+		S->t[0] += 512;
+		if (S->t[0] == 0) S->t[1]++;
+		cn_blake_compress(S, S->buf);
+		data += fill;
+		datalen -= (fill << 3);
+		left = 0;
+	}
+
+	while (datalen >= 512) 
+	{
+		S->t[0] += 512;
+		if (S->t[0] == 0) S->t[1]++;
+		cn_blake_compress(S, data);
+		data += 64;
+		datalen -= 512;
+	}
+
+	if (datalen > 0) 
+	{
+		memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
+		S->buflen = (left << 3) + datalen;
+	}
+	else 
+	{
+		S->buflen = 0;
+	}
+}
+
+__device__ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
+{
+	const uint8_t padding[] = 
+	{
+		0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+	};
+
+	uint8_t pa = 0x81, pb = 0x01;
+	uint8_t msglen[8];
+	uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
+	if (lo < (unsigned) S->buflen) hi++;
+	U32TO8(msglen + 0, hi);
+	U32TO8(msglen + 4, lo);
+
+	if (S->buflen == 440) 
+	{
+		S->t[0] -= 8;
+		cn_blake_update(S, &pa, 8);
+	} 
+	else 
+	{
+		if (S->buflen < 440) 
+		{
+			if (S->buflen == 0) S->nullt = 1;
+			S->t[0] -= 440 - S->buflen;
+			cn_blake_update(S, padding, 440 - S->buflen);
+		}
+		else 
+		{
+			S->t[0] -= 512 - S->buflen;
+			cn_blake_update(S, padding, 512 - S->buflen);
+			S->t[0] -= 440;
+			cn_blake_update(S, padding + 1, 440);
+			S->nullt = 1;
+		}
+		cn_blake_update(S, &pb, 8);
+		S->t[0] -= 8;
+	}
+	S->t[0] -= 64;
+	cn_blake_update(S, msglen, 64);
+
+	U32TO8(digest +  0, S->h[0]);
+	U32TO8(digest +  4, S->h[1]);
+	U32TO8(digest +  8, S->h[2]);
+	U32TO8(digest + 12, S->h[3]);
+	U32TO8(digest + 16, S->h[4]);
+	U32TO8(digest + 20, S->h[5]);
+	U32TO8(digest + 24, S->h[6]);
+	U32TO8(digest + 28, S->h[7]);
+}
+
+__device__ void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
+{
+	blake_state bs;
+	blake_state *S = (blake_state *)&bs;
+
+	S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372;
+	S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C;
+	S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19;
+	S->t[0] = S->t[1] = S->buflen = S->nullt = 0;
+	S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0;
+
+	cn_blake_update(S, (uint8_t *)in, inlen * 8);
+	cn_blake_final(S, (uint8_t *)out);
+}
diff --git a/backend/nvidia/nvcc_code/cuda_core.cu b/backend/nvidia/nvcc_code/cuda_core.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7590cf55baf80e387754f24c375e327b9cfba057
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_core.cu
@@ -0,0 +1,343 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#ifdef _WIN32
+#include <windows.h>
+extern "C" void compat_usleep(uint64_t waitTime)
+{
+    if (waitTime > 0)
+    {
+        if (waitTime > 100)
+        {
+            // use a waitable timer for larger intervals > 0.1ms
+
+            HANDLE timer;
+            LARGE_INTEGER ft;
+
+            ft.QuadPart = -(10*waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time
+
+            timer = CreateWaitableTimer(NULL, TRUE, NULL);
+            SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0);
+            WaitForSingleObject(timer, INFINITE);
+            CloseHandle(timer);
+        }
+        else
+        {
+            // use a polling loop for short intervals <= 100 microseconds
+
+            LARGE_INTEGER perfCnt, start, now;
+            __int64 elapsed;
+
+            QueryPerformanceFrequency(&perfCnt);
+            QueryPerformanceCounter(&start);
+            do {
+		SwitchToThread();
+                QueryPerformanceCounter((LARGE_INTEGER*) &now);
+                elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000);
+            } while ( elapsed < waitTime );
+        }
+    }
+}
+#else
+#include <unistd.h>
+extern "C" void compat_usleep(uint64_t waitTime)
+{
+	usleep(waitTime);
+}
+#endif
+
+#include "cryptonight.h"
+#include "cuda_extra.h"
+#include "cuda_aes.hpp"
+#include "cuda_device.hpp"
+
+/* sm_2X is limited to 2GB due to the small TLB
+ * therefore we never use 64bit indices
+ */
+#if defined(XMR_STAK_LARGEGRID) && (__CUDA_ARCH__ >= 300)
+typedef uint64_t IndexType;
+#else
+typedef int IndexType;
+#endif
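+
+/* Each GPU thread owns a 2 MiB CryptoNight scratchpad ((1 << 19) uint32_t entries),
+ * which is why d_long_state is indexed as `thread << 19` in the kernels below.
+ */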
+
+__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi )
+{
+	*product_hi = __umul64hi( multiplier, multiplicand );
+	return (multiplier * multiplicand );
+}
+
+template< typename T >
+__device__ __forceinline__ T loadGlobal64( T * const addr )
+{
+	T x;
+	asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
+	return x;
+}
+
+template< typename T >
+__device__ __forceinline__ T loadGlobal32( T * const addr )
+{
+	T x;
+	asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) );
+	return x;
+}
+
+
+template< typename T >
+__device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
+{
+	asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
+}
+
+__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 )
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init( sharedMemory );
+	__syncthreads( );
+
+	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
+	const int sub = ( threadIdx.x & 7 ) << 2;
+
+	const int batchsize = 0x80000 >> bfactor;
+	const int start = partidx * batchsize;
+	const int end = start + batchsize;
+
+	if ( thread >= threads )
+		return;
+
+	uint32_t key[40], text[4];
+
+	MEMCPY8( key, ctx_key1 + thread * 40, 20 );
+
+	if( partidx == 0 )
+	{
+		// first round
+		MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 );
+	}
+	else
+	{
+		// load previous text data
+		MEMCPY8( text, &long_state[( (uint64_t) thread << 19 ) + sub + start - 32], 2 );
+	}
+	__syncthreads( );
+	for ( int i = start; i < end; i += 32 )
+	{
+		cn_aes_pseudo_round_mut( sharedMemory, text, key );
+		MEMCPY8(&long_state[((uint64_t) thread << 19) + (sub + i)], text, 2);
+	}
+}
+
+/** avoid warning `unused parameter` */
+template< typename T >
+__forceinline__ __device__ void unusedVar( const T& )
+{
+}
+
+/** shuffle data for
+ *
+ * - this method can be used with all compute architectures
+ * - for <sm_30 shared memory is needed
+ *
+ * @param ptr pointer to shared memory, size must be `threadIdx.x * sizeof(uint32_t)`
+ *            value can be NULL for compute architecture >=sm_30
+ * @param sub thread number within the group, range [0;4)
+ * @param value value to share with other threads within the group
+ * @param src thread number within the group from where the data is read, range [0;4)
+ */
+__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
+{
+#if( __CUDA_ARCH__ < 300 )
+    ptr[sub] = val;
+    return ptr[src&3];
+#else
+    unusedVar( ptr );
+    unusedVar( sub );
+    return __shfl( val, src, 4 );
+#endif
+}
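+
+// Note: shuffle(sPtr, sub, a, 0), as used in phase 2 below, broadcasts lane 0's
+// value of `a` to all four threads of the group.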
+
+#ifdef XMR_STAK_THREADS
+__launch_bounds__( XMR_STAK_THREADS * 4 )
+#endif
+__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init( sharedMemory );
+
+	__syncthreads( );
+
+	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+	const int sub = threadIdx.x & 3;
+	const int sub2 = sub & 2;
+
+#if( __CUDA_ARCH__ < 300 )
+        extern __shared__ uint32_t shuffleMem[];
+        volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+#else
+        volatile uint32_t* sPtr = NULL;
+#endif
+	if ( thread >= threads )
+		return;
+
+	int i, k;
+        uint32_t j;
+	const int batchsize = ITER >> ( 2 + bfactor );
+	const int start = partidx * batchsize;
+	const int end = start + batchsize;
+	uint32_t * long_state = &d_long_state[(IndexType) thread << 19];
+	uint32_t * ctx_a = d_ctx_a + thread * 4;
+	uint32_t * ctx_b = d_ctx_b + thread * 4;
+	uint32_t a, d[2];
+	uint32_t t1[2], t2[2], res;
+
+	a = ctx_a[sub];
+	d[1] = ctx_b[sub];
+	#pragma unroll 2
+	for ( i = start; i < end; ++i )
+	{
+		#pragma unroll 2
+		for ( int x = 0; x < 2; ++x )
+		{
+			j = ( ( shuffle(sPtr,sub, a, 0) & 0x1FFFF0 ) >> 2 ) + sub;
+
+			const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
+			const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1);
+			const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2);
+			const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3);
+			d[x] = a ^
+				t_fn0( x_0 & 0xff ) ^
+				t_fn1( (x_1 >> 8) & 0xff ) ^
+				t_fn2( (x_2 >> 16) & 0xff ) ^
+				t_fn3( ( x_3 >> 24 ) );
+
+
+			//XOR_BLOCKS_DST(c, b, &long_state[j]);
+			t1[0] = shuffle(sPtr,sub, d[x], 0);
+			//long_state[j] = d[0] ^ d[1];
+			storeGlobal32( long_state + j, d[0] ^ d[1] );
+
+			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
+			j = ( ( *t1 & 0x1FFFF0 ) >> 2 ) + sub;
+
+			uint32_t yy[2];
+			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
+			uint32_t zz[2];
+			zz[0] = shuffle(sPtr,sub, yy[0], 0);
+			zz[1] = shuffle(sPtr,sub, yy[1], 0);
+
+			t1[1] = shuffle(sPtr,sub, d[x], 1);
+			#pragma unroll
+			for ( k = 0; k < 2; k++ )
+				t2[k] = shuffle(sPtr,sub, a, k + sub2);
+
+            *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );
+
+			res = *( (uint64_t *) t2 )  >> ( sub & 1 ? 32 : 0 );
+
+			storeGlobal32( long_state + j, res );
+			a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
+		}
+	}
+
+	if ( bfactor > 0 )
+	{
+		ctx_a[sub] = a;
+		ctx_b[sub] = d[1];
+	}
+}
+
+__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init( sharedMemory );
+	__syncthreads( );
+
+	int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
+	int sub = ( threadIdx.x & 7 ) << 2;
+
+	const int batchsize = 0x80000 >> bfactor;
+	const int start = partidx * batchsize;
+	const int end = start + batchsize;
+
+	if ( thread >= threads )
+		return;
+
+	uint32_t key[40], text[4];
+	MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
+	MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );
+
+	__syncthreads( );
+	for ( int i = start; i < end; i += 32 )
+	{
+#pragma unroll
+		for ( int j = 0; j < 4; ++j )
+			text[j] ^= long_state[((IndexType) thread << 19) + (sub + i + j)];
+
+		cn_aes_pseudo_round_mut( sharedMemory, text, key );
+	}
+
+	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
+}
+
+extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx)
+{
+	dim3 grid( ctx->device_blocks );
+	dim3 block( ctx->device_threads );
+	dim3 block4( ctx->device_threads << 2 );
+	dim3 block8( ctx->device_threads << 3 );
+
+	int partcount = 1 << ctx->device_bfactor;
+
+	/* bfactor for phase 1 and 3
+	 *
+	 * Phases 1 and 3 consume less time than phase 2, therefore those kernels are
+	 * only split if the user defined a `bfactor >= 5`.
+	 */
+	int bfactorOneThree = ctx->device_bfactor - 4;
+	if( bfactorOneThree < 0 )
+		bfactorOneThree = 0;
+
+	int partcountOneThree = 1 << bfactorOneThree;
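+	// e.g. with the Windows default bfactor = 6: partcount = 64 phase-2 launches,
+	// bfactorOneThree = 2, i.e. 4 launches each for phase 1 and phase 3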
+
+	for ( int i = 0; i < partcountOneThree; i++ )
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+			bfactorOneThree, i,
+			ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 ));
+
+		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+	}
+	if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+
+	for ( int i = 0; i < partcount; i++ )
+	{
+        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<<<
+            grid,
+            block4,
+            block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+        >>>(
+            ctx->device_blocks*ctx->device_threads,
+            ctx->device_bfactor,
+            i,
+            ctx->d_long_state,
+            ctx->d_ctx_a,
+            ctx->d_ctx_b
+        ));
+
+		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+	}
+
+	for ( int i = 0; i < partcountOneThree; i++ )
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+			bfactorOneThree, i,
+			ctx->d_long_state,
+			ctx->d_ctx_state, ctx->d_ctx_key2 ));
+	}
+}
diff --git a/backend/nvidia/nvcc_code/cuda_device.hpp b/backend/nvidia/nvcc_code/cuda_device.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..078c16537cdc803ed958d8170c8c7b7b63eeaa30
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_device.hpp
@@ -0,0 +1,30 @@
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <iostream>
+#include <string>
+
+/** execute and check a CUDA api command
+ *
+ * @param id gpu id (thread id)
+ * @param ... CUDA api command
+ */
+#define CUDA_CHECK(id, ...) {														\
+	cudaError_t error = __VA_ARGS__;											\
+	if(error!=cudaSuccess){														\
+		std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__ << std::endl; \
+		throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error)));	\
+	}																			\
+}																				\
+( (void) 0 )
+
+/** execute and check a CUDA kernel
+ *
+ * @param id gpu id (thread id)
+ * @param ... CUDA kernel call
+ */
+#define CUDA_CHECK_KERNEL(id, ...)													\
+	__VA_ARGS__;																\
+	CUDA_CHECK(id, cudaGetLastError())
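+
+/* Illustrative usage (hypothetical names, not part of the original sources):
+ *   CUDA_CHECK(ctx->device_id, cudaMemset(ptr, 0, size));
+ *   CUDA_CHECK_KERNEL(ctx->device_id, some_kernel<<<grid, block>>>(args));
+ * Both macros throw std::runtime_error if the API call, respectively the
+ * preceding kernel launch, reported an error.
+ */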
diff --git a/backend/nvidia/nvcc_code/cuda_extra.cu b/backend/nvidia/nvcc_code/cuda_extra.cu
new file mode 100644
index 0000000000000000000000000000000000000000..94a4adbbf40da13dd9210423a85c0876a2681e2b
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -0,0 +1,345 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <device_functions.hpp>
+
+#ifdef __CUDACC__
+__constant__
+#else
+const
+#endif
+uint64_t keccakf_rndc[24] ={
+	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
+};
+
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+
+#include "cryptonight.h"
+#include "cuda_extra.h"
+#include "cuda_keccak.hpp"
+#include "cuda_blake.hpp"
+#include "cuda_groestl.hpp"
+#include "cuda_jh.hpp"
+#include "cuda_skein.hpp"
+#include "cuda_device.hpp"
+
+__constant__ uint8_t d_sub_byte[16][16] ={
+	{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
+	{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
+	{0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 },
+	{0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 },
+	{0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 },
+	{0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf },
+	{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 },
+	{0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 },
+	{0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 },
+	{0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb },
+	{0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 },
+	{0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 },
+	{0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a },
+	{0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e },
+	{0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf },
+	{0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }
+};
+
+__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * __restrict__ data )
+{
+	int i, j;
+	uint8_t temp[4];
+	const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
+
+	MEMSET4( key, 0, 40 );
+	MEMCPY4( key, data, 8 );
+
+#pragma unroll
+	for ( i = 8; i < 40; i++ )
+	{
+		*(uint32_t *) temp = key[i - 1];
+		if ( i % 8 == 0 )
+		{
+			*(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 );
+			for ( j = 0; j < 4; j++ )
+				temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f];
+			*(uint32_t *) temp ^= aes_gf[i / 8 - 1];
+		}
+		else
+		{
+			if ( i % 8 == 4 )
+			{
+#pragma unroll
+				for ( j = 0; j < 4; j++ )
+					temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f];
+			}
+		}
+
+		key[i] = key[( i - 8 )] ^ *(uint32_t *) temp;
+	}
+}
+
+__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
+{
+	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
+
+	if ( thread >= threads )
+		return;
+
+	uint32_t ctx_state[50];
+	uint32_t ctx_a[4];
+	uint32_t ctx_b[4];
+	uint32_t ctx_key1[40];
+	uint32_t ctx_key2[40];
+	uint32_t input[21];
+
+	memcpy( input, d_input, len );
+	//*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
+	uint32_t nonce = startNonce + thread;
+	for ( int i = 0; i < sizeof (uint32_t ); ++i )
+		( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; // copy byte-wise: the nonce at offset 39 is not 4-byte aligned
+
+	cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state );
+	cryptonight_aes_set_key( ctx_key1, ctx_state );
+	cryptonight_aes_set_key( ctx_key2, ctx_state + 8 );
+	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
+	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
+
+	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
+	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
+	memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
+	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
+}
+
+__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state )
+{
+	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if ( thread >= threads )
+		return;
+
+	int i;
+	uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50;
+	uint64_t hash[4];
+	uint32_t state[50];
+
+#pragma unroll
+	for ( i = 0; i < 50; i++ )
+		state[i] = ctx_state[i];
+
+	cn_keccakf2( (uint64_t *) state );
+
+	switch ( ( (uint8_t *) state )[0] & 0x03 )
+	{
+	case 0:
+		cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash );
+		break;
+	case 1:
+		cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash );
+		break;
+	case 2:
+		cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash );
+		break;
+	case 3:
+		cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash );
+		break;
+	default:
+		break;
+	}
+
+	// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+	// and expect an accurate result for target > 32-bit without implementing carries
+
+	if ( hash[3] < target )
+	{
+		uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+
+		if(idx < 10)
+			d_res_nonce[idx] = thread;
+	}
+}
+
+extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len )
+{
+	ctx->inputlen = len;
+	CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, cudaMemcpyHostToDevice ));
+}
+
+extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
+{
+	cudaError_t err;
+	err = cudaSetDevice(ctx->device_id);
+	if(err != cudaSuccess)
+	{
+		printf("GPU %d: %s", ctx->device_id, cudaGetErrorString(err));
+		return 0;
+	}
+
+	cudaDeviceReset();
+	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+
+	size_t wsize = ctx->device_blocks * ctx->device_threads;
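+	// wsize is the number of concurrent hashes; each one needs a 2 MiB (MEMORY)
+	// scratchpad plus the per-hash state buffers allocated below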
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, (size_t)MEMORY * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_text, 32 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, 4 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
+	return 1;
+}
+
+extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce)
+{
+	int threadsperblock = 128;
+	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
+
+	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
+	dim3 block( threadsperblock );
+
+	CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+		ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+}
+
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce)
+{
+	int threadsperblock = 128;
+	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
+
+	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
+	dim3 block( threadsperblock );
+
+	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
+
+	CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state ));
+
+	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+	CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+
+	/* The result counter on the device side is a plain 32-bit value, so it can
+	 * exceed the size of the 10-entry nonce buffer; in that case clamp rescount
+	 * to 10.
+	 */
+	if(*rescount > 10)
+		*rescount = 10;
+	for(int i=0; i < *rescount; i++)
+		resnonce[i] += startNonce;
+}
+
+extern "C" int cuda_get_devicecount( int* deviceCount)
+{
+	cudaError_t err;
+	*deviceCount = 0;
+	err = cudaGetDeviceCount(deviceCount);
+	if(err != cudaSuccess)
+	{
+		if(err == cudaErrorNoDevice)
+			printf("ERROR: NVIDIA no CUDA device found!\n");
+		else if(err == cudaErrorInsufficientDriver)
+			printf("WARNING: NVIDIA Insufficient driver!\n");
+		else
+			printf("WARNING: NVIDIA Unable to query number of CUDA devices!\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
+{
+	cudaError_t err;
+	int version;
+
+	err = cudaDriverGetVersion(&version);
+	if(err != cudaSuccess)
+	{
+		printf("Unable to query CUDA driver version! Is an nVidia driver installed?\n");
+		return 0;
+	}
+
+	if(version < CUDART_VERSION)
+	{
+		printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
+		return 0;
+	}
+
+	int GPU_N;
+	if(cuda_get_devicecount(&GPU_N) == 0)
+	{
+		return 0;
+	}
+
+	if(ctx->device_id >= GPU_N)
+	{
+		printf("Invalid device ID!\n");
+		return 0;
+	}
+
+	cudaDeviceProp props;
+	err = cudaGetDeviceProperties(&props, ctx->device_id);
+	if(err != cudaSuccess)
+	{
+		printf("\nGPU %d: %s\n%s line %d\n", ctx->device_id, cudaGetErrorString(err), __FILE__, __LINE__);
+		return 0;
+	}
+
+	ctx->device_name = strdup(props.name);
+	ctx->device_mpcount = props.multiProcessorCount;
+	ctx->device_arch[0] = props.major;
+	ctx->device_arch[1] = props.minor;
+
+	// set all device options marked as auto (-1) to a valid value
+	if(ctx->device_blocks == -1)
+	{
+		/* good values based on my experience:
+		 *   - 3 * SMX count for >= sm_30
+		 *   - 2 * SMX count for <  sm_30
+		 */
+		ctx->device_blocks = props.multiProcessorCount *
+			( props.major < 3 ? 2 : 3 );
+	}
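+	// e.g. a GPU with 20 multiprocessors and compute capability >= 3.0 gets 3 * 20 = 60 blocks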
+	if(ctx->device_threads == -1)
+	{
+		/* sm_20 devices can only run 512 threads per CUDA block;
+		 * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` start
+		 * `8 * ctx->device_threads` threads per block
+		 */
+		ctx->device_threads = 64;
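+		// 64 threads -> phases 1 and 3 launch 8 * 64 = 512 threads per block,
+		// exactly the sm_20 limit mentioned above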
+		if(props.major < 6)
+		{
+			// try to stay under 950 concurrent hashes (1900 MiB of scratchpad memory)
+			while(ctx->device_blocks * ctx->device_threads >= 950 && ctx->device_threads > 2)
+			{
+				ctx->device_threads /= 2;
+			}
+		}
+
+		// stay within 85% of the available RAM
+		while(ctx->device_threads > 2)
+		{
+			size_t freeMemory = 0;
+			size_t totalMemory = 0;
+			CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
+			freeMemory = (freeMemory * size_t(85)) / 100;
+			if( freeMemory > (size_t(ctx->device_blocks) * size_t(ctx->device_threads) * size_t(2u * 1024u * 1024u)) )
+				break;
+			else
+				ctx->device_threads /= 2;
+		}
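+		/* Illustrative example (Pascal or newer, so the 950-hash cap above does
+		 * not apply): 60 blocks * 64 threads = 3840 hashes need 3840 * 2 MiB =
+		 * 7680 MiB of scratchpad. On a card with 4 GiB free (85% usable is about
+		 * 3481 MiB) the loop halves device_threads twice, down to 16
+		 * (60 * 16 * 2 MiB = 1920 MiB).
+		 */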
+	}
+
+	return 1;
+}
diff --git a/backend/nvidia/nvcc_code/cuda_extra.h b/backend/nvidia/nvcc_code/cuda_extra.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ccdcd6451805e6e4fa6c9a235bc184141749858
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_extra.h
@@ -0,0 +1,104 @@
+#pragma once
+
+#ifdef __INTELLISENSE__
+#define __CUDA_ARCH__ 520
+/* avoid red underlining */
+
+struct uint3
+{
+	unsigned int x, y, z;
+};
+
+struct uint3  threadIdx;
+struct uint3  blockIdx;
+struct uint3  blockDim;
+#define __funnelshift_r(a,b,c) 1
+#define __syncthreads()
+#define asm(x)
+#define __shfl(a,b,c) 1
+#endif
+
+#define MEMORY         (1 << 21) // 2 MiB / 2097152 B
+#define ITER           (1 << 20) // 1048576
+#define AES_BLOCK_SIZE  16
+#define AES_KEY_SIZE    32
+#define INIT_SIZE_BLK   8
+#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B
+
+#define C32(x)    ((uint32_t)(x ## U))
+#define T32(x) ((x) & C32(0xFFFFFFFF))
+
+#if __CUDA_ARCH__ >= 350
+__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset)
+{
+	uint2 result;
+	if(offset >= 32)
+	{
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} 
+	else 
+	{
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return  __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#define ROTL64(x, n) (cuda_ROTL64(x, n))
+#else
+#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
+#endif
+
+#if __CUDA_ARCH__ < 350
+#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
+#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
+#endif
+
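+// The helper macros below take element counts, not byte counts:
+// MEMSET8/MEMCPY8 operate on 64-bit words, MEMSET4/MEMCPY4 on 32-bit words.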
+#define MEMSET8(dst,what,cnt) { \
+	int i_memset8; \
+	uint64_t *out_memset8 = (uint64_t *)(dst); \
+	for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
+		out_memset8[i_memset8] = (what); }
+
+#define MEMSET4(dst,what,cnt) { \
+	int i_memset4; \
+	uint32_t *out_memset4 = (uint32_t *)(dst); \
+	for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
+		out_memset4[i_memset4] = (what); }
+
+#define MEMCPY8(dst,src,cnt) { \
+	int i_memcpy8; \
+	uint64_t *in_memcpy8 = (uint64_t *)(src); \
+	uint64_t *out_memcpy8 = (uint64_t *)(dst); \
+	for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
+		out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }
+
+#define MEMCPY4(dst,src,cnt) { \
+	int i_memcpy4; \
+	uint32_t *in_memcpy4 = (uint32_t *)(src); \
+	uint32_t *out_memcpy4 = (uint32_t *)(dst); \
+	for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
+		out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }
+
+#define XOR_BLOCKS(a,b) { \
+	((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
+	((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }
+
+#define XOR_BLOCKS_DST(x,y,z) { \
+	((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
+	((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }
+
+#define MUL_SUM_XOR_DST(a,c,dst) { \
+	const uint64_t dst0 = ((uint64_t *)dst)[0]; \
+	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \
+	hi += ((uint64_t *)c)[0]; \
+	((uint64_t *)c)[0] = dst0 ^ hi; \
+	((uint64_t *)dst)[0] = hi; \
+	((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \
+	}
+
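+// E2I extracts a scratchpad block index from a 64-bit value: bits 4..20 select
+// one of the 0x20000 16-byte AES blocks inside the 2 MiB (MEMORY) scratchpad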
+#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
+
diff --git a/backend/nvidia/nvcc_code/cuda_groestl.hpp b/backend/nvidia/nvcc_code/cuda_groestl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a37934cb8d4e78ad168dcf41edb31ce2d2bbe98b
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_groestl.hpp
@@ -0,0 +1,357 @@
+#pragma once
+
+#define GROESTL_ROWS 8
+#define GROESTL_LENGTHFIELDLEN GROESTL_ROWS
+#define GROESTL_COLS512 8
+
+#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512)
+
+#define GROESTL_ROUNDS512 10
+#define GROESTL_HASH_BIT_LEN 256
+
+#define GROESTL_ROTL32(v, n) ROTL32(v, n)
+
+
+#define li_32(h) 0x##h##u
+#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
+
+#define u32BIG(a)	\
+	((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00)))
+
+typedef struct {
+	uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)];            /* actual state */
+	uint32_t block_counter1,
+	block_counter2;         /* message block counter(s) */
+	BitSequence buffer[GROESTL_SIZE512];      /* data buffer */
+	int buf_ptr;              /* data buffer pointer */
+	int bits_in_last_byte;    /* no. of message bits in last byte of data buffer */
+} groestlHashState;
+
+
+__constant__ uint32_t d_groestl_T[512] =
+{
+  0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
+, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
+, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
+, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
+, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
+, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
+, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
+, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
+, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
+, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
+, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
+, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
+, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
+, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
+, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
+, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
+, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
+, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
+, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
+, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
+, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
+, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
+, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
+, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
+, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
+, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
+, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
+, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
+, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
+, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
+, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
+, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e
+};
+
+#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \
+	{ temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
+		v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
+		v1 = temp_var; }
+
+#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \
+	tu = d_groestl_T[2*(uint32_t)x[4*c0+0]];	\
+	tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1];	\
+	tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]];	\
+	tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1];	\
+	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t)		\
+	tu ^= tv1;									\
+	tl ^= tv2;									\
+	tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]];	\
+	tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1];	\
+	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t)		\
+	tu ^= tv1;									\
+	tl ^= tv2;   								\
+	tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]];	\
+	tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1];	\
+	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t)		\
+	tu ^= tv1;									\
+	tl ^= tv2;									\
+	tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]];	\
+	tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1];	\
+	tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]];	\
+	tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1];	\
+	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t)		\
+	tl ^= tv1;									\
+	tu ^= tv2;									\
+	tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]];	\
+	tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1];	\
+	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t)		\
+	tl ^= tv1;									\
+	tu ^= tv2;   								\
+	tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]];	\
+	tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1];	\
+	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t)		\
+	tl ^= tv1;									\
+	tu ^= tv2;									\
+	y[i] = tu;									\
+	y[i+1] = tl;
+
+__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
+{
+	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
+	uint32_t* x32 = (uint32_t*)x;
+	x32[ 0] ^= 0x00000000^r;
+	x32[ 2] ^= 0x00000010^r;
+	x32[ 4] ^= 0x00000020^r;
+	x32[ 6] ^= 0x00000030^r;
+	x32[ 8] ^= 0x00000040^r;
+	x32[10] ^= 0x00000050^r;
+	x32[12] ^= 0x00000060^r;
+	x32[14] ^= 0x00000070^r;
+	GROESTL_COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+}
+
+__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
+{
+	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
+	uint32_t* x32 = (uint32_t*)x;
+	x32[ 0] = ~x32[ 0];
+	x32[ 1] ^= 0xffffffff^r;
+	x32[ 2] = ~x32[ 2];
+	x32[ 3] ^= 0xefffffff^r;
+	x32[ 4] = ~x32[ 4];
+	x32[ 5] ^= 0xdfffffff^r;
+	x32[ 6] = ~x32[ 6];
+	x32[ 7] ^= 0xcfffffff^r;
+	x32[ 8] = ~x32[ 8];
+	x32[ 9] ^= 0xbfffffff^r;
+	x32[10] = ~x32[10];
+	x32[11] ^= 0xafffffff^r;
+	x32[12] = ~x32[12];
+	x32[13] ^= 0x9fffffff^r;
+	x32[14] = ~x32[14];
+	x32[15] ^= 0x8fffffff^r;
+	GROESTL_COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+}
+
+__device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m)
+{
+	int i;
+	uint32_t Ptmp[2*GROESTL_COLS512];
+	uint32_t Qtmp[2*GROESTL_COLS512];
+	uint32_t y[2*GROESTL_COLS512];
+	uint32_t z[2*GROESTL_COLS512];
+
+	for (i = 0; i < 2*GROESTL_COLS512; i++) 
+	{
+		z[i] = m[i];
+		Ptmp[i] = h[i]^m[i];
+	}
+
+	cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000);
+	cn_groestl_RND512Q((uint8_t*)y, z, 0x01000000);
+	cn_groestl_RND512Q((uint8_t*)z, y, 0x02000000);
+	cn_groestl_RND512Q((uint8_t*)y, z, 0x03000000);
+	cn_groestl_RND512Q((uint8_t*)z, y, 0x04000000);
+	cn_groestl_RND512Q((uint8_t*)y, z, 0x05000000);
+	cn_groestl_RND512Q((uint8_t*)z, y, 0x06000000);
+	cn_groestl_RND512Q((uint8_t*)y, z, 0x07000000);
+	cn_groestl_RND512Q((uint8_t*)z, y, 0x08000000);
+	cn_groestl_RND512Q((uint8_t*)y, Qtmp, 0x09000000);
+
+	cn_groestl_RND512P((uint8_t*)Ptmp, y, 0x00000000);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000001);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000002);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000003);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000004);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000005);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000006);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000007);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
+	cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009);
+
+	for (i = 0; i < 2*GROESTL_COLS512; i++)
+		h[i] ^= Ptmp[i]^Qtmp[i];
+}
+
+__device__ void cn_groestl_outputtransformation(groestlHashState *ctx)
+{
+	int j;
+	uint32_t temp[2*GROESTL_COLS512];
+	uint32_t y[2*GROESTL_COLS512];
+	uint32_t z[2*GROESTL_COLS512];
+
+	for (j = 0; j < 2*GROESTL_COLS512; j++)
+		temp[j] = ctx->chaining[j];
+
+	cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000001);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000002);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000003);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000004);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000005);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000006);
+	cn_groestl_RND512P((uint8_t*)y, z, 0x00000007);
+	cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
+	cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009);
+
+	for (j = 0; j < 2*GROESTL_COLS512; j++)
+		ctx->chaining[j] ^= temp[j];
+}
+
+__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx,
+	const uint8_t * __restrict__ input, int msglen)
+{
+	for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) 
+	{
+		cn_groestl_F512(ctx->chaining,(uint32_t*)input);
+		ctx->block_counter1++;
+
+		if (ctx->block_counter1 == 0) 
+			ctx->block_counter2++;
+	}
+}
+
+__device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx, 
+	BitSequence* __restrict__  output)
+{
+	int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8;
+	uint8_t *s = (BitSequence*)ctx->chaining;
+
+	if (ctx->bits_in_last_byte) 
+	{
+		ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte);
+		ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte);
+		ctx->bits_in_last_byte = 0;
+	}
+	else
+	{
+		ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
+	}
+
+	if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) 
+	{
+		while (ctx->buf_ptr < GROESTL_SIZE512) 
+			ctx->buffer[(int)ctx->buf_ptr++] = 0;
+
+		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
+		ctx->buf_ptr = 0;
+	}
+
+	while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+		ctx->buffer[(int)ctx->buf_ptr++] = 0;
+
+	ctx->block_counter1++;
+	if (ctx->block_counter1 == 0)
+		ctx->block_counter2++;
+	ctx->buf_ptr = GROESTL_SIZE512;
+
+	while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t))
+	{
+		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
+		ctx->block_counter1 >>= 8;
+	}
+	while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	{
+		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
+		ctx->block_counter2 >>= 8;
+	}
+	cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
+	cn_groestl_outputtransformation(ctx);
+
+	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++)
+		output[j] = s[i];
+
+	for (i = 0; i < GROESTL_COLS512; i++)
+		ctx->chaining[i] = 0;
+	for (i = 0; i < GROESTL_SIZE512; i++)
+		ctx->buffer[i] = 0;
+}
+
+__device__ void cn_groestl_update(groestlHashState* __restrict__ ctx,
+	const BitSequence* __restrict__ input, DataLength databitlen)
+{
+	int index = 0;
+	int msglen = (int)(databitlen/8);
+	int rem = (int)(databitlen%8);
+
+	if (ctx->buf_ptr) 
+	{
+		while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen)
+			ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+
+		if (ctx->buf_ptr < GROESTL_SIZE512) 
+		{
+			if (rem) 
+			{
+				ctx->bits_in_last_byte = rem;
+				ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+			}
+			return;
+		}
+
+		ctx->buf_ptr = 0;
+		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
+	}
+
+	cn_groestl_transform(ctx, input+index, msglen-index);
+	index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512;
+
+	while (index < msglen)
+		ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+
+	if (rem)
+	{
+		ctx->bits_in_last_byte = rem;
+		ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+	}
+}
+
+__device__ void cn_groestl_init(groestlHashState* ctx)
+{
+	int i = 0;
+
+	for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++)
+		ctx->chaining[i] = 0;
+
+	ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN);
+	ctx->buf_ptr = 0;
+	ctx->block_counter1 = 0;
+	ctx->block_counter2 = 0;
+	ctx->bits_in_last_byte = 0;
+}
+
+__device__ void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+{
+	DataLength databitlen = len << 3;
+	groestlHashState context;
+
+	cn_groestl_init(&context);
+	cn_groestl_update(&context, data, databitlen);
+	cn_groestl_final(&context, hashval);
+}
diff --git a/backend/nvidia/nvcc_code/cuda_jh.hpp b/backend/nvidia/nvcc_code/cuda_jh.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..679046ecf88451d7ab696754599d366ea174c0e5
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_jh.hpp
@@ -0,0 +1,301 @@
+#include <stdint.h>
+
+typedef struct {
+	int hashbitlen;
+	unsigned long long databitlen;
+	unsigned long long datasize_in_buffer;
+	uint64_t x[8][2];
+	unsigned char buffer[64];
+} jhHashState;
+
+__constant__ unsigned char d_JH256_H0[512] =
+{
+	0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1,
+	0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3,
+	0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77,
+	0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8,
+	0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62,
+	0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c,
+	0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf,
+	0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69
+};
+
+__constant__ unsigned char d_E8_rc[42][32] =
+{
+	{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
+	{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
+	{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
+	{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
+	{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
+	{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
+	{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
+	{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
+	{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
+	{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
+	{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
+	{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
+	{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
+	{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
+	{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
+	{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
+	{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
+	{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
+	{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
+	{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
+	{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
+	{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
+	{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
+	{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
+	{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
+	{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
+	{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
+	{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
+	{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
+	{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
+	{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
+	{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
+	{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
+	{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
+	{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
+	{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
+	{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
+	{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
+	{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
+	{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
+	{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
+	{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}
+};
+
+#define JH_SWAP1(x)   (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
+#define JH_SWAP2(x)   (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
+#define JH_SWAP4(x)   (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
+#define JH_SWAP8(x)   (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
+#define JH_SWAP16(x)  (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
+#define JH_SWAP32(x)  (x) = (((x) << 32) | ((x) >> 32));
+
+#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \
+	(m4) ^= (m1);                \
+	(m5) ^= (m2);                \
+	(m6) ^= (m0) ^ (m3);         \
+	(m7) ^= (m0);                \
+	(m0) ^= (m5);                \
+	(m1) ^= (m6);                \
+	(m2) ^= (m4) ^ (m7);         \
+	(m3) ^= (m4);
+
+#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
+	m3  = ~(m3);                  \
+	m7  = ~(m7);                  \
+	m0 ^= ((~(m2)) & (cc0));      \
+	m4 ^= ((~(m6)) & (cc1));      \
+	temp0 = (cc0) ^ ((m0) & (m1));\
+	temp1 = (cc1) ^ ((m4) & (m5));\
+	m0 ^= ((m2) & (m3));          \
+	m4 ^= ((m6) & (m7));          \
+	m3 ^= ((~(m1)) & (m2));       \
+	m7 ^= ((~(m5)) & (m6));       \
+	m1 ^= ((m0) & (m2));          \
+	m5 ^= ((m4) & (m6));          \
+	m2 ^= ((m0) & (~(m3)));       \
+	m6 ^= ((m4) & (~(m7)));       \
+	m0 ^= ((m1) | (m3));          \
+	m4 ^= ((m5) | (m7));          \
+	m3 ^= ((m1) & (m2));          \
+	m7 ^= ((m5) & (m6));          \
+	m1 ^= (temp0 & (m0));         \
+	m5 ^= (temp1 & (m4));         \
+	m2 ^= temp0;                  \
+	m6 ^= temp1;
+
+__device__ void cn_jh_E8(jhHashState *state)
+{
+	uint64_t i,roundnumber,temp0,temp1;
+
+	for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) 
+	{
+		for (i = 0; i < 2; i++)
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]);
+		}
+
+		for (i = 0; i < 2; i++)
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]);
+		}
+
+		for (i = 0; i < 2; i++)
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]);
+		}
+
+		for (i = 0; i < 2; i++)
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]);
+		}
+
+		for (i = 0; i < 2; i++)
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]);
+		}
+
+		for (i = 0; i < 2; i++)
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]);
+		}
+
+		for (i = 0; i < 2; i++) 
+		{
+			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] );
+			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+		}
+
+		for (i = 1; i < 8; i = i+2) 
+		{
+			temp0 = state->x[i][0];
+			state->x[i][0] = state->x[i][1];
+			state->x[i][1] = temp0;
+		}
+	}
+}
+
+__device__ void cn_jh_F8(jhHashState *state)
+{
+	uint64_t i;
+
+	for (i = 0; i < 8; i++)
+		state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i];
+
+	cn_jh_E8(state);
+
+	for (i = 0; i < 8; i++)
+		state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i];
+}
+
+__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+{
+	DataLength index;
+
+	state->databitlen += databitlen;
+	index = 0;
+
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  ) 
+	{
+		if ( (databitlen & 7) == 0 )
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3));
+		else
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1);
+		state->datasize_in_buffer += databitlen;
+		databitlen = 0;
+	}
+
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512)  )
+	{
+		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
+		index = 64-(state->datasize_in_buffer >> 3);
+		databitlen = databitlen - (512 - state->datasize_in_buffer);
+		cn_jh_F8(state);
+		state->datasize_in_buffer = 0;
+	}
+
+	for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512)
+	{
+		memcpy(state->buffer, data+index, 64);
+		cn_jh_F8(state);
+	}
+
+	if ( databitlen > 0) 
+	{
+		if ((databitlen & 7) == 0)
+			memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
+		else
+			memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
+		state->datasize_in_buffer = databitlen;
+	}
+}
+
+/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
+__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval)
+{
+	unsigned int i;
+	//uint32_t *bufptr = (uint32_t *)state->buffer;
+
+	if ( (state->databitlen & 0x1ff) == 0 )
+	{
+		/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
+		memset(state->buffer, 0, 64);
+		//for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000;
+		state->buffer[0]  = 0x80;
+		state->buffer[63] = state->databitlen & 0xff;
+		state->buffer[62] = (state->databitlen >> 8)  & 0xff;
+		state->buffer[61] = (state->databitlen >> 16) & 0xff;
+		state->buffer[60] = (state->databitlen >> 24) & 0xff;
+		state->buffer[59] = (state->databitlen >> 32) & 0xff;
+		state->buffer[58] = (state->databitlen >> 40) & 0xff;
+		state->buffer[57] = (state->databitlen >> 48) & 0xff;
+		state->buffer[56] = (state->databitlen >> 56) & 0xff;
+		cn_jh_F8(state);
+	}
+	else 
+	{
+		/*set the rest of the bytes in the buffer to 0*/
+		if ( (state->datasize_in_buffer & 7) == 0)
+		{
+			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
+				state->buffer[i] = 0;
+		}
+		else
+		{
+			for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++)
+				state->buffer[i] = 0;
+		}
+
+		/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
+		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
+
+		cn_jh_F8(state);
+		memset(state->buffer, 0, 64);
+		//for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000;
+		state->buffer[63] = state->databitlen & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
+		state->buffer[61] = (state->databitlen >> 16) & 0xff;
+		state->buffer[60] = (state->databitlen >> 24) & 0xff;
+		state->buffer[59] = (state->databitlen >> 32) & 0xff;
+		state->buffer[58] = (state->databitlen >> 40) & 0xff;
+		state->buffer[57] = (state->databitlen >> 48) & 0xff;
+		state->buffer[56] = (state->databitlen >> 56) & 0xff;
+		cn_jh_F8(state);
+	}
+
+	memcpy(hashval,(unsigned char*)state->x+64+32,32);
+}
+
+__device__ void cn_jh_init(jhHashState *state, int hashbitlen)
+{
+	state->databitlen = 0;
+	state->datasize_in_buffer = 0;
+	state->hashbitlen = hashbitlen;
+	memcpy(state->x, d_JH256_H0, 128);
+}
+
+__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+{
+	int hashbitlen = 256;
+	DataLength databitlen = len << 3;
+	jhHashState state;
+
+	cn_jh_init(&state, hashbitlen);
+	cn_jh_update(&state, data, databitlen);
+	cn_jh_final(&state, hashval);
+}
diff --git a/backend/nvidia/nvcc_code/cuda_keccak.hpp b/backend/nvidia/nvcc_code/cuda_keccak.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..99c651645ecbb97fcb810cc3d0ab1a82596df198
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_keccak.hpp
@@ -0,0 +1,197 @@
+#if __CUDA_ARCH__ >= 350
+	__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset)
+	{
+		uint2 result;
+		if(offset >= 32)
+		{
+			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		}
+		else
+		{
+			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		}
+		return  __double_as_longlong(__hiloint2double(result.y, result.x));
+	}
+	#define rotl64_1(x, y) (cuda_rotl64((x), (y)))
+#else
+	#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y))))
+#endif
+
+#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y))
+#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
+
+__device__ __forceinline__ void cn_keccakf2(uint64_t *s)
+{
+	uint8_t i;
+
+	for(i = 0; i < 24; ++i)
+	{
+		uint64_t bc[5], tmpxor[5], tmp1, tmp2;
+
+		tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
+		bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
+		bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
+		bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
+		bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
+
+		tmp1 = s[1] ^ bc[0];
+
+		s[0] ^= bc[4];
+		s[1] = rotl64_2(s[6] ^ bc[0], 12);
+		s[6] = rotl64_1(s[9] ^ bc[3], 20);
+		s[9] = rotl64_2(s[22] ^ bc[1], 29);
+		s[22] = rotl64_2(s[14] ^ bc[3], 7);
+		s[14] = rotl64_1(s[20] ^ bc[4], 18);
+		s[20] = rotl64_2(s[2] ^ bc[1], 30);
+		s[2] = rotl64_2(s[12] ^ bc[1], 11);
+		s[12] = rotl64_1(s[13] ^ bc[2], 25);
+		s[13] = rotl64_1(s[19] ^ bc[3], 8);
+		s[19] = rotl64_2(s[23] ^ bc[2], 24);
+		s[23] = rotl64_2(s[15] ^ bc[4], 9);
+		s[15] = rotl64_1(s[4] ^ bc[3], 27);
+		s[4] = rotl64_1(s[24] ^ bc[3], 14);
+		s[24] = rotl64_1(s[21] ^ bc[0], 2);
+		s[21] = rotl64_2(s[8] ^ bc[2], 23);
+		s[8] = rotl64_2(s[16] ^ bc[0], 13);
+		s[16] = rotl64_2(s[5] ^ bc[4], 4);
+		s[5] = rotl64_1(s[3] ^ bc[2], 28);
+		s[3] = rotl64_1(s[18] ^ bc[2], 21);
+		s[18] = rotl64_1(s[17] ^ bc[1], 15);
+		s[17] = rotl64_1(s[11] ^ bc[0], 10);
+		s[11] = rotl64_1(s[7] ^ bc[1], 6);
+		s[7] = rotl64_1(s[10] ^ bc[4], 3);
+		s[10] = rotl64_1(tmp1, 1);
+
+		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		s[0] ^= keccakf_rndc[i];
+	}
+}
+
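+// Variant of cn_keccakf2 with the first round unrolled under the assumption
+// that state words 10..24 are still at their post-padding values: zero
+// everywhere except s[16], which holds the 0x80...0 padding word. This is
+// only valid right after cn_keccak() below has absorbed a message that fits
+// in state words 0..9. The remaining 23 rounds use the generic code.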
+__device__ __forceinline__ void cn_keccakf(uint64_t *s)
+{
+	uint64_t bc[5], tmpxor[5], tmp1, tmp2;
+
+	tmpxor[0] = s[0] ^ s[5];
+	tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL;
+	tmpxor[2] = s[2] ^ s[7];
+	tmpxor[3] = s[3] ^ s[8];
+	tmpxor[4] = s[4] ^ s[9];
+
+	bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
+	bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
+	bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
+	bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
+	bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
+
+	tmp1 = s[1] ^ bc[0];
+
+	s[0] ^= bc[4];
+	s[1] = rotl64_2(s[6] ^ bc[0], 12);
+	s[6] = rotl64_1(s[9] ^ bc[3], 20);
+	s[9] = rotl64_2(bc[1], 29);
+	s[22] = rotl64_2(bc[3], 7);
+	s[14] = rotl64_1(bc[4], 18);
+	s[20] = rotl64_2(s[2] ^ bc[1], 30);
+	s[2] = rotl64_2(bc[1], 11);
+	s[12] = rotl64_1(bc[2], 25);
+	s[13] = rotl64_1(bc[3], 8);
+	s[19] = rotl64_2(bc[2], 24);
+	s[23] = rotl64_2(bc[4], 9);
+	s[15] = rotl64_1(s[4] ^ bc[3], 27);
+	s[4] = rotl64_1(bc[3], 14);
+	s[24] = rotl64_1(bc[0], 2);
+	s[21] = rotl64_2(s[8] ^ bc[2], 23);
+	s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13);
+	s[16] = rotl64_2(s[5] ^ bc[4], 4);
+	s[5] = rotl64_1(s[3] ^ bc[2], 28);
+	s[3] = rotl64_1(bc[2], 21);
+	s[18] = rotl64_1(bc[1], 15);
+	s[17] = rotl64_1(bc[0], 10);
+	s[11] = rotl64_1(s[7] ^ bc[1], 6);
+	s[7] = rotl64_1(bc[4], 3);
+	s[10] = rotl64_1(tmp1, 1);
+
+	tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+	tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+	tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+	tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+	tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+	s[0] ^= 0x0000000000000001;
+
+	for(int i = 1; i < 24; ++i)
+	{
+		tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
+		bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
+		bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
+		bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
+		bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
+
+		tmp1 = s[1] ^ bc[0];
+
+		s[0] ^= bc[4];
+		s[1] = rotl64_2(s[6] ^ bc[0], 12);
+		s[6] = rotl64_1(s[9] ^ bc[3], 20);
+		s[9] = rotl64_2(s[22] ^ bc[1], 29);
+		s[22] = rotl64_2(s[14] ^ bc[3], 7);
+		s[14] = rotl64_1(s[20] ^ bc[4], 18);
+		s[20] = rotl64_2(s[2] ^ bc[1], 30);
+		s[2] = rotl64_2(s[12] ^ bc[1], 11);
+		s[12] = rotl64_1(s[13] ^ bc[2], 25);
+		s[13] = rotl64_1(s[19] ^ bc[3], 8);
+		s[19] = rotl64_2(s[23] ^ bc[2], 24);
+		s[23] = rotl64_2(s[15] ^ bc[4], 9);
+		s[15] = rotl64_1(s[4] ^ bc[3], 27);
+		s[4] = rotl64_1(s[24] ^ bc[3], 14);
+		s[24] = rotl64_1(s[21] ^ bc[0], 2);
+		s[21] = rotl64_2(s[8] ^ bc[2], 23);
+		s[8] = rotl64_2(s[16] ^ bc[0], 13);
+		s[16] = rotl64_2(s[5] ^ bc[4], 4);
+		s[5] = rotl64_1(s[3] ^ bc[2], 28);
+		s[3] = rotl64_1(s[18] ^ bc[2], 21);
+		s[18] = rotl64_1(s[17] ^ bc[1], 15);
+		s[17] = rotl64_1(s[11] ^ bc[0], 10);
+		s[11] = rotl64_1(s[7] ^ bc[1], 6);
+		s[7] = rotl64_1(s[10] ^ bc[4], 3);
+		s[10] = rotl64_1(tmp1, 1);
+
+		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		s[0] ^= keccakf_rndc[i];
+	}
+}
+
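+// Absorb a single short message block and run the permutation once: the input
+// is padded Keccak-style with a 0x01 byte directly after the message and the
+// 0x80 top bit in word 16 (a rate of 136 bytes), and the full 200-byte state
+// is copied back to the caller afterwards.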
+__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md)
+{
+	uint64_t st[25];
+
+	MEMSET8(st + 8, 0x00, 25 - 8);
+	memcpy(st, in, len);
+	((uint8_t*)st)[len] = 0x01;
+	st[16] = 0x8000000000000000ULL;
+
+	cn_keccakf(st);
+
+	MEMCPY8(md, st, 25);
+	return;
+}
diff --git a/backend/nvidia/nvcc_code/cuda_skein.hpp b/backend/nvidia/nvcc_code/cuda_skein.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..041a593b756a6b5d455bd08685b8cc6019573bef
--- /dev/null
+++ b/backend/nvidia/nvcc_code/cuda_skein.hpp
@@ -0,0 +1,347 @@
+#pragma once
+
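+// Skein-512 with a 256-bit output, built on the Threefish-512 block cipher in
+// UBI chaining mode. Only the 512-bit code path is implemented here; the
+// 256/1024 context types exist only to size the union in skeinHashState.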
+typedef unsigned int    uint_t;             /* native unsigned integer */
+
+#define SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+
+#define SKEIN_256_STATE_WORDS ( 4)
+#define SKEIN_512_STATE_WORDS ( 8)
+#define SKEIN1024_STATE_WORDS (16)
+
+#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+#define SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+
+#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((uint64_t) (hi32)) << 32))
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
+#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+
+#define SKEIN_T1_FLAG_FIRST     (((uint64_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_BIT_PAD   (((uint64_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+#define SKEIN_T1_FLAG_FINAL     (((uint64_t)  1 ) << SKEIN_T1_POS_FINAL)
+
+#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \
+  Skein_Set_T0(ctxPtr,(T0)); \
+  Skein_Set_T1(ctxPtr,(T1)); }
+
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
+{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+
+#define KW_TWK_BASE     (0)
+#define KW_KEY_BASE     (3)
+#define ks              (kw + KW_KEY_BASE)
+#define ts              (kw + KW_TWK_BASE)
+
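+// Threefish-512 building blocks: R512 applies one layer of four MIX
+// operations (add, rotate, xor) to the permuted word order given by p0..p7,
+// and I512 injects subkey R+1 from the rotating key schedule ks[]/ts[].
+// R512_8_rounds therefore covers eight cipher rounds and two key injections.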
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \
+	X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \
+	X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \
+	X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \
+	X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6;
+
+#define I512(R) \
+	X0   += ks[((R)+1) % 9]; \
+	X1   += ks[((R)+2) % 9]; \
+	X2   += ks[((R)+3) % 9]; \
+	X3   += ks[((R)+4) % 9]; \
+	X4   += ks[((R)+5) % 9]; \
+	X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
+	X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
+	X7   += ks[((R)+8) % 9] + (R)+1;
+
+
+#define R512_8_rounds(R) \
+	R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
+	R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
+	R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
+	R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
+	I512(2*(R)); \
+	R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
+	R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
+	R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
+	R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
+	I512(2*(R)+1);
+
+typedef struct
+{
+	size_t  hashBitLen;
+	size_t  bCnt;
+	uint64_t  T[SKEIN_MODIFIER_WORDS];
+} Skein_Ctxt_Hdr_t;
+
+typedef struct {
+	Skein_Ctxt_Hdr_t h;
+	uint64_t  X[SKEIN_256_STATE_WORDS];
+	uint8_t  b[SKEIN_256_BLOCK_BYTES];
+} Skein_256_Ctxt_t;
+
+typedef struct {
+	Skein_Ctxt_Hdr_t h;
+	uint64_t  X[SKEIN_512_STATE_WORDS];
+	uint8_t  b[SKEIN_512_BLOCK_BYTES];
+} Skein_512_Ctxt_t;
+
+typedef struct {
+	Skein_Ctxt_Hdr_t h;
+	uint64_t  X[SKEIN1024_STATE_WORDS];
+	uint8_t  b[SKEIN1024_BLOCK_BYTES];
+} Skein1024_Ctxt_t;
+
+typedef struct {
+	uint_t  statebits;
+	union {
+		Skein_Ctxt_Hdr_t h;
+		Skein_256_Ctxt_t ctx_256;
+		Skein_512_Ctxt_t ctx_512;
+		Skein1024_Ctxt_t ctx1024;
+	} u;
+} skeinHashState;
+
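+// Initialise the 512-bit context with the precomputed Skein-512-256 IV and
+// start a MSG-type UBI block (tweak T0 = 0, FIRST flag set). hashBitLen is
+// recorded so the finaliser knows how many output bytes to produce.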
+__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen)
+{
+	const uint64_t SKEIN_512_IV_256[] =
+	{
+		SKEIN_MK_64(0xCCD044A1,0x2FDB3E13),
+		SKEIN_MK_64(0xE8359030,0x1A79A9EB),
+		SKEIN_MK_64(0x55AEA061,0x4F816E6F),
+		SKEIN_MK_64(0x2A2767A4,0xAE9B94DB),
+		SKEIN_MK_64(0xEC06025E,0x74DD7683),
+		SKEIN_MK_64(0xE7A436CD,0xC4746251),
+		SKEIN_MK_64(0xC36FBAF9,0x393AD185),
+		SKEIN_MK_64(0x3EEDBA18,0x33EDFC13)
+	};
+
+	Skein_512_Ctxt_t *ctx = &state->u.ctx_512;
+
+	ctx->h.hashBitLen = hashBitLen;
+
+	memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X));
+
+	Skein_Start_New_Type(ctx, MSG);
+}
+
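+// Core UBI compression: for each 64-byte block, build the Threefish-512 key
+// schedule from the current chaining value and tweak, run 72 rounds
+// (9 x R512_8_rounds), then feed the plaintext forward with an XOR. The
+// FIRST flag is cleared after the first block.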
+__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
+{
+	enum {
+		R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+		R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+		R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+		R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+		R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+		R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+		R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+		R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22
+	};
+
+	uint64_t X0,X1,X2,X3,X4,X5,X6,X7;
+	uint64_t w[SKEIN_512_STATE_WORDS];
+	uint64_t kw[SKEIN_512_STATE_WORDS+4];
+
+	ts[0] = ctx->h.T[0];
+	ts[1] = ctx->h.T[1];
+
+	do
+	{
+
+		ts[0] += byteCntAdd;
+
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+		ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];
+
+		memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3);
+
+		X0 = w[0] + ks[0];
+		X1 = w[1] + ks[1];
+		X2 = w[2] + ks[2];
+		X3 = w[3] + ks[3];
+		X4 = w[4] + ks[4];
+		X5 = w[5] + ks[5] + ts[0];
+		X6 = w[6] + ks[6] + ts[1];
+		X7 = w[7] + ks[7];
+
+		blkPtr += SKEIN_512_BLOCK_BYTES;
+
+		R512_8_rounds( 0);
+		R512_8_rounds( 1);
+		R512_8_rounds( 2);
+		R512_8_rounds( 3);
+		R512_8_rounds( 4);
+		R512_8_rounds( 5);
+		R512_8_rounds( 6);
+		R512_8_rounds( 7);
+		R512_8_rounds( 8);
+
+		ctx->X[0] = X0 ^ w[0];
+		ctx->X[1] = X1 ^ w[1];
+		ctx->X[2] = X2 ^ w[2];
+		ctx->X[3] = X3 ^ w[3];
+		ctx->X[4] = X4 ^ w[4];
+		ctx->X[5] = X5 ^ w[5];
+		ctx->X[6] = X6 ^ w[6];
+		ctx->X[7] = X7 ^ w[7];
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+	} while(--blkCnt);
+
+	ctx->h.T[0] = ts[0];
+	ctx->h.T[1] = ts[1];
+}
+
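+// Finalisation: mark the buffered (zero-padded) block as FINAL and compress
+// it, then run the output transform (an OUT_FINAL UBI block whose payload is
+// a 64-bit counter) once per 64 bytes of requested output. For the 256-bit
+// digest produced here that loop executes exactly once.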
+__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
+{
+	size_t i,n,byteCnt;
+	uint64_t X[SKEIN_512_STATE_WORDS];
+	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
+	//size_t tmp;
+	//uint8_t *p8;
+	//uint64_t *p64;
+
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+
+	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) 
+	{
+		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+		//p8 = &ctx->b[ctx->h.bCnt];
+		//tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+		//for( i = 0; i < tmp; i++ ) *(p8+i) = 0;
+	}
+
+	cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt);
+
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	//uint8_t  b[SKEIN_512_BLOCK_BYTES] == 64
+	memset(ctx->b,0,sizeof(ctx->b));
+	//p64 = (uint64_t *)ctx->b;
+	//for( i = 0; i < 8; i++ ) *(p64+i) = 0;
+
+	memcpy(X,ctx->X,sizeof(X));
+
+	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) 
+	{
+		((uint64_t *)ctx->b)[0]= (uint64_t)i;
+		Skein_Start_New_Type(ctx,OUT_FINAL);
+		cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t));
+		n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
+		if (n >= SKEIN_512_BLOCK_BYTES)
+			n = SKEIN_512_BLOCK_BYTES;
+		memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);
+		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+	}
+}
+
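+// Streaming update: data is buffered in ctx->b and compressed in 64-byte
+// blocks, but the last (possibly full) block is always held back so that
+// cn_skein_final can flag it as FINAL before it is processed.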
+__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
+{
+	size_t n;
+
+	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) 
+	{
+
+		if (ctx->h.bCnt) 
+		{
+
+			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+
+			if (n) 
+			{
+				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
+				msgByteCnt  -= n;
+				msg         += n;
+				ctx->h.bCnt += n;
+			}
+
+			cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+
+		if (msgByteCnt > SKEIN_512_BLOCK_BYTES) 
+		{
+			n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
+			cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+			msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+			msg        += n * SKEIN_512_BLOCK_BYTES;
+		}
+	}
+
+	if (msgByteCnt) 
+	{
+		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+}
+
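+// Bit-length aware update: byte-aligned input is passed straight through; a
+// trailing partial byte is completed with Skein's single-1-bit padding and
+// the BIT_PAD tweak flag is set.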
+__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+{
+	if ((databitlen & 7) == 0) 
+	{
+		cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3);
+	}
+	else 
+	{
+
+		size_t bCnt = (databitlen >> 3) + 1;
+		uint8_t b,mask;
+
+		mask = (uint8_t) (1u << (7 - (databitlen & 7)));
+		b    = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask);
+
+		cn_skein512_update(&state->u.ctx_512,data,bCnt-1);
+		cn_skein512_update(&state->u.ctx_512,&b  ,  1   );
+
+		Skein_Set_Bit_Pad_Flag(state->u.h);
+	}
+}
+
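+// One-shot helper: Skein-512-256 over `len` bytes of `data`, writing the
+// 32-byte digest to `hashval`.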
+__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+{
+	int hashbitlen = 256;
+	DataLength databitlen = len << 3;
+	skeinHashState state;
+
+	state.statebits = 64*SKEIN_512_STATE_WORDS;
+
+	cn_skein_init(&state, hashbitlen);
+	cn_skein_update(&state, data, databitlen);
+	cn_skein_final(&state, hashval);
+}