diff --git a/CMakeLists.txt b/CMakeLists.txt
index b714ee0ceb3bc3e39de9e7851e1231c16ef53a2b..7d21fa928c8c71f5a5ead048e450f2553d006309 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,6 +152,9 @@ if(CUDA_ENABLE)
        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
    endif()

+   # required for cryptonight_gpu (fast floating-point operations are not allowed)
+   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --fmad=false --prec-div=true --ftz=false")
+
    # avoid that nvcc in CUDA 8 complains about sm_20 pending removal
    if(CUDA_VERSION VERSION_EQUAL 8.0)
        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets")
@@ -279,6 +282,14 @@ else()
    list(APPEND BACKEND_TYPES "cpu")
endif()

+################################################################################
+# Explicit march setting for Clang
+################################################################################
+
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+   set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
+endif()
+
################################################################################
# Find PThreads
################################################################################
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 4ad4d59e944af5aceb97e9109c126d675744b7f7..857abc138cb6d7013a996a7b7f512ee503a066e4 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -413,6 +413,10 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
	if(strided_index == 1)
		strided_index = 0;
	}
+	if(miner_algo == cryptonight_gpu)
+	{
+		strided_index = 0;
+	}

	// if intensity is a multiple of worksize then comp mode is not needed
	int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ?
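The strict-FP NVCC flags above, and the matching "-cl-fp32-correctly-rounded-divide-sqrt" OpenCL build option added further down, exist because cryptonight_gpu hashes must match the CPU implementation bit for bit; FMA contraction, approximate division, and flush-to-zero would all change results. A minimal host-side sketch, not part of this patch, showing how a contracted multiply-add alone flips low-order bits:

#include <cmath>
#include <cstdio>

int main()
{
    // With --fmad=true the compiler may turn a*b + c into fmaf(a, b, c),
    // which rounds once instead of twice.
    float a = 1.0f + 0x1.0p-23f;       // 1 + ulp
    float b = 1.0f - 0x1.0p-23f;       // 1 - ulp
    float c = -1.0f;
    float prod = a * b;                // rounds a*b to 1.0f
    float separate = prod + c;         // 0
    float fused = std::fmaf(a, b, c);  // exact: -2^-46
    std::printf("separate=%a fused=%a\n", separate, fused);
    return 0;
}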
1 : 0; @@ -433,6 +437,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ */ options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100); + if(miner_algo == cryptonight_gpu) + options += " -cl-fp32-correctly-rounded-divide-sqrt"; + /* create a hash for the compile time cache * used data: * - source code @@ -579,12 +586,23 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } } - std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + std::vector<std::string> KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" }; + if(miner_algo == cryptonight_gpu) + { + KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu"); + KernelNames.insert(KernelNames.begin(), "cn0_cn_gpu"); + } + else + { + KernelNames.insert(KernelNames.begin(), "cn1"); + KernelNames.insert(KernelNames.begin(), "cn0"); + } + // append algorithm number to kernel name for(int k = 0; k < 3; k++) KernelNames[k] += std::to_string(miner_algo); - for(int i = 0; i < 7; ++i) + for(int i = 0; i < KernelNames.size(); ++i) { ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); if(ret != CL_SUCCESS) @@ -919,6 +937,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) const char *wolfSkeinCL = #include "./opencl/wolf-skein.cl" ; + const char *cryptonight_gpu = + #include "./opencl/cryptonight_gpu.cl" + ; std::string source_code(cryptonightCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); @@ -928,6 +949,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu); // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); @@ -1066,76 +1088,102 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - // Branch 0 - if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 1 - if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 2 - if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); - return ERR_OCL_API; - } + // Output + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); + return ERR_OCL_API; + } - // Branch 3 - if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) - { - 
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); - return ERR_OCL_API; - } + // Target + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); + return ERR_OCL_API; + } - // Threads - if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); - return(ERR_OCL_API); + // Threads + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return(ERR_OCL_API); + } } - - for(int i = 0; i < 4; ++i) - { - // States - if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + else { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + // Branch 0 + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } - // Nonce buffer - if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + // Branch 1 + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } - // Output - if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + // Branch 2 + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } - // Target - if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + // Branch 3 + if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } - if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + // Threads + if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); } + + for(int i = 0; i < 4; ++i) + { + // States + if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when 
calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + return ERR_OCL_API; + } + + // Nonce buffer + if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + return ERR_OCL_API; + } + + // Output + if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + return ERR_OCL_API; + } + + // Target + if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + return ERR_OCL_API; + } + + if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } + } } return ERR_SUCCESS; @@ -1277,10 +1325,24 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); - return ERR_OCL_API; + size_t w_size_cn_gpu = w_size * 16; + size_t g_thd_cn_gpu = g_thd * 16; + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } + } + else + { + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } } if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) @@ -1289,13 +1351,16 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) return ERR_OCL_API; } - for(int i = 0; i < 4; ++i) + if(miner_algo != cryptonight_gpu) { - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + for(int i = 0; i < 4; ++i) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; + } } } diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5e9f618ede7822318aad75a24f4fdce695f8e49c..5b95e9865b17717913fa195c9b68c57d0f0adba6 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -53,7 +53,7 @@ struct 
GpuContext cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; std::map<xmrstak_algo, cl_program> Program; - std::map<xmrstak_algo, std::array<cl_kernel,8>> Kernels; + std::map<xmrstak_algo, std::array<cl_kernel,7>> Kernels; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index f647bcafc1e7f051cd574814b2a9e71158efcbae..53394037bb682b7412f6c39c4300d81c2a2d3f4e 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -78,21 +78,6 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif -//#include "opencl/fast_int_math_v2.cl" -XMRSTAK_INCLUDE_FAST_INT_MATH_V2 -//#include "fast_div_heavy.cl" -XMRSTAK_INCLUDE_FAST_DIV_HEAVY -//#include "opencl/wolf-aes.cl" -XMRSTAK_INCLUDE_WOLF_AES -//#include "opencl/wolf-skein.cl" -XMRSTAK_INCLUDE_WOLF_SKEIN -//#include "opencl/jh.cl" -XMRSTAK_INCLUDE_JH -//#include "opencl/blake256.cl" -XMRSTAK_INCLUDE_BLAKE256 -//#include "opencl/groestl256.cl" -XMRSTAK_INCLUDE_GROESTL256 - static const __constant ulong keccakf_rndc[24] = { 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, @@ -186,31 +171,49 @@ static const __constant uint keccakf_piln[24] = 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 }; -void keccakf1600_1(ulong *st) +inline void keccakf1600_1(ulong st[25]) { int i, round; ulong t, bc[5]; #pragma unroll 1 - for(round = 0; round < 24; ++round) + for (round = 0; round < 24; ++round) { + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); - // Theta - bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; - bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; - bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; - bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; - bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; - - #pragma unroll 1 - for (i = 0; i < 5; ++i) { - t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL); - st[i ] ^= t; - st[i + 5] ^= t; - st[i + 10] ^= t; - st[i + 15] ^= t; - st[i + 20] ^= t; - } + st[0] ^= bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; // Rho Pi t = st[1]; @@ -221,17 +224,16 @@ void keccakf1600_1(ulong *st) t = bc[0]; } - #pragma unroll 1 + #pragma unroll for(int i = 0; i < 25; i += 5) { - ulong tmp[5]; - - #pragma unroll 1 - for(int x = 0; x < 5; ++x) - tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]); + ulong tmp1 = st[i], tmp2 = st[i + 1]; - #pragma unroll 1 - for(int x = 0; x < 5; ++x) st[i + x] = tmp[x]; + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ 
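The unrolled chi step here relies on the OpenCL identity bitselect(a ^ c, a, b) == a ^ (~b & c), which is exactly Keccak's chi (a ^= ~b & c); the temporaries tmp1/tmp2 only preserve st[i] and st[i + 1] before they are overwritten. A scalar sketch, outside the patch, checking the identity:

#include <cassert>
#include <cstdint>

// OpenCL bitselect(a, b, c): take each bit from b where c has a 1, from a where c has a 0.
static uint64_t bitselect64(uint64_t a, uint64_t b, uint64_t c)
{
    return (a & ~c) | (b & c);
}

int main()
{
    uint64_t a = 0x0123456789abcdefULL;
    uint64_t b = 0xfedcba9876543210ULL;
    uint64_t c = 0x0f0f0f0f0f0f0f0fULL;
    // bitselect(a ^ c, a, b) keeps a where b is set and a ^ c elsewhere,
    // i.e. a ^ (~b & c) -- the chi step of Keccak-f.
    assert(bitselect64(a ^ c, a, b) == (a ^ (~b & c)));
    return 0;
}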
st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); } // Iota @@ -311,6 +313,46 @@ void keccakf1600_2(__local ulong *st) } } +#define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT) + +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#elif(STRIDED_INDEX==1) +# define IDX(x) (mul24(((uint)(x)), Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) +#endif + +#define JOIN_DO(x,y) x##y +#define JOIN(x,y) JOIN_DO(x,y) + +inline uint getIdx() +{ + return get_global_id(0) - get_global_offset(0); +} + +//#include "opencl/fast_int_math_v2.cl" +XMRSTAK_INCLUDE_FAST_INT_MATH_V2 +//#include "fast_div_heavy.cl" +XMRSTAK_INCLUDE_FAST_DIV_HEAVY +//#include "opencl/wolf-aes.cl" +XMRSTAK_INCLUDE_WOLF_AES +//#include "opencl/wolf-skein.cl" +XMRSTAK_INCLUDE_WOLF_SKEIN +//#include "opencl/jh.cl" +XMRSTAK_INCLUDE_JH +//#include "opencl/blake256.cl" +XMRSTAK_INCLUDE_BLAKE256 +//#include "opencl/groestl256.cl" +XMRSTAK_INCLUDE_GROESTL256 + +#if (ALGO == 13) + //#include "opencl/cryptonight_gpu.cl" + XMRSTAK_INCLUDE_CN_GPU +#endif + )===" R"===( @@ -360,28 +402,8 @@ void AESExpandKey256(uint *keybuf) )===" R"===( -#define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT) - -#if(STRIDED_INDEX==0) -# define IDX(x) (x) -#elif(STRIDED_INDEX==1) -# define IDX(x) (mul24(((uint)(x)), Threads)) -#elif(STRIDED_INDEX==2) -# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) -#elif(STRIDED_INDEX==3) -# define IDX(x) ((x) * WORKSIZE) -#endif - -inline uint getIdx() -{ - return get_global_id(0) - get_global_offset(0); -} - #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] - #define JOIN_DO(x,y) x##y -#define JOIN(x,y) JOIN_DO(x,y) - __attribute__((reqd_work_group_size(8, 8, 1))) __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads) { @@ -538,7 +560,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, R"===( // cryptonight_monero_v8 && NVIDIA -#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION)) +#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) # define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) #else @@ -556,7 +578,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong a[2]; // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -571,7 +593,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states __local uint AES0[256], AES1[256]; // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; # endif @@ -587,7 +609,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); // cryptonight_monero_v8 -#if((ALGO==11 || ALGO==13) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) +#if((ALGO==11 || ALGO==14) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } @@ -622,7 +644,7 @@ __kernel 
void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)b)[0]; // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) a[1] = states[1] ^ states[5]; b[2] = states[8] ^ states[10]; b[3] = states[9] ^ states[11]; @@ -654,7 +676,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states { ulong c[2]; // cryptonight_monero_v8 && NVIDIA -#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION)) +#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif @@ -668,7 +690,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) { ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); @@ -693,7 +715,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; // cryptonight_monero_v8 -#elif(ALGO==11 || ALGO==13) +#elif(ALGO==11 || ALGO==14) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -712,7 +734,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) // Use division and square root results from the _previous_ iteration to hide the latency tmp.s0 ^= division_result.s0; tmp.s1 ^= division_result.s1 ^ sqrt_result; @@ -770,7 +792,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)a)[0] ^= tmp; // cryptonight_monero_v8 -#if (ALGO == 11 || ALGO==13) +#if (ALGO == 11 || ALGO==14) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; @@ -805,7 +827,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states R"===( __attribute__((reqd_work_group_size(8, 8, 1))) -__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, +// cryptonight_gpu +#if (ALGO == 13) + __global uint *output, ulong Target, uint Threads) +#else + __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +#endif { __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint ExpandedKey2[40]; @@ -823,8 +851,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif @@ -862,8 +890,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || 
cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
	__local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
	__local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
	__local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
@@ -876,7 +904,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
	if (gIdx < Threads)
#endif
	{
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
		#pragma unroll 2
		for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
		{
@@ -916,8 +945,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#endif
	}

-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
	/* Also left-over threads perform this loop.
	 * The left-over thread results will be ignored
	 */
@@ -959,7 +988,15 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
	for(int i = 0; i < 25; ++i) State[i] = states[i];

	keccakf1600_2(State);
-
+#if (ALGO == 13)
+	if(State[3] <= Target)
+	{
+		//printf("gt %lu\n", State[3]);
+		ulong outIdx = atomic_inc(output + 0xFF);
+		if(outIdx < 0xFF)
+			output[outIdx] = get_global_id(0);
+	}
+#else
	for(int i = 0; i < 25; ++i) states[i] = State[i];

	uint StateSwitch = State[0] & 3;
	__global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
	__global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
	__global uint *destinationBranch = StateSwitch < 2 ?
destinationBranch1 : destinationBranch2;
	destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
+#endif
	}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
new file mode 100644
index 0000000000000000000000000000000000000000..a99243e4402724057780df88ae64a98cf097933f
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -0,0 +1,383 @@
+R"===(
+
+
+inline float4 _mm_add_ps(float4 a, float4 b)
+{
+	return a + b;
+}
+
+inline float4 _mm_sub_ps(float4 a, float4 b)
+{
+	return a - b;
+}
+
+inline float4 _mm_mul_ps(float4 a, float4 b)
+{
+
+	//#pragma OPENCL SELECT_ROUNDING_MODE rte
+	return a * b;
+}
+
+inline float4 _mm_div_ps(float4 a, float4 b)
+{
+	return a / b;
+}
+
+inline float4 _mm_and_ps(float4 a, int b)
+{
+	return as_float4(as_int4(a) & (int4)(b));
+}
+
+inline float4 _mm_or_ps(float4 a, int b)
+{
+	return as_float4(as_int4(a) | (int4)(b));
+}
+
+inline float4 _mm_fmod_ps(float4 v, float dc)
+{
+	float4 d = (float4)(dc);
+	float4 c = _mm_div_ps(v, d);
+	c = trunc(c);
+	c = _mm_mul_ps(c, d);
+	return _mm_sub_ps(v, c);
+}
+
+inline int4 _mm_xor_si128(int4 a, int4 b)
+{
+	return a ^ b;
+}
+
+inline float4 _mm_xor_ps(float4 a, int b)
+{
+	return as_float4(as_int4(a) ^ (int4)(b));
+}
+
+inline int4 _mm_alignr_epi8(int4 a, const uint rot)
+{
+	const uint right = 8 * rot;
+	const uint left = (32 - 8 * rot);
+	return (int4)(
+		((uint)a.x >> right) | ( a.y << left ),
+		((uint)a.y >> right) | ( a.z << left ),
+		((uint)a.z >> right) | ( a.w << left ),
+		((uint)a.w >> right) | ( a.x << left )
+	);
+}
+
+
+inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); }
+
+inline float4 fma_break(float4 x)
+{
+	// Break the dependency chain by setting the exp to ?????01
+	x = _mm_and_ps(x, 0xFEFFFFFF);
+	return _mm_or_ps(x, 0x00800000);
+}
+
+inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c)
+{
+	n1 = _mm_add_ps(n1, *c);
+	float4 nn = _mm_mul_ps(n0, *c);
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = fma_break(nn);
+	*n = _mm_add_ps(*n, nn);
+
+	n3 = _mm_sub_ps(n3, *c);
+	float4 dd = _mm_mul_ps(n2, *c);
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = fma_break(dd);
+	*d = _mm_add_ps(*d, dd);
+
+	//Constant feedback
+	*c = _mm_add_ps(*c, rnd_c);
+	*c = _mm_add_ps(*c, (float4)(0.734375f));
+	float4 r = _mm_add_ps(nn, dd);
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	*c = _mm_add_ps(*c, r);
+
+}
+
+// 9*8 + 2 = 74
+inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r)
+{
+	float4 n = (float4)(0.0f);
+	float4 d = (float4)(0.0f);
+
+	sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm_and_ps(d, 0xFF7FFFFF);
+	d = _mm_or_ps(d, 0x40000000);
+	*r = _mm_add_ps(*r, _mm_div_ps(n,d));
+}
+
+inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum)
+{
+	float4 c =
(float4)(cnt); + // 35 maths calls follow (140 FLOPS) + float4 r = (float4)(0.0f); + + for(int i = 0; i < 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, &c, &r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *sum = r; // 34 + float4 x = (float4)(536870880.0f); + r = _mm_mul_ps(r, x); // 35 + return convert_int4_rte(r); +} + +inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out) +{ + float4 n0 = convert_float4_rte(v0); + float4 n1 = convert_float4_rte(v1); + float4 n2 = convert_float4_rte(v2); + float4 n3 = convert_float4_rte(v3); + + int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + *out = rot == 0 ? r : _mm_alignr_epi8(r, rot); +} + +)===" +R"===( + +static const __constant uint look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +static const __constant float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + +__attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1))) +__kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads) +{ + const uint gIdx = getIdx(); + +#if(COMP_MODE==1) + if(gIdx < Threads) + return; +#endif + + uint chunk = get_local_id(0) / 16; + +#if(STRIDED_INDEX==0) + __global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16)); +#endif + + __local int4 smem2[1 * 4 * WORKSIZE]; + __local int4 smemOut2[1 * 16 * WORKSIZE]; + __local float4 smemVa2[1 * 16 * WORKSIZE]; + + __local int4* smem = smem2 + 4 * chunk; + __local int4* smemOut = smemOut2 + 16 * chunk; + __local float4* smemVa = smemVa2 + 16 * chunk; + + uint tid = get_local_id(0) % 16; + + uint idxHash = gIdx/16; + uint s = ((__global uint*)spad)[idxHash * 50] >> 8; + float4 vs = (float4)(0); + + for(size_t i = 0; i < ITERATIONS; i++) + { + mem_fence(CLK_LOCAL_MEM_FENCE); + ((__local int*)smem)[tid] = ((__global int*)scratchpad_ptr(s, tid/4, lpad))[tid%4]; + mem_fence(CLK_LOCAL_MEM_FENCE); + + float4 rc = vs; + + { + single_comupte_wrap( + tid%4, + *(smem + look[tid][0]), + *(smem + look[tid][1]), + *(smem + look[tid][2]), + *(smem + look[tid][3]), + ccnt[tid], rc, smemVa + tid, + smemOut + tid + ); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + + int4 tmp2; + if(tid % 4 == 0) + { + int4 out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]); + int4 out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]); + out = _mm_xor_si128(out, out2); + tmp2 = out; + *scratchpad_ptr(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + if(tid % 4 == 0) + smemOut[tid] = tmp2; + mem_fence(CLK_LOCAL_MEM_FENCE); + int4 out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12]; + + if(tid%2 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 1]; + if(tid%4 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 2]; + if(tid%8 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 4]; + if(tid%16 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 8]; + vs = smemVa[0]; + + vs = fabs(vs); // take abs(va) by masking the float sign bit + float4 xx = _mm_mul_ps(vs, (float4)(16777216.0f)); + 
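Throughout single_comupte and this kernel the code avoids a real fmod: AND-ing with 0x807FFFFF clears the IEEE-754 exponent field (keeping sign and mantissa) and OR-ing 0x40000000 writes exponent 128, which forces the magnitude into [2, 4); the multiply by 16777216.0f (2^24) above then rescales before the round-to-int conversion. A scalar sketch of the clamp, outside the patch and with an illustrative helper name:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Mirror of _mm_and_ps(r, 0x807FFFFF) / _mm_or_ps(r, 0x40000000):
// overwrite the exponent so |x| always lands in [2, 4).
static float clamp_exp_2_4(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    bits = (bits & 0x807FFFFFu) | 0x40000000u;
    std::memcpy(&x, &bits, sizeof bits);
    return x;
}

int main()
{
    const float in[] = {0.001f, 1.5f, -3.75f, 123456.0f};
    for (float v : in)
        std::printf("%g -> %g\n", v, clamp_exp_2_4(v)); // magnitudes in [2, 4)
    return 0;
}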
// vs range 0 - 64 + int4 tmp = convert_int4_rte(xx); + tmp = _mm_xor_si128(tmp, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, (float4)(64.0f)); + s = tmp.x ^ tmp.y ^ tmp.z ^ tmp.w; + } +} + +)===" +R"===( + +inline void generate_512(ulong idx, __local ulong* in, __global ulong* out) +{ + ulong hash[25]; + + hash[0] = in[0] ^ idx; + for(int i = 1; i < 25; ++i) + hash[i] = in[i]; + + keccakf1600_1(hash); + for(int i = 0; i < 20; ++i) + out[i] = hash[i]; + out+=160/8; + + keccakf1600_1(hash); + for(int i = 0; i < 22; ++i) + out[i] = hash[i]; + out+=176/8; + + keccakf1600_1(hash); + for(int i = 0; i < 22; ++i) + out[i] = hash[i]; +} + +__attribute__((reqd_work_group_size(8, 8, 1))) +__kernel void JOIN(cn0_cn_gpu,ALGO)(__global ulong *input, __global int *Scratchpad, __global ulong *states, uint Threads) +{ + const uint gIdx = getIdx(); + __local ulong State_buf[8 * 25]; + __local ulong* State = State_buf + get_local_id(0) * 25; + +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx); +#endif + + if (get_local_id(1) == 0) + { + +// NVIDIA +#ifdef __NV_CL_C_VERSION + for(uint i = 0; i < 8; ++i) + State[i] = input[i]; +#else + ((__local ulong8 *)State)[0] = vload8(0, input); +#endif + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((__local uint *)State)[9] &= 0x00FFFFFFU; + ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; + ((__local uint *)State)[10] &= 0xFF000000U; + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... 
to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+	 * (even if it is correctly cast to unsigned on the host)
+	 */
+	((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
+
+	for (int i = 11; i < 25; ++i) {
+		State[i] = 0x00UL;
+	}
+
+	// Last bit of padding
+	State[16] = 0x8000000000000000UL;
+
+	keccakf1600_2(State);
+
+	#pragma unroll
+	for (int i = 0; i < 25; ++i) {
+		states[i] = State[i];
+	}
+	}
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+#if(COMP_MODE==1)
+	// do not use early return here
+	if(gIdx < Threads)
+#endif
+	{
+	for(ulong i = get_local_id(1); i < MEMORY / 512; i += get_local_size(1))
+	{
+		generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512));
+	}
+	}
+}
+
+)==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 4205a67c36922baa4f3df914d12af8939e39d74c..93e304aee3d316661b6f7dd1b211a7f498efb696 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -4,7 +4,7 @@ R"===(
 */

// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)

static const __constant uint RCP_C[256] =
{
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index f2dce7f90dbdec86e4030223830081082d60292f..7ca072c95abf93513264d478dc2f6807e252a9fe 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -137,6 +137,9 @@ private:
	// true for all cryptonight_heavy derivatives since we check the user and dev pool
	bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();

+	// true for all cryptonight_gpu derivatives since we check the user and dev pool
+	bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
+
	// set strided index to default
	ctx.stridedIndex = 1;
@@ -158,13 +161,21 @@
	if (hashMemSize <= CRYPTONIGHT_TURTLE_MEMORY)
		maxThreads *= 4u;

+	if(useCryptonight_gpu)
+	{
+		// 6 waves per compute unit is a good value (based on profiling)
+		// @todo check again after all optimizations
+		maxThreads = ctx.computeUnits * 6 * 8;
+		ctx.stridedIndex = 0;
+	}
+
	// keep 128MiB memory free (value is randomly chosen) from the max available memory
	const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;

	size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);

	uint32_t numThreads = 1u;
-	if(ctx.isAMD)
+	if(ctx.isAMD && !useCryptonight_gpu)
	{
		numThreads = 2;
		size_t memDoubleThread = maxAvailableFreeMem / numThreads;
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a7697b028b6ad9ff72c1c02bdade2b03142aae6
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <stdint.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <malloc.h>
+#include <intrin.h>
+#define HAS_WIN_INTRIN_API
+#endif
+
+#ifdef __GNUC__
+#include <x86intrin.h>
+#if !defined(HAS_WIN_INTRIN_API)
+#include <cpuid.h>
+#endif // !defined(HAS_WIN_INTRIN_API)
+#endif // __GNUC__
+
+inline void cngpu_cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+{
+	val[0] = 0;
+	val[1] = 0;
+	val[2] = 0;
+	val[3] = 0;
+
+#if defined(HAS_WIN_INTRIN_API)
+	__cpuidex(val, eax, ecx);
+#else
+	__cpuid_count(eax, ecx, val[0], val[1], val[2],
val[3]);
+#endif
+}
+
+inline bool cngpu_check_avx2()
+{
+	int32_t cpu_info[4];
+	cngpu_cpuid(7, 0, cpu_info);
+	return (cpu_info[1] & (1 << 5)) != 0;
+}
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad);
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e46705fd0920d68834c8f97423cae343c3df9e3e
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
@@ -0,0 +1,176 @@
+#include "cn_gpu.hpp"
+#include "../../cryptonight.hpp"
+
+#pragma GCC target ("avx2")
+
+inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
+{
+	v = _mm256_load_si256(idx);
+	n01 = _mm256_cvtepi32_ps(v);
+}
+
+inline __m256 fma_break(const __m256& x)
+{
+	// Break the dependency chain by setting the exp to ?????01
+	__m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x);
+	return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx);
+}
+
+// 14
+inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c)
+{
+	__m256 nn = _mm256_mul_ps(n0, c);
+	nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn));
+	nn = fma_break(nn);
+	n = _mm256_add_ps(n, nn);
+
+	__m256 dd = _mm256_mul_ps(n2, c);
+	dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd));
+	dd = fma_break(dd);
+	d = _mm256_add_ps(d, dd);
+
+	//Constant feedback
+	c = _mm256_add_ps(c, rnd_c);
+	c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f));
+	__m256 r = _mm256_add_ps(nn, dd);
+	r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);
+	r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);
+	c = _mm256_add_ps(c, r);
+}
+
+// 14*8 + 2 = 112
+inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r)
+{
+	__m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps();
+
+	sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d);
+	d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d);
+	r = _mm256_add_ps(r, _mm256_div_ps(n, d));
+}
+
+// 112×4 = 448
+template <bool add>
+inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
+	float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
+{
+	__m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
+	__m256 r = _mm256_setzero_ps();
+
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+	// do a quick fmod by setting exp to 2
+	r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);
+	r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);
+
+
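double_comupte's template parameter decides at compile time whether a call seeds the accumulator or adds to it; double_comupte_wrap instantiates it from the rotation parity (rot % 2 != 0), so the branch just below costs nothing at runtime. A reduced sketch of the pattern, with illustrative names:

#include <cstdio>

// The bool is a template parameter, so each instantiation keeps
// exactly one side of the branch.
template<bool add>
void accumulate(float& sum, float r)
{
    if (add)
        sum += r;
    else
        sum = r;
}

int main()
{
    float sum = 0.0f;
    accumulate<false>(sum, 1.5f); // first call: initialize
    accumulate<true>(sum, 2.5f);  // later calls: accumulate
    std::printf("%g\n", sum);     // prints 4
    return 0;
}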
if(add) + sum = _mm256_add_ps(sum, r); + else + sum = r; + + r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35 + return _mm256_cvttps_epi32(r); +} + +template <size_t rot> +inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) +{ + __m256i r = double_comupte<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum); + if(rot != 0) + r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot)); + + out = _mm256_xor_si256(out, r); +} + +template<uint32_t MASK> +inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m256i*>(lpad + (idx & MASK) + n*16); } + +template<size_t ITER, uint32_t MASK> +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad) +{ + uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8; + __m256i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0); + __m256i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2); + __m256 sum0 = _mm256_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m256i v01, v23; + __m256 suma, sumb, sum1; + __m256 rc = sum0; + + __m256 n01, n23; + __m256 d01, d23; + prep_dv_avx(idx0, v01, n01); + prep_dv_avx(idx2, v23, n23); + + __m256i out, out2; + __m256 n10, n22, n33; + n10 = _mm256_permute2f128_ps(n01, n01, 0x01); + n22 = _mm256_permute2f128_ps(n23, n23, 0x00); + n33 = _mm256_permute2f128_ps(n23, n23, 0x11); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out); + double_comupte_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out); + double_comupte_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out); + double_comupte_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out); + _mm256_store_si256(idx0, _mm256_xor_si256(v01, out)); + sum0 = _mm256_add_ps(suma, sumb); + out2 = out; + + __m256 n11, n02, n30; + n11 = _mm256_permute2f128_ps(n01, n01, 0x11); + n02 = _mm256_permute2f128_ps(n01, n23, 0x20); + n30 = _mm256_permute2f128_ps(n01, n23, 0x03); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out); + double_comupte_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out); + double_comupte_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out); + double_comupte_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out); + _mm256_store_si256(idx2, _mm256_xor_si256(v23, out)); + sum1 = _mm256_add_ps(suma, sumb); + + out2 = _mm256_xor_si256(out2, out); + out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2); + suma = _mm256_permute2f128_ps(sum0, sum1, 0x30); + sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21); + sum0 = _mm256_add_ps(suma, sumb); + sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41)); + + // Clear the high 128 bits + __m128 sum = _mm256_castps256_ps128(sum0); + + sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + __m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f))); + v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2)); + __m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum = _mm_div_ps(sum, _mm_set1_ps(64.0f)); + sum0 = 
_mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1);
+	uint32_t n = _mm_cvtsi128_si32(v0);
+	idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+	idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+	}
+}
+
+template void cn_gpu_inner_avx<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bde34162a13f8cd1914632e140573771dfff78f4
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
@@ -0,0 +1,182 @@
+#include "cn_gpu.hpp"
+#include "../../cryptonight.hpp"
+
+#pragma GCC target ("sse2")
+
+inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
+{
+	v = _mm_load_si128(idx);
+	n = _mm_cvtepi32_ps(v);
+}
+
+inline __m128 fma_break(__m128 x)
+{
+	// Break the dependency chain by setting the exp to ?????01
+	x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x);
+	return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x);
+}
+
+// 14
+inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c)
+{
+	n1 = _mm_add_ps(n1, c);
+	__m128 nn = _mm_mul_ps(n0, c);
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = fma_break(nn);
+	n = _mm_add_ps(n, nn);
+
+	n3 = _mm_sub_ps(n3, c);
+	__m128 dd = _mm_mul_ps(n2, c);
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = fma_break(dd);
+	d = _mm_add_ps(d, dd);
+
+	//Constant feedback
+	c = _mm_add_ps(c, rnd_c);
+	c = _mm_add_ps(c, _mm_set1_ps(0.734375f));
+	__m128 r = _mm_add_ps(nn, dd);
+	r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);
+	r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);
+	c = _mm_add_ps(c, r);
+}
+
+// 14*8 + 2 = 112
+inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r)
+{
+	__m128 n = _mm_setzero_ps(), d = _mm_setzero_ps();
+
+	sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d);
+	d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d);
+	r = _mm_add_ps(r, _mm_div_ps(n,d));
+}
+
+// 112×4 = 448
+template<bool add>
+inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+{
+	__m128 c = _mm_set1_ps(cnt);
+	__m128 r = _mm_setzero_ps();
+
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+	// do a quick fmod by setting exp to 2
+	r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);
+	r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);
+
+	if(add)
+		sum = _mm_add_ps(sum, r);
+	else
+		sum = r;
+
+	r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35
+	return _mm_cvttps_epi32(r);
+}
+
+template<size_t rot>
+inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+{
+	__m128i r = single_comupte<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
+
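The shift-and-OR just below rotates the 128-bit lane right by rot bytes using only SSE2; on actual SSSE3 hardware the same rotation is a single palignr, _mm_alignr_epi8(r, r, rot), which is presumably why the file is named ssse3 even though it targets "sse2". A sketch, outside the patch, checking the equivalence:

#include <tmmintrin.h> // SSSE3 palignr; compile with -mssse3
#include <cstdio>
#include <cstring>

int main()
{
    constexpr int rot = 4; // rotation in bytes, must be a compile-time constant
    __m128i r = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                             7, 6, 5, 4, 3, 2, 1, 0);

    // SSE2 form used in the patch: (r >> 8*rot) | (r << 8*(16 - rot))
    __m128i a = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
    // SSSE3 form: one instruction rotating right by `rot` bytes
    __m128i b = _mm_alignr_epi8(r, r, rot);

    std::puts(std::memcmp(&a, &b, sizeof a) == 0 ? "equal" : "different");
    return 0;
}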
if(rot != 0) + r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot)); + out = _mm_xor_si128(out, r); +} + +template<uint32_t MASK> +inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m128i*>(lpad + (idx & MASK) + n*16); } + +template<size_t ITER, uint32_t MASK> +void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad) +{ + uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8; + __m128i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0); + __m128i* idx1 = scratchpad_ptr<MASK>(lpad, s, 1); + __m128i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2); + __m128i* idx3 = scratchpad_ptr<MASK>(lpad, s, 3); + __m128 sum0 = _mm_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m128 n0, n1, n2, n3; + __m128i v0, v1, v2, v3; + __m128 suma, sumb, sum1, sum2, sum3; + + prep_dv(idx0, v0, n0); + prep_dv(idx1, v1, n1); + prep_dv(idx2, v2, n2); + prep_dv(idx3, v3, n3); + __m128 rc = sum0; + + __m128i out, out2; + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out); + single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out); + single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out); + single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out); + sum0 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx0, _mm_xor_si128(v0, out)); + out2 = out; + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out); + single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out); + single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out); + single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out); + sum1 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx1, _mm_xor_si128(v1, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out); + single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out); + single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out); + single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out); + sum2 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx2, _mm_xor_si128(v2, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out); + single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out); + single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out); + single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out); + sum3 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx3, _mm_xor_si128(v3, out)); + out2 = _mm_xor_si128(out2, out); + sum0 = _mm_add_ps(sum0, sum1); + sum2 = _mm_add_ps(sum2, sum3); + sum0 = _mm_add_ps(sum0, sum2); + + sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f)); + v0 = _mm_cvttps_epi32(n0); + v0 = _mm_xor_si128(v0, out2); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f)); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr<MASK>(lpad, n, 0); + idx1 = scratchpad_ptr<MASK>(lpad, n, 1); + idx2 = scratchpad_ptr<MASK>(lpad, n, 2); + idx3 = scratchpad_ptr<MASK>(lpad, n, 3); + } +} + +template void cn_gpu_inner_ssse3<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK>(const 
uint8_t* spad, uint8_t* lpad); diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 2218bf88a8700fd7e6fdc55e29a996027a2e7398..c75eff8ffc87154f25d1d9f7f49357cc477f8d66 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -17,6 +17,7 @@ #include "cryptonight.h" #include "xmrstak/backend/cryptonight.hpp" +#include "cn_gpu.hpp" #include <memory.h> #include <stdio.h> #include <cfenv> @@ -167,6 +168,8 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO> void cn_explode_scratchpad(const __m128i* input, __m128i* output) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -182,7 +185,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) xin6 = _mm_load_si128(input + 10); xin7 = _mm_load_si128(input + 11); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for(size_t i=0; i < 16; i++) { @@ -263,9 +266,45 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } +template<size_t MEM, bool PREFETCH, xmrstak_algo ALGO> +void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output) +{ + constexpr size_t hash_size = 200; // 25x8 bytes + alignas(128) uint64_t hash[25]; + + for (uint64_t i = 0; i < MEM / 512; i++) + { + memcpy(hash, input, hash_size); + hash[0] ^= i; + + keccakf(hash, 24); + memcpy(output, hash, 160); + output+=160; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + if(PREFETCH) + { + _mm_prefetch((const char*)output - 512, _MM_HINT_T2); + _mm_prefetch((const char*)output - 384, _MM_HINT_T2); + _mm_prefetch((const char*)output - 256, _MM_HINT_T2); + _mm_prefetch((const char*)output - 128, _MM_HINT_T2); + } + } +} + template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO> void cn_implode_scratchpad(const __m128i* input, __m128i* output) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -326,11 +365,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { @@ -377,7 +416,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO 
== cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } @@ -1000,3 +1039,28 @@ struct Cryptonight_hash_asm<2, 0> } } }; + +struct Cryptonight_hash_gpu +{ + static constexpr size_t N = 1; + + template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH> + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask<ALGO>(); + constexpr size_t ITERATIONS = cn_select_iter<ALGO>(); + constexpr size_t MEM = cn_select_memory<ALGO>(); + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad_gpu<MEM, PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state); + + if(cngpu_check_avx2()) + cn_gpu_inner_avx<ITERATIONS, MASK>(ctx[0]->hash_state, ctx[0]->long_state); + else + cn_gpu_inner_ssse3<ITERATIONS, MASK>(ctx[0]->hash_state, ctx[0]->long_state); + + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + memcpy(output, ctx[0]->hash_state, 32); + } +}; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 2327bed1d08294928f8bacbefc6f5852d0173ffe..e1af701e8d7ab2b59ab500ebedaa6a39576cdfc6 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -397,22 +397,32 @@ bool minethd::self_test() hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; } - else if(algo == cryptonight_superfast) + else if(algo == cryptonight_superfast) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast); hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx); bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } - else if (algo == cryptonight_turtle) + else if(algo == cryptonight_gpu) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle); - hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle); - hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; - } + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_gpu); + hashf("", 0, 
out, ctx); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_gpu); + hashf("", 0, out, ctx); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + } + else if (algo == cryptonight_turtle) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + } if(!bResult) printer::inst()->print_msg(L0, @@ -541,9 +551,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_superfast: algv = 11; break; - case cryptonight_turtle: + case cryptonight_gpu: algv = 12; break; + case cryptonight_turtle: + algv = 13; + break; default: algv = 2; break; @@ -609,6 +622,11 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash<N>::template hash<cryptonight_superfast, true, false>, Cryptonight_hash<N>::template hash<cryptonight_superfast, false, true>, Cryptonight_hash<N>::template hash<cryptonight_superfast, true, true>, + + Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, false>, + Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, false>, + Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, true>, + Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, true>, Cryptonight_hash<N>::template hash<cryptonight_turtle, false, false>, Cryptonight_hash<N>::template hash<cryptonight_turtle, true, false>, diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index 2dd922f912970f40c6b8cc90cfb438bc2c3d1976..ae862abae9d816bd4aec0913464f38fb9da2e60e 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -18,7 +18,8 @@ enum xmrstak_algo cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, cryptonight_superfast = 12, - cryptonight_turtle = 13 + cryptonight_gpu = 13, + cryptonight_turtle = 14 }; // define aeon settings @@ -34,6 +35,9 @@ constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024; constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0; constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000; +constexpr uint32_t CRYPTONIGHT_GPU_MASK = 0x1FFFC0; +constexpr uint32_t CRYPTONIGHT_GPU_ITER = 0xC000; + constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000; constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; @@ -81,6 +85,9 @@ inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTO template<> inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t 
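// The cn_select_* helpers below come in two flavours: constexpr template
// specializations for code where the algorithm is a template parameter, and
// runtime switches for values coming from the pool/config. A short usage
// sketch (assuming only the constants declared in this header):
static_assert(cn_select_memory<cryptonight_gpu>() == CRYPTONIGHT_MEMORY,
	"cryptonight_gpu reuses the classic 2 MiB scratchpad");

inline size_t scratchpad_bytes_sketch(xmrstak_algo algo)
{
	return cn_select_memory(algo); // runtime dispatch over the same table
}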
cn_select_memory<cryptonight_gpu>() { return CRYPTONIGHT_MEMORY; } + template<> inline constexpr size_t cn_select_memory<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MEMORY; } @@ -94,6 +101,7 @@ inline size_t cn_select_memory(xmrstak_algo algo) case cryptonight_masari: case cryptonight: case cryptonight_superfast: + case cryptonight_gpu: return CRYPTONIGHT_MEMORY; case cryptonight_ipbc: case cryptonight_aeon: @@ -149,6 +157,9 @@ inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTO template<> inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask<cryptonight_gpu>() { return CRYPTONIGHT_GPU_MASK; } + template<> inline constexpr uint32_t cn_select_mask<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MASK; } @@ -171,6 +182,8 @@ inline size_t cn_select_mask(xmrstak_algo algo) case cryptonight_haven: case cryptonight_heavy: return CRYPTONIGHT_HEAVY_MASK; + case cryptonight_gpu: + return CRYPTONIGHT_GPU_MASK; case cryptonight_turtle: return CRYPTONIGHT_TURTLE_MASK; default: @@ -217,6 +230,9 @@ inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTO template<> inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; } +template<> +inline constexpr uint32_t cn_select_iter<cryptonight_gpu>() { return CRYPTONIGHT_GPU_ITER; } + template<> inline constexpr uint32_t cn_select_iter<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_ITER; } @@ -241,6 +257,8 @@ inline size_t cn_select_iter(xmrstak_algo algo) return CRYPTONIGHT_MASARI_ITER; case cryptonight_superfast: return CRYPTONIGHT_SUPERFAST_ITER; + case cryptonight_gpu: + return CRYPTONIGHT_GPU_ITER; case cryptonight_turtle: return CRYPTONIGHT_TURTLE_ITER; default: diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index c3a97808c7f7a4f7633f91eed289cc118b423089..2acf1a387ed0f0333b5c75c22c2af1d0bd9ff66f 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -10,6 +10,7 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" #ifdef _WIN32 @@ -724,7 +725,8 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti cn_aes_pseudo_round_mut( sharedMemory, text, key ); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { #pragma unroll for ( int j = 0; j < 4; ++j ) @@ -843,6 +845,73 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) } } +template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE> +void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce) +{ + dim3 grid( ctx->device_blocks ); + dim3 block( ctx->device_threads ); + dim3 block2( ctx->device_threads << 1 ); + dim3 block4( ctx->device_threads << 2 ); + dim3 block8( ctx->device_threads << 3 ); + + size_t intensity = ctx->device_blocks * ctx->device_threads; + + CUDA_CHECK_KERNEL( + ctx->device_id, + xmrstak::nvidia::cn_explode_gpu<MEMORY><<<intensity,32>>>((int*)ctx->d_ctx_state, 
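// Sanity check on the new constants (illustrative): the cn_gpu kernels keep
// scratchpad offsets 64-byte aligned inside the 2 MiB scratchpad, so the mask
// is ((MEMORY - 1) >> 6) << 6 — exactly the 0x1FFFC0 defined above:
static_assert(((2u * 1024u * 1024u - 1u) >> 6 << 6) == 0x1FFFC0u,
	"CRYPTONIGHT_GPU_MASK is the 64-byte aligned scratchpad mask");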
(int*)ctx->d_long_state) + ); + + int partcount = 1 << ctx->device_bfactor; + for(int i = 0; i < partcount; i++) + { + CUDA_CHECK_KERNEL( + ctx->device_id, + // dynamic shared memory: 36 x 16 byte x numThreads (see the breakdown below) + xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<ITERATIONS, MEMORY> + <<<ctx->device_blocks, ctx->device_threads * 16, 36 * 16 * ctx->device_threads>>> + ( + (int*)ctx->d_ctx_state, + (int*)ctx->d_long_state, + ctx->device_bfactor, + i, + ctx->d_ctx_a, + ctx->d_ctx_b + ) + ); + } + + /* bfactor for phase 3 + * + * phase 3 consumes less time than phase 2, therefore we only begin to + * split the kernel if the user defined a `bfactor >= 5` + */ + int bfactorOneThree = ctx->device_bfactor - 4; + if( bfactorOneThree < 0 ) + bfactorOneThree = 0; + + int partcountOneThree = 1 << bfactorOneThree; + int roundsPhase3 = partcountOneThree; + + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + { + // these algorithms run two full rounds over the scratchpad memory + roundsPhase3 *= 2; + } + + for ( int i = 0; i < roundsPhase3; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY/4, ALGO><<< + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2 )); + } +} + void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); @@ -882,10 +951,13 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 0>, cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>, - + cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 0>, cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 1>, + cryptonight_core_gpu_hash_gpu<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK, CRYPTONIGHT_MEMORY, cryptonight_gpu, 0>, + cryptonight_core_gpu_hash_gpu<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK, CRYPTONIGHT_MEMORY, cryptonight_gpu, 1>, + cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 0>, cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 1> }; @@ -895,4 +967,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; selected_function(ctx, startNonce); + } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a0fe53418cfa619447d6b2c2550d17e896550f7b --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -0,0 +1,552 @@ +#pragma once + +#include <cuda_runtime.h> +#include <stdio.h> +#include <cstdint> + +#include "cuda_keccak.hpp" +#include "cuda_extra.hpp" + +namespace xmrstak +{ +namespace nvidia +{ + +struct __m128i : public int4 +{ + + __forceinline__ __device__ __m128i(){} + + __forceinline__ __device__ __m128i( 
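// Breakdown of the "36 x 16 byte x numThreads" dynamic shared memory reserved
// for cryptonight_core_gpu_phase2_gpu above (each hash occupies 16 lanes):
//    4 x __m128i scratchpad chunk      (smem)
// + 16 x __m128i per-lane outputs      (smemOut)
// + 16 x __m128  per-lane accumulators (smemVa)
// A compile-time check of that budget (sketch; sizeof(int4) == 16 on CUDA):
static_assert((4 + 16 + 16) * 16 == 36 * 16,
	"36 slots of 16 bytes of shared memory per hash");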
const uint32_t x0, const uint32_t x1, + const uint32_t x2, const uint32_t x3) + { + x = x0; + y = x1; + z = x2; + w = x3; + } + + __forceinline__ __device__ __m128i( const int x0) + { + x = x0; + y = x0; + z = x0; + w = x0; + } + + __forceinline__ __device__ __m128i operator|(const __m128i& other) + { + return __m128i( + x | other.x, + y | other.y, + z | other.z, + w | other.w + ); + } + + __forceinline__ __device__ __m128i operator^(const __m128i& other) + { + return __m128i( + x ^ other.x, + y ^ other.y, + z ^ other.z, + w ^ other.w + ); + } +}; + +struct __m128 : public float4 +{ + + __forceinline__ __device__ __m128(){} + + __forceinline__ __device__ __m128( + const float x0, const float x1, + const float x2, const float x3) + { + float4::x = x0; + float4::y = x1; + float4::z = x2; + float4::w = x3; + } + + __forceinline__ __device__ __m128( const float x0) + { + float4::x = x0; + float4::y = x0; + float4::z = x0; + float4::w = x0; + } + + __forceinline__ __device__ __m128( const __m128i& x0) + { + float4::x = int2float(x0.x); + float4::y = int2float(x0.y); + float4::z = int2float(x0.z); + float4::w = int2float(x0.w); + } + + __forceinline__ __device__ __m128i get_int( ) + { + return __m128i( + (int)x, + (int)y, + (int)z, + (int)w + ); + } + + __forceinline__ __device__ __m128 operator+(const __m128& other) + { + return __m128( + x + other.x, + y + other.y, + z + other.z, + w + other.w + ); + } + + __forceinline__ __device__ __m128 operator-(const __m128& other) + { + return __m128( + x - other.x, + y - other.y, + z - other.z, + w - other.w + ); + } + + __forceinline__ __device__ __m128 operator*(const __m128& other) + { + return __m128( + x * other.x, + y * other.y, + z * other.z, + w * other.w + ); + } + + __forceinline__ __device__ __m128 operator/(const __m128& other) + { + return __m128( + x / other.x, + y / other.y, + z / other.z, + w / other.w + ); + } + + __forceinline__ __device__ __m128& trunc() + { + x=::truncf(x); + y=::truncf(y); + z=::truncf(z); + w=::truncf(w); + + return *this; + } + + __forceinline__ __device__ __m128& abs() + { + x=::fabsf(x); + y=::fabsf(y); + z=::fabsf(z); + w=::fabsf(w); + + return *this; + } + + __forceinline__ __device__ __m128& floor() + { + x=::floorf(x); + y=::floorf(y); + z=::floorf(z); + w=::floorf(w); + + return *this; + } +}; + + +template<typename T> +__device__ void print(const char* name, T value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%08X ",((uint32_t*)&value)[i]); + } + printf("\n"); +} + +template<> +__device__ void print<__m128>(const char* name, __m128 value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%f ",((float*)&value)[i]); + } + printf("\n"); +} + +#define SHOW(name) print(#name, name) + + +__forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return a + b; +} + +__forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return a - b; +} + +__forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return a * b; +} + +__forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b) +{ + return a / b; +} + +__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) & b), + int_as_float(float_as_int(a.y) & b), + int_as_float(float_as_int(a.z) & b), + int_as_float(float_as_int(a.w) & b) + ); +} + +__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) | b), + int_as_float(float_as_int(a.y) | b), + 
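// The structs and _mm_* wrappers in this header re-create just enough of the
// SSE intrinsics API on top of CUDA's int4/float4 so that the cn_gpu math can
// mirror the CPU (SSSE3/AVX2) implementation. A small usage sketch, assuming
// only the types and helpers defined in this file:
__device__ void m128_emulation_sketch()
{
	__m128 v(1.5f);                  // broadcast constructor
	v = _mm_mul_ps(v, __m128(2.0f)); // lane-wise multiply, as on SSE
	__m128i w = v.get_int();         // truncating float -> int per lane
	w = w ^ __m128i(0x7F);           // lane-wise xor via operator^
	(void)w;                         // sketch only, silence unused warning
}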
int_as_float(float_as_int(a.z) | b), + int_as_float(float_as_int(a.w) | b) + ); +} + +__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) ^ b), + int_as_float(float_as_int(a.y) ^ b), + int_as_float(float_as_int(a.z) ^ b), + int_as_float(float_as_int(a.w) ^ b) + ); +} + +__forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) +{ + __m128 d(dc); + __m128 c = _mm_div_ps(v, d); + c.trunc(); // same as _mm_round_ps(c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) on SSE + // SSE2 alternative: c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); +} + +__forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return a ^ b; +} + + +__forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) +{ + const uint32_t right = 8 * rot; + const uint32_t left = (32 - 8 * rot); + return __m128i( + ((uint32_t)a.x >> right) | ( a.y << left ), + ((uint32_t)a.y >> right) | ( a.z << left ), + ((uint32_t)a.z >> right) | ( a.w << left ), + ((uint32_t)a.w >> right) | ( a.x << left ) + ); +} + +template<uint32_t MASK> +__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } + + +__forceinline__ __device__ __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setting the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +// 9 +__forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + // Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, 0.734375f); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + c = _mm_add_ps(c, r); +} + +// 9*8 + 2 = 74 +__forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n(0.0f), d(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows when dividing by values < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + r = _mm_add_ps(r, _mm_div_ps(n, d)); +} + +// 74*8 = 592 +__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c(cnt); + // 35 maths calls follow (140 FLOPS) + __m128 r = __m128(0.0f); + for(int i = 0; i < 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, c, r); + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + sum = r; // 34 + r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 + return r.get_int(); +} + +__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, __m128i v0, __m128i v1, __m128i v2, __m128i v3, float cnt, __m128 rnd_c, __m128& sum, 
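// The and/or pairs above (0x807FFFFF | 0x40000000, 0xFF7FFFFF | 0x40000000)
// are IEEE-754 bit tricks: keep sign and mantissa, force the exponent field to
// 128, which pins the magnitude into [2, 4). That is what makes the "quick
// fmod" cheap and keeps the divisor away from zero. The same trick on a
// scalar float, host-side (sketch; needs <cstring> for memcpy):
inline float force_magnitude_2_4(float f)
{
	uint32_t bits;
	memcpy(&bits, &f, sizeof bits);            // bit-exact type pun
	bits = (bits & 0x807FFFFFu) | 0x40000000u; // sign + mantissa, exp = 128
	memcpy(&f, &bits, sizeof bits);
	return f;                                  // |f| is now in [2, 4)
}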
__m128i& out) +{ + __m128 n0(v0); + __m128 n1(v1); + __m128 n2(v2); + __m128 n3(v3); + + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + out = rot == 0 ? r : _mm_alignr_epi8(r, rot); +} + +__constant__ uint32_t look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +__constant__ float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + +template<size_t ITERATIONS, uint32_t MEMORY> +__global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) +{ + static constexpr uint32_t MASK = ((MEMORY-1) >> 6) << 6; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + + extern __shared__ __m128i smemExtern_in[]; + + const uint32_t chunk = threadIdx.x / 16; + const uint32_t numHashPerBlock = blockDim.x / 16; + + int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); + + __m128i* smem = smemExtern_in + 4 * chunk; + + __m128i* smemExtern = smemExtern_in + numHashPerBlock * 4; + __m128i* smemOut = smemExtern + 16 * chunk; + + smemExtern = smemExtern + numHashPerBlock * 16; + __m128* smemVa = (__m128*)smemExtern + 16 * chunk; + + uint32_t tid = threadIdx.x % 16; + + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + uint32_t s = 0; + + __m128 vs(0); + if(partidx != 0) + { + vs = ((__m128*)roundVs)[idxHash]; + s = roundS[idxHash]; + } + else + { + s = ((uint32_t*)spad)[idxHash * 50] >> 8; + } + + for(size_t i = 0; i < batchsize; i++) + { + __syncthreads(); + + ((int*)smem)[tid] = ((int*)scratchpad_ptr<MASK>(s, tid/4, lpad))[tid%4]; + __syncthreads(); + + __m128 rc = vs; + + + single_comupte_wrap( + tid%4, + *(smem + look[tid][0]), + *(smem + look[tid][1]), + *(smem + look[tid][2]), + *(smem + look[tid][3]), + ccnt[tid], rc, smemVa[tid], + smemOut[tid] + ); + + __syncthreads(); + + if(tid % 4 == 0) + { + __m128i out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]); + __m128i out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]); + out = _mm_xor_si128(out, out2); + smemOut[tid] = out; + *scratchpad_ptr<MASK>(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out); + } + __syncthreads(); + + + __m128i out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12]; + + if(tid%2 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 1]; + + if(tid%4 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 2]; + if(tid%8 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 4]; + if(tid%16 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 8]; + vs = smemVa[0]; + + vs.abs(); // take abs(va) by masking the float sign bit + auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); + // vs range 0 - 64 + *smem = xx.get_int(); + *smem = _mm_xor_si128(*smem, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, __m128(64.0f)); + s = smem->x ^ smem->y ^ smem->z ^ smem->w; + } + if(partidx != ((1<<bfactor) - 1) && threadIdx.x % 16 == 0) + { + const uint32_t numHashPerBlock2 = blockDim.x / 16; + const uint32_t idxHash2 = blockIdx.x * numHashPerBlock2 + threadIdx.x/16; + ((__m128*)roundVs)[idxHash2] = vs; + roundS[idxHash2] = s; + } +} + +__forceinline__ __device__ void 
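// generate_512 below expands the 200-byte Keccak state into one 512-byte
// scratchpad chunk by running keccakf three times and storing 10 + 11 + 11
// ulonglong2 words of 16 bytes: 160 + 176 + 176 = 512. Quick check:
static_assert((10 + 11 + 11) * 16 == 512,
	"three keccakf squeezes fill exactly one 512-byte chunk");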
generate_512(uint64_t idx, const uint64_t* in, uint8_t* out) +{ + uint64_t hash[25]; + + hash[0] = in[0] ^ idx; + #pragma unroll 24 + for(int i = 1; i < 25; ++i) + hash[i] = in[i]; + + cn_keccakf2(hash); + #pragma unroll 10 + for(int i = 0; i < 10; ++i) + ((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i]; + out+=160; + + cn_keccakf2(hash); + #pragma unroll 11 + for(int i = 0; i < 11; ++i) + ((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i]; + out+=176; + + cn_keccakf2(hash); + #pragma unroll 11 + for(int i = 0; i < 11; ++i) + ((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i]; +} + +template<size_t MEMORY> +__global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in) +{ + __shared__ uint64_t state[25]; + + uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY; + uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200); + + constexpr size_t hash_size = 200; // 25x8 bytes + memcpy(state, spad, hash_size); + + for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) + { + generate_512(i, state, (uint8_t*)lpad + i*512); + } +} + +} // namespace xmrstak +} // namespace nvidia diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 7149f37a661187a9dad8a3032cc55067be446c1e..e4574e20af4896b97709691cfe46ac8383518011 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -9,21 +9,6 @@ #include <algorithm> #include "xmrstak/jconf.hpp" -#ifdef __CUDACC__ -__constant__ -#else -const -#endif -uint64_t keccakf_rndc[24] ={ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; typedef unsigned char BitSequence; typedef unsigned long long DataLength; @@ -184,7 +169,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 __shared__ uint32_t sharedMemory[1024]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { cn_aes_gpu_init( sharedMemory ); __syncthreads( ); @@ -201,7 +187,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 for ( i = 0; i < 50; i++ ) state[i] = ctx_state[i]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { uint32_t key[40]; @@ -220,33 +207,46 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } cn_keccakf2( (uint64_t *) state ); - switch ( ( (uint8_t *) state )[0] & 0x03 ) + if(ALGO == cryptonight_gpu) { - case 0: - cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); - break; - case 1: - cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 2: - cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 3: - 
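// Work split of cn_explode_gpu above: one CUDA block per hash, and the 2 MiB
// scratchpad is produced in 512-byte chunks, so every block strides over
// 2 * 1024 * 1024 / 512 = 4096 generate_512 calls:
static_assert(2u * 1024u * 1024u / 512u == 4096u,
	"one cryptonight_gpu scratchpad is 4096 chunks of 512 bytes");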
cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - default: - break; + if ( ((uint64_t*)state)[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + + if(idx < 10) + d_res_nonce[idx] = thread; + } } + else + { + switch ( ( (uint8_t *) state )[0] & 0x03 ) + { + case 0: + cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + break; + case 1: + cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 2: + cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 3: + cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + default: + break; + } - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries - if ( hash[3] < target ) - { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + if ( hash[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); - if(idx < 10) - d_res_nonce[idx] = thread; + if(idx < 10) + d_res_nonce[idx] = thread; + } } } @@ -373,6 +373,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_turtle> << <grid, block >> > (wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } + else if(miner_algo == cryptonight_gpu) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -426,6 +431,15 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); } + else if(miner_algo == cryptonight_gpu) + { + // cryptonight_gpu needs its own final kernel: a direct 64-bit compare of the keccak state against the target, no extra hash + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", + cryptonight_extra_gpu_final<cryptonight_gpu><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) + ); + } else { // fallback for all other algorithms diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index 99c651645ecbb97fcb810cc3d0ab1a82596df198..c75c74964f233c89a54532117824aae52003859f 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -1,3 +1,23 @@ +#pragma once + +#include "cuda_extra.hpp" + +#ifdef __CUDACC__ +__constant__ +#else +const +#endif +uint64_t keccakf_rndc[24] ={ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 
0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + #if __CUDA_ARCH__ >= 350 __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) { diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 165595c5f2c25599c51d21c117c813301ec64e20..80e6002d7ddfb38835d90a559dfbc9c7ceb6ddb3 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -103,6 +103,7 @@ xmrstak::coin_selection coins[] = { { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "cryptonight_gpu", {cryptonight_gpu, cryptonight_gpu, 255u}, {cryptonight_gpu, cryptonight_gpu, 0u}, nullptr }, { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, { "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 2f4e2a11f5ff0649eb3f5433df76ecd94f6cbddc..c475c4129f141a9d11a1025bb88bf96872b3795a 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -560,6 +560,12 @@ void executor::ex_main() else pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true); break; + case cryptonight_gpu: + if(dev_tls) + pools.emplace_front(0, "donate.xmr-stak.net:8811", "", "", "", 0.0, true, true, "", false); + else + pools.emplace_front(0, "donate.xmr-stak.net:5511", "", "", "", 0.0, true, false, "", false); + break; case cryptonight_monero_v8: case cryptonight_monero: case cryptonight_turtle: