Skip to content
Snippets Groups Projects
Commit 0c1d805a authored by psychocrypt's avatar psychocrypt
Browse files

CUDA: optimize cn-heavy div


Port the OpenCL-optimized division to CUDA.

Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com>
parent 447fef4b
No related branches found
No related tags found
No related merge requests found
......@@ -9,6 +9,7 @@
#include "xmrstak/jconf.hpp"
#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
#ifdef _WIN32
......@@ -647,7 +648,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
{
int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
int64_t q = n / (d | 0x5);
int64_t q = fast_div_heavy(n, (d | 0x5));
if(sub&1)
storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
......@@ -658,7 +659,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
{
int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
int64_t q = n / (d | 0x5);
int64_t q = fast_div_heavy(n, (d | 0x5));
if(sub&1)
storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
......@@ -840,9 +841,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
{
typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
if(miner_algo == invalid_algo) return;
static const cuda_hash_fn func_table[] = {
cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 0>,
cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 1>,
......
#pragma once
#include <stdint.h>
/**
 * Signed division of a 64-bit dividend by a 32-bit divisor, built from
 * single-precision reciprocal estimates instead of a 64-bit integer divide.
 *
 * The quotient magnitude is assembled from three partial quotients
 * (q1 + q2 + q3), each derived from the remainder left by the previous one,
 * and the sign is applied at the end.
 *
 * @param _a signed 64-bit dividend
 * @param _b signed 32-bit divisor
 * @return   the signed quotient estimate for _a / _b
 *
 * NOTE(review): exactness over the full input range is not demonstrated by
 * this code alone; verify against a reference integer division before using
 * it with operands other than the ones it was tuned for.
 * NOTE(review): abs() of INT64_MIN / INT_MIN overflows; confirm callers never
 * pass those values (or that the wrapped result is acceptable).
 * NOTE(review): there is no guard for _b == 0; presumably callers guarantee a
 * non-zero divisor - confirm.
 */
__device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
{
// Work on magnitudes; the sign is restored in the return statement.
uint64_t a = abs(_a);
int b = abs(_b);
// Single-precision reciprocal of the divisor (round-to-nearest).
float rcp = __frcp_rn(__int2float_rn(b));
// Adding 32 to the exponent field of the float's bit pattern scales the
// reciprocal by 2^32 (assumes the exponent does not overflow).
float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23));
// First partial quotient: word [1] of `a` (the upper 32 bits, assuming a
// little-endian layout) times the 2^32-scaled reciprocal, truncated toward zero.
uint64_t q1 = __float2ull_rz(__int2float_rn(((int*)&a)[1]) * rcp2);
// Remove the part of the dividend accounted for by q1.
a -= q1 * static_cast<uint32_t>(b);
// Second partial quotient: take word [0] of (a >> 12), multiply by the
// reciprocal, then add 12 to the exponent field to undo the shift (x 2^12).
uint64_t tmp = a >> 12;
float q2f = __int2float_rn(((int*)&tmp)[0]) * rcp;
q2f = __uint_as_float(__float_as_uint(q2f) + (12U << 23));
int64_t q2 = __float2ll_rn(q2f);
// Remaining part after q2, computed in 32-bit arithmetic on the low words only.
int a2 = ((int*)&a)[0] - ((int*)&q2)[0] * b;
// Third partial quotient from the 32-bit remainder, rounded to nearest.
int q3 = __float2int_rn(__int2float_rn(a2) * rcp);
// If q3 overshot (a2 - q3 * b is negative), the shift by 31 yields -1
// (assuming an arithmetic right shift) and q3 is decremented by one.
q3 += (a2 - q3 * b) >> 31;
const uint64_t q = q1 + q2 + q3;
// Negate when the sign bits of _a (taken from its upper word) and _b differ;
// -q on the unsigned value is a two's-complement negation.
return ((((int*)&_a)[1] ^ _b) < 0) ? -q : q;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment