Skip to content
Snippets Groups Projects
Unverified Commit c1eac4ac authored by fireice-uk's avatar fireice-uk Committed by GitHub
Browse files

Merge pull request #2045 from psychocrypt/topic-speedupAMDCNHeavy

AMD: speedup cryptonight_heavy division
parents e9f7b837 bfb3243c
No related branches found
No related tags found
No related merge requests found
......@@ -903,6 +903,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
const char *fastDivHeavyCL =
#include "./opencl/fast_div_heavy.cl"
;
const char *cryptonightCL =
#include "./opencl/cryptonight.cl"
;
......@@ -924,6 +927,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
std::string source_code(cryptonightCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
......
......@@ -80,6 +80,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
//#include "opencl/fast_int_math_v2.cl"
XMRSTAK_INCLUDE_FAST_INT_MATH_V2
//#include "fast_div_heavy.cl"
XMRSTAK_INCLUDE_FAST_DIV_HEAVY
//#include "opencl/wolf-aes.cl"
XMRSTAK_INCLUDE_WOLF_AES
//#include "opencl/wolf-skein.cl"
......@@ -802,14 +804,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#if (ALGO == 4 || ALGO == 10)
long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
idx0 = (d ^ q) & MASK;
// cryptonight_haven
#elif (ALGO == 9)
long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
idx0 = ((~d) ^ q) & MASK;
#endif
......
R"===(
#ifndef FAST_DIV_HEAVY_CL
#define FAST_DIV_HEAVY_CL
inline ulong get_reciprocal_heavy(uint a)
{
const uint shift = clz(a);
a <<= shift;
const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);
const float r = native_recip(a_hi);
const uint tmp0 = as_uint(r);
const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
const float r_scaled = as_float(tmp1);
const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
const float r_scaled_hi = as_float(tmp1 & ~4095U);
const float h_hi = as_float(as_uint(h) & ~4095U);
const float r_scaled_lo = r_scaled - r_scaled_hi;
const float h_lo = h - h_hi;
const float x1 = h_hi * r_scaled_hi;
const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;
const long h1 = convert_long_rte(x1);
const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));
const ulong result = tmp0 & 0xFFFFFF;
return (result << (shift + 9)) - ((h1 + h2) >> 2);
}
inline long fast_div_heavy(long _a, int _b)
{
const ulong a = abs(_a);
const uint b = abs(_b);
ulong q = mul_hi(a, get_reciprocal_heavy(b));
const long tmp = a - q * b;
const int overshoot = (tmp < 0) ? 1 : 0;
const int undershoot = (tmp >= b) ? 1 : 0;
q += undershoot - overshoot;
return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
}
#endif
)==="
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment