Skip to content
Snippets Groups Projects
Commit e6177f1c authored by SChernykh, committed by psychocrypt
Browse files

optimize cn-heavy div

x-ref: https://github.com/xmrig/xmrig-amd/pull/192
parent 28ef8e3d
No related branches found
No related tags found
No related merge requests found
......@@ -2,49 +2,24 @@ R"===(
 #ifndef FAST_DIV_HEAVY_CL
 #define FAST_DIV_HEAVY_CL
-inline ulong get_reciprocal_heavy(uint a)
+inline long fast_div_heavy(long _a, int _b)
 {
-const uint shift = clz(a);
-a <<= shift;
-const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
-const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);
-const float r = native_recip(a_hi);
-const uint tmp0 = as_uint(r);
-const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
-const float r_scaled = as_float(tmp1);
-const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
-const float r_scaled_hi = as_float(tmp1 & ~4095U);
-const float h_hi = as_float(as_uint(h) & ~4095U);
+long a = abs(_a);
+int b = abs(_b);
-const float r_scaled_lo = r_scaled - r_scaled_hi;
-const float h_lo = h - h_hi;
+float rcp = native_recip(convert_float_rte(b));
+float rcp2 = as_float(as_uint(rcp) + (32U << 23));
-const float x1 = h_hi * r_scaled_hi;
-const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;
+ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2);
+a -= q1 * as_uint(b);
-const long h1 = convert_long_rte(x1);
-const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));
-const ulong result = tmp0 & 0xFFFFFF;
-return (result << (shift + 9)) - ((h1 + h2) >> 2);
-}
-inline long fast_div_heavy(long _a, int _b)
-{
-const ulong a = abs(_a);
-const uint b = abs(_b);
-ulong q = mul_hi(a, get_reciprocal_heavy(b));
+long q2 = convert_long_rte(convert_float_rtn(a) * rcp);
+int a2 = as_int2(a).s0 - as_int2(q2).s0 * b;
-const long tmp = a - q * b;
-const int overshoot = (tmp < 0) ? 1 : 0;
-const int undershoot = (tmp >= b) ? 1 : 0;
-q += undershoot - overshoot;
+int q3 = convert_int_rte(convert_float_rte(a2) * rcp);
+q3 += (a2 - q3 * b) >> 31;
+const long q = q1 + q2 + q3;
 return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
 }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment