Optimize cn-heavy division (fast_div_heavy)

x-ref: https://github.com/xmrig/xmrig-amd/pull/192

Optimize cn-heavy division (fast_div_heavy)
x-ref: https://github.com/xmrig/xmrig-amd/pull/192
e6177f1c · SChernykh · psychocrypt · 28ef8e3d · e6177f1c
Commit e6177f1c, authored 6 years ago by SChernykh; committed by psychocrypt 6 years ago.
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -2,49 +2,24 @@ R"===(
 #ifndef FAST_DIV_HEAVY_CL
 #define FAST_DIV_HEAVY_CL

-inline ulong get_reciprocal_heavy(uint a)
+inline long fast_div_heavy(long _a, int _b)
 {
-	const uint shift = clz(a);
-	a <<= shift;
-
-	const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
-	const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);
-
-	const float r = native_recip(a_hi);
-
-	const uint tmp0 = as_uint(r);
-	const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
-	const float r_scaled = as_float(tmp1);
-
-	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
-
-	const float r_scaled_hi = as_float(tmp1 & ~4095U);
-	const float h_hi = as_float(as_uint(h) & ~4095U);
+	long a = abs(_a);
+	int b = abs(_b);

-	const float r_scaled_lo = r_scaled - r_scaled_hi;
-	const float h_lo = h - h_hi;
+	float rcp = native_recip(convert_float_rte(b));
+	float rcp2 = as_float(as_uint(rcp) + (32U << 23));

-	const float x1 = h_hi * r_scaled_hi;
-	const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;
+	ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2);
+	a -= q1 * as_uint(b);

-	const long h1 = convert_long_rte(x1);
-	const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));
-
-	const ulong result = tmp0 & 0xFFFFFF;
-	return (result << (shift + 9)) - ((h1 + h2) >> 2);
-}
-
-inline long fast_div_heavy(long _a, int _b)
-{
-	const ulong a = abs(_a);
-	const uint b = abs(_b);
-	ulong q = mul_hi(a, get_reciprocal_heavy(b));
+	long q2 = convert_long_rte(convert_float_rtn(a) * rcp);
+	int a2 = as_int2(a).s0 - as_int2(q2).s0 * b;

-	const long tmp = a - q * b;
-	const int overshoot = (tmp < 0) ? 1 : 0;
-	const int undershoot = (tmp >= b) ? 1 : 0;
-	q += undershoot - overshoot;
+	int q3 = convert_int_rte(convert_float_rte(a2) * rcp);
+	q3 += (a2 - q3 * b) >> 31;

+	const long q = q1 + q2 + q3;
 	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
 }