Skip to content
Snippets Groups Projects
Unverified Commit c6846a82 authored by fireice-uk's avatar fireice-uk Committed by GitHub
Browse files

Merge pull request #2086 from psychocrypt/topic-amdOptimizeCNv8Div

OpenCl: optimize cn-v8 div
parents b06747f9 bff5b000
No related branches found
No related tags found
No related merge requests found
......@@ -68,61 +68,35 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a)
inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
{
const uint r = get_reciprocal((const __local uchar *)RCP, b);
const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
ulong q;
((uint*)&q)[0] = as_uint2(k).s1;
#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14)
/* The AMD driver 14.XX is not able to compile `(k < a)`
* https://github.com/fireice-uk/xmr-stak/issues/1922
* This is a workaround for the broken compiler.
*/
ulong whyAMDwhy;
((uint*)&whyAMDwhy)[0] = as_uint2(k).s0;
((uint*)&whyAMDwhy)[1] = as_uint2(k).s1;
((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U;
#else
((uint*)&q)[1] = (k < a) ? 1U : 0U;
#endif
const long tmp = a - q * b;
const bool overshoot = (tmp < 0);
const bool undershoot = (tmp >= b);
return (uint2)(
as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U),
as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U)
);
const uint r = get_reciprocal((const __local uchar *)RCP, b);
const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
const uint q = as_uint2(k).s1;
long tmp = a - ((ulong)(q) * b);
((int*)&tmp)[1] -= (as_uint2(k).s1 < as_uint2(a).s1) ? b : 0;
const int overshoot = ((int*)&tmp)[1] >> 31;
const int undershoot = as_int2(as_uint(b - 1) - tmp).s1 >> 31;
return (uint2)(q + overshoot - undershoot, as_uint2(tmp).s0 + (as_uint(overshoot) & b) - (as_uint(undershoot) & b));
}
inline uint fast_sqrt_v2(const ulong n1)
{
float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
float x1 = native_rsqrt(x);
x = native_sqrt(x);
// The following line does x1 *= 4294967296.0f;
x1 = as_float(as_uint(x1) + (32U << 23));
const uint x0 = as_uint(x) - (158U << 23);
const long delta0 = n1 - (((long)(x0) * x0) << 18);
const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
uint result = (x0 << 10) + convert_int_rte(delta);
const uint s = result >> 1;
const uint b = result & 1;
const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
if ((long)(x2 + b) > 0) --result;
if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
return result;
float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
float x1 = native_rsqrt(x);
x = native_sqrt(x);
// The following line does x1 *= 4294967296.0f;
x1 = as_float(as_uint(x1) + (32U << 23));
const uint x0 = as_uint(x) - (158U << 23);
const long delta0 = n1 - (as_ulong((uint2)(mul24(x0, x0), mul_hi(x0, x0))) << 18);
const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
uint result = (x0 << 10) + convert_int_rte(delta);
const uint s = result >> 1;
const uint b = result & 1;
const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
if ((long)(x2 + as_int(b - 1)) >= 0) --result;
if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
return result;
}
#endif
)==="
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment