Skip to content
Snippets Groups Projects
Unverified Commit a8d09606 authored by fireice-uk's avatar fireice-uk Committed by GitHub
Browse files

Merge pull request #2104 from psychocrypt/topic-optimizev2Reciprocal

OpenCL: optimize reciprocal calculation
parents 19331413 bc91088a
No related branches found
No related tags found
No related merge requests found
......@@ -567,7 +567,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
// cryptonight_monero_v8
#if(ALGO==11)
# ifdef __clang__
__local uint RCP[256];
# endif
uint2 division_result;
uint sqrt_result;
#endif
......@@ -579,7 +582,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
// cryptonight_monero_v8
#if(ALGO==11)
#if(ALGO==11 && defined(__clang__))
RCP[i] = RCP_C[i];
#endif
}
......@@ -714,7 +717,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
// We drop the highest bit to fit both quotient and remainder in 32 bits
# ifdef __clang__
division_result = fast_div_v2(RCP, c[1], d);
# else
division_result = fast_div_v2(c[1], d);
# endif
// Use division_result as an input for the square root to prevent parallel implementation in hardware
sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
......
......@@ -42,6 +42,9 @@ static const __constant uint RCP_C[256] =
0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
};
// ROCm produces invalid results if the get_reciprocal variant without the lookup table is used
#ifdef __clang__
inline uint get_reciprocal(const __local uchar *RCP, uint a)
{
const uint index1 = (a & 0x7F000000U) >> 21;
......@@ -66,9 +69,33 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a)
return as_uint2(k).s1 + (b ? r : 0);
}
#else
// Table-free variant of get_reciprocal: starts from the native_recip() estimate
// and applies one floating-point correction term instead of reading a lookup table.
// a: the divisor. NOTE(review): the exponent trick below appears to rely on bit 31
//    of `a` being set (the visible kernel call site ORs 0x80000001 into the
//    divisor) - confirm before calling with other inputs.
// Returns the 32-bit value that fast_div_v2 uses as its reciprocal factor `r`.
inline uint get_reciprocal(uint a)
{
// Reinterpret the upper 24 bits of `a` as a float bit pattern: (a >> 8) lands in the
// mantissa field and (126 + 31) << 23 is added into the exponent field; if bit 23 of
// (a >> 8) is set, the addition carries it into the exponent.
const float a_hi = as_float((a >> 8) + ((126U + 31U) << 23));
// The low 8 bits that were shifted out above, converted to float.
const float a_lo = convert_float_rte(a & 0xFF);
// Approximate reciprocal of the high part; native_recip precision is implementation-defined.
const float r = native_recip(a_hi);
// Same bits as r with 64 added to the exponent field, i.e. r * 2^64 barring exponent overflow.
const float r_scaled = as_float(as_uint(r) + (64U << 23));
// h = a_hi * r - 1 + a_lo * r: how far r is from being the reciprocal of (a_hi + a_lo).
const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
// (as_uint(r) << 9) shifts out the sign and exponent bits, leaving r's mantissa bits at the
// top of the word; the scaled error term, rounded to nearest even, is then subtracted.
// NOTE(review): this has the shape of a single Newton-Raphson refinement - confirm.
return (as_uint(r) << 9) - convert_int_rte(h * r_scaled);
}
#endif
#ifdef __clang__
inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
{
const uint r = get_reciprocal((const __local uchar *)RCP, b);
const uint r = get_reciprocal((const __local uchar *)RCP, b);
#else
inline uint2 fast_div_v2(ulong a, uint b)
{
const uint r = get_reciprocal(b);
#endif
const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
const uint q = as_uint2(k).s1;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment