From bd96c36a61eaa3d20c01a3f302ccde5d7c550732 Mon Sep 17 00:00:00 2001 From: psychocrypt <psychocryptHPC@gmail.com> Date: Mon, 11 Mar 2019 20:34:38 +0100 Subject: [PATCH] cryptonight_r without mediaops Add support for non media ops for cryptonight_r --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 69 +------------------ .../backend/amd/amd_gpu/opencl/wolf-aes.cl | 64 +++++++++++++++++ 2 files changed, 66 insertions(+), 67 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 0e08d00..12478ae 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -32,69 +32,6 @@ R"===( #define cryptonight_conceal 14 #define cryptonight_v8_reversewaltz 17 -/* For Mesa clover support */ -#ifdef cl_clang_storage_class_specifiers -# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable -#endif - -#ifdef cl_amd_media_ops -#pragma OPENCL EXTENSION cl_amd_media_ops : enable -#else -/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt - * Build-in Function - * uintn amd_bitalign (uintn src0, uintn src1, uintn src2) - * Description - * dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31)) - * similar operation applied to other components of the vectors. - * - * The implemented function is modified because the last is in our case always a scalar. - * We can ignore the bitwise AND operation. - */ -inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) -{ - uint2 result; - result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2)); - result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2)); - return result; -} -#endif - -#ifdef cl_amd_media_ops2 -#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable -#else -/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt - * Built-in Function: - * uintn amd_bfe (uintn src0, uintn src1, uintn src2) - * Description - * NOTE: operator >> below represent logical right shift - * offset = src1.s0 & 31; - * width = src2.s0 & 31; - * if width = 0 - * dst.s0 = 0; - * else if (offset + width) < 32 - * dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width); - * else - * dst.s0 = src0.s0 >> offset; - * similar operation applied to other components of the vectors - */ -inline int amd_bfe(const uint src0, const uint offset, const uint width) -{ - /* casts are removed because we can implement everything as uint - * int offset = src1; - * int width = src2; - * remove check for edge case, this function is always called with - * `width==8` - * @code - * if ( width == 0 ) - * return 0; - * @endcode - */ - if ( (offset + width) < 32u ) - return (src0 << (32u - offset - width)) >> (32u - width); - - return src0 >> offset; -} -#endif static const __constant ulong keccakf_rndc[24] = { @@ -128,6 +65,8 @@ static const __constant uchar sbox[256] = 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16 }; +//#include "opencl/wolf-aes.cl" +XMRSTAK_INCLUDE_WOLF_AES void keccakf1600(ulong *s) { @@ -355,8 +294,6 @@ inline uint getIdx() XMRSTAK_INCLUDE_FAST_INT_MATH_V2 //#include "fast_div_heavy.cl" XMRSTAK_INCLUDE_FAST_DIV_HEAVY -//#include "opencl/wolf-aes.cl" -XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" XMRSTAK_INCLUDE_WOLF_SKEIN //#include "opencl/jh.cl" @@ -461,8 +398,6 @@ void CNKeccak(ulong *output, ulong *input) static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 }; -#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) - #define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)]) void AESExpandKey256(uint *keybuf) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl index b99b62d..f1457c0 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -2,6 +2,70 @@ R"===( #ifndef WOLF_AES_CL #define WOLF_AES_CL +/* For Mesa clover support */ +#ifdef cl_clang_storage_class_specifiers +# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable +#endif + +#ifdef cl_amd_media_ops +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +#else +/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt + * Build-in Function + * uintn amd_bitalign (uintn src0, uintn src1, uintn src2) + * Description + * dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31)) + * similar operation applied to other components of the vectors. + * + * The implemented function is modified because the last is in our case always a scalar. + * We can ignore the bitwise AND operation. + */ +inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) +{ + uint2 result; + result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2)); + result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2)); + return result; +} +#endif + +#ifdef cl_amd_media_ops2 +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable +#else +/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt + * Built-in Function: + * uintn amd_bfe (uintn src0, uintn src1, uintn src2) + * Description + * NOTE: operator >> below represent logical right shift + * offset = src1.s0 & 31; + * width = src2.s0 & 31; + * if width = 0 + * dst.s0 = 0; + * else if (offset + width) < 32 + * dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width); + * else + * dst.s0 = src0.s0 >> offset; + * similar operation applied to other components of the vectors + */ +inline int amd_bfe(const uint src0, const uint offset, const uint width) +{ + /* casts are removed because we can implement everything as uint + * int offset = src1; + * int width = src2; + * remove check for edge case, this function is always called with + * `width==8` + * @code + * if ( width == 0 ) + * return 0; + * @endcode + */ + if ( (offset + width) < 32u ) + return (src0 << (32u - offset - width)) >> (32u - width); + + return src0 >> offset; +} +#endif + // AES table - the other three are generated on the fly static const __constant uint AES0_C[256] = -- GitLab