Build the CryptoNight v8 assembly main loop per platform.

Summary of the changes carried by this patch:
  * CMakeLists.txt: define the static library xmr-stak-asm before the C
    library. When CMAKE_C_COMPILER_ID matches "MSVC" it is built from the
    .asm source with ASM_MASM enabled, otherwise from the .S source with ASM
    enabled. xmr-stak-asm is then linked into xmr-stak-c,
    xmrstak_cuda_backend, xmrstak_opencl_backend and the xmr-stak binary.
  * cryptonigh_v8_main_loop.S: exported symbols go through FN_PREFIX(), which
    prepends '_' when __APPLE__ is defined; ALIGN 64 becomes ALIGN 8.
  * cryptonigh_v8_main_loop.asm: ALIGN 64 becomes ALIGN 8 and the win64
    include files are used.
  * The shared *.inc files are renamed to *_linux.inc (labels lose their '$'
    prefix) and new *_win64.inc files are added for the MASM build.

Review notes:
  * The two set_property(SOURCE ...) calls name the LANGUAGE property
    explicitly. Without the property name the call sets a property that is
    literally called "ASM_MASM" / "C" and has no effect on compilation.
  * The hunk that only removed the newline at the end of CMakeLists.txt has
    been left out, so the file keeps its trailing newline.
  * Line breaks and indentation were reconstructed; the index hashes are the
    ones recorded with the original change. If context lines do not match
    the tree byte-for-byte, apply with whitespace-insensitive matching.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf439227fe394bb46421614b2c98a01506b073b0..b51eb2ae4838d6dd4f4a0a3aa97e788cd0843e17 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -445,6 +445,26 @@ if(CMAKE_LINK_STATIC)
     endif()
 endif()
 
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    # asm optimized monero v8 code
+    enable_language(ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY LANGUAGE ASM_MASM)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm"
+    )
+else()
+    # asm optimized monero v8 code
+    enable_language(ASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+    )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
 # compile C files
 file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
 
@@ -456,17 +476,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
 if(MICROHTTPD_ENABLE)
     target_link_libraries(xmr-stak-c ${MHTD})
 endif()
-target_link_libraries(xmr-stak-c ${LIBS})
-
-enable_language(ASM)
-set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
-# asm optimized monero v8 code
-add_library(xmr-stak-asm
-    STATIC
-    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
-)
-set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
-
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -509,7 +519,7 @@ if(CUDA_FOUND)
         )
     endif()
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
-    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile AMD backend
@@ -522,7 +532,7 @@ if(OpenCL_FOUND)
         ${OPENCLSRCFILES}
     )
     target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
-    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile final binary
@@ -538,7 +548,7 @@ endif()
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
 
 ################################################################################
 # Install
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
index cd747f7c56cc5aaebfdb64b9cf263005b68fa327..736dac7de0234af9601f3a184ed52478ddf9289b 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
@@ -1,21 +1,27 @@
 #define ALIGN .align
 .intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
 .section .text
-.global cryptonigh_v8_mainloop_ivybridge_asm
-.global cryptonigh_v8_mainloop_ryzen_asm
+#endif
+.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm)
 
-ALIGN 64
-cryptonigh_v8_mainloop_ivybridge_asm:
+ALIGN 8
+FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ivybridge.inc"
+	#include "cryptonigh_v8_main_loop_ivybridge_linux.inc"
 	add rsp, 48
 	ret 0
 
-ALIGN 64
-cryptonigh_v8_mainloop_ryzen_asm:
+ALIGN 8
+FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ryzen.inc"
+	#include "cryptonigh_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
index 2101a59ce88b53e4c75b3e632ad25d232c1ec929..7f2d6a584af330a5ae0dfa59f265b0c8323010b0 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
@@ -2,15 +2,15 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
 PUBLIC cryptonigh_v8_mainloop_ryzen_asm
 
-ALIGN 64
+ALIGN 8
 cryptonigh_v8_mainloop_ivybridge_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ivybridge.inc
+	INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ivybridge_asm ENDP
 
-ALIGN 64
+ALIGN 8
 cryptonigh_v8_mainloop_ryzen_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ryzen.inc
+	INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ryzen_asm ENDP
 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
similarity index 91%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
index 1cc20b35aed070f5d2fe04aa3251304a02adff9a..23f6cc06069e436731c96055584a0c94eebca570 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
@@ -49,8 +49,8 @@
 	movq xmm0, rcx
 	punpcklqdq xmm5, xmm0
 
-	ALIGN 64
-$main_loop_ivybridge:
+	ALIGN 8
+main_loop_ivybridge:
 	movdqu xmm6, XMMWORD PTR [r10+rbx]
 	lea rdx, QWORD PTR [r10+rbx]
 	mov ecx, r10d
@@ -105,10 +105,10 @@ $main_loop_ivybridge:
 	sqrtsd xmm3, xmm0
 	movq rdx, xmm3
 	test rdx, 524287
-	je $sqrt_fixup_ivybridge
+	je sqrt_fixup_ivybridge
 	psrlq xmm3, 19
 	psubq xmm3, XMMWORD PTR [rsp+16]
-$sqrt_fixup_ivybridge_ret:
+sqrt_fixup_ivybridge_ret:
 
 	mov r9, r10
 	mov rax, rdi
@@ -138,7 +138,7 @@ $sqrt_fixup_ivybridge_ret:
 	and r10d, 2097136
 	xor r11, r12
 	dec rsi
-	jne $main_loop_ivybridge
+	jne main_loop_ivybridge
 
 	ldmxcsr DWORD PTR [rsp]
 	mov rbx, QWORD PTR [rsp+160]
@@ -153,24 +153,24 @@ $sqrt_fixup_ivybridge_ret:
 	pop rdi
 	pop rsi
 	pop rbp
-	jmp $cnv2_main_loop_ivybridge_endp
+	jmp cnv2_main_loop_ivybridge_endp
 
-$sqrt_fixup_ivybridge:
+sqrt_fixup_ivybridge:
 	dec rdx
-	movq r13, -4389456576512
+	movq r13, -4389456576512
 	mov rax, rdx
 	shr rdx, 19
 	shr rax, 20
 	mov rcx, rdx
 	sub rcx, rax
 	add rax, r13
-	movq r13, 4389456576511
+	movq r13, 4389456576511
 	sub rcx, r13
 	mov r13d, -2147483647
 	imul rcx, rax
 	sub rcx, r9
 	adc rdx, 0
 	movq xmm3, rdx
-	jmp $sqrt_fixup_ivybridge_ret
+	jmp sqrt_fixup_ivybridge_ret
 
-$cnv2_main_loop_ivybridge_endp:
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ee7f3171633834ec1684a0908586da8381abbbb7
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,176 @@
+	mov QWORD PTR [rsp+24], rbx
+	push rbp
+	push rsi
+	push rdi
+	push r12
+	push r13
+	push r14
+	push r15
+	sub rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov rax, QWORD PTR [rcx+48]
+	mov r9, rcx
+	xor rax, QWORD PTR [rcx+16]
+	mov esi, 524288
+	mov r8, QWORD PTR [rcx+32]
+	mov r13d, -2147483647
+	xor r8, QWORD PTR [rcx]
+	mov r11, QWORD PTR [rcx+40]
+	mov r10, r8
+	mov rdx, QWORD PTR [rcx+56]
+	movd xmm4, rax
+	xor rdx, QWORD PTR [rcx+24]
+	xor r11, QWORD PTR [rcx+8]
+	mov rbx, QWORD PTR [rcx+224]
+	mov rax, QWORD PTR [r9+80]
+	xor rax, QWORD PTR [r9+64]
+	movd xmm0, rdx
+	mov rcx, QWORD PTR [rcx+88]
+	xor rcx, QWORD PTR [r9+72]
+	movq xmm3, QWORD PTR [r9+104]
+	movaps XMMWORD PTR [rsp+64], xmm6
+	movaps XMMWORD PTR [rsp+48], xmm7
+	movaps XMMWORD PTR [rsp+32], xmm8
+	and r10d, 2097136
+	movd xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movd xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd xmm0, rcx
+	punpcklqdq xmm5, xmm0
+
+	ALIGN 8
+main_loop_ivybridge:
+	movdqu xmm6, XMMWORD PTR [r10+rbx]
+	lea rdx, QWORD PTR [r10+rbx]
+	mov ecx, r10d
+	mov eax, r10d
+	mov rdi, r15
+	xor ecx, 16
+	xor eax, 32
+	xor r10d, 48
+	movd xmm0, r11
+	movd xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc xmm6, xmm7
+	movdqu xmm1, XMMWORD PTR [rax+rbx]
+	movdqu xmm0, XMMWORD PTR [r10+rbx]
+	paddq xmm1, xmm7
+	movdqu xmm2, XMMWORD PTR [rcx+rbx]
+	paddq xmm0, xmm5
+	paddq xmm2, xmm4
+	movdqu XMMWORD PTR [rcx+rbx], xmm0
+	movd rcx, xmm3
+	movdqu XMMWORD PTR [rax+rbx], xmm2
+	mov rax, rcx
+	movdqu XMMWORD PTR [r10+rbx], xmm1
+	shl rax, 32
+	xor rdi, rax
+	movd rbp, xmm6
+	movdqa xmm0, xmm6
+	pxor xmm0, xmm4
+	mov r10, rbp
+	and r10d, 2097136
+	movdqu XMMWORD PTR [rdx], xmm0
+	xor rdi, QWORD PTR [r10+rbx]
+	lea r14, QWORD PTR [r10+rbx]
+	mov r12, QWORD PTR [r10+rbx+8]
+	xor edx, edx
+	lea r9d, DWORD PTR [ecx+ecx]
+	add r9d, ebp
+	movdqa xmm0, xmm6
+	psrldq xmm0, 8
+	or r9d, r13d
+	movd rax, xmm0
+	div r9
+	mov eax, eax
+	shl rdx, 32
+	add rdx, rax
+	lea r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov rax, r9
+	shr rax, 12
+	movd xmm0, rax
+	paddq xmm0, xmm8
+	sqrtsd xmm3, xmm0
+	movd rdx, xmm3
+	test rdx, 524287
+	je sqrt_fixup_ivybridge
+	psrlq xmm3, 19
+	psubq xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov r9, r10
+	mov rax, rdi
+	mul rbp
+
+	xor r9, 16
+	mov rcx, r10
+	xor rcx, 32
+	xor r10, 48
+	add r8, rdx
+	add r11, rax
+	movdqu xmm0, XMMWORD PTR [r10+rbx]
+	movdqu xmm2, XMMWORD PTR [r9+rbx]
+	paddq xmm0, xmm5
+	movdqu xmm1, XMMWORD PTR [rcx+rbx]
+	paddq xmm2, xmm4
+	paddq xmm1, xmm7
+	movdqa xmm5, xmm4
+	movdqu XMMWORD PTR [r9+rbx], xmm0
+	movdqa xmm4, xmm6
+	movdqu XMMWORD PTR [rcx+rbx], xmm2
+	movdqu XMMWORD PTR [r10+rbx], xmm1
+	mov QWORD PTR [r14], r8
+	xor r8, rdi
+	mov r10, r8
+	mov QWORD PTR [r14+8], r11
+	and r10d, 2097136
+	xor r11, r12
+	dec rsi
+	jne main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov rbx, QWORD PTR [rsp+160]
+	movaps xmm6, XMMWORD PTR [rsp+64]
+	movaps xmm7, XMMWORD PTR [rsp+48]
+	movaps xmm8, XMMWORD PTR [rsp+32]
+	add rsp, 80
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rdi
+	pop rsi
+	pop rbp
+	jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+	dec rdx
+	mov r13, -4389456576512
+	mov rax, rdx
+	shr rdx, 19
+	shr rax, 20
+	mov rcx, rdx
+	sub rcx, rax
+	add rax, r13
+	mov r13, 4389456576511
+	sub rcx, r13
+	mov r13d, -2147483647
+	imul rcx, rax
+	sub rcx, r9
+	adc rdx, 0
+	movd xmm3, rdx
+	jmp sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
similarity index 92%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
index c564d8949f783eafc3b55874cb4044140febe257..551ee85734e032268b8415fc889b74bb1fefcf3b 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
@@ -45,8 +45,8 @@
 	movq xmm0, rcx
 	punpcklqdq xmm4, xmm0
 
-	ALIGN 64
-$main_loop_ryzen:
+	ALIGN 8
+main_loop_ryzen:
 	movdqa xmm5, XMMWORD PTR [r10+rbx]
 	movq xmm0, r11
 	movq xmm6, r8
@@ -103,10 +103,10 @@ $main_loop_ryzen:
 	sqrtsd xmm1, xmm0
 	movq rdi, xmm1
 	test rdi, 524287
-	je $sqrt_fixup_ryzen
+	je sqrt_fixup_ryzen
 	shr rdi, 19
 
-$sqrt_fixup_ryzen_ret:
+sqrt_fixup_ryzen_ret:
 	mov rax, rsi
 	mul r14
 
@@ -136,7 +136,7 @@ $sqrt_fixup_ryzen_ret:
 	and r10d, 2097136
 	movdqa xmm3, xmm5
 	dec ebp
-	jne $main_loop_ryzen
+	jne main_loop_ryzen
 
 	ldmxcsr DWORD PTR [rsp]
 	movaps xmm6, XMMWORD PTR [rsp+48]
@@ -152,23 +152,23 @@ $sqrt_fixup_ryzen_ret:
 	pop r13
 	pop r12
 	pop rdi
-	jmp $cnv2_main_loop_ryzen_endp
+	jmp cnv2_main_loop_ryzen_endp
 
-$sqrt_fixup_ryzen:
+sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec rdi
-	movq rdx, 4389456576511
+	movq rdx, 4389456576511
 	mov rax, rdi
 	shr rdi, 19
 	shr rax, 20
 	mov rcx, rdi
 	sub rcx, rax
 	sub rcx, rdx
-	movq rdx, -4389456576512
+	movq rdx, -4389456576512
 	add rax, rdx
 	imul rcx, rax
 	sub rcx, r9
 	adc rdi, 0
-	jmp $sqrt_fixup_ryzen_ret
+	jmp sqrt_fixup_ryzen_ret
 
-$cnv2_main_loop_ryzen_endp:
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
new file mode 100644
index 0000000000000000000000000000000000000000..f70dccef80732cd97bf0e31967f0db50a015ed78
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,174 @@
+	mov QWORD PTR [rsp+16], rbx
+	mov QWORD PTR [rsp+24], rbp
+	mov QWORD PTR [rsp+32], rsi
+	push rdi
+	push r12
+	push r13
+	push r14
+	push r15
+	sub rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov rax, QWORD PTR [rcx+48]
+	mov r9, rcx
+	xor rax, QWORD PTR [rcx+16]
+	mov ebp, 524288
+	mov r8, QWORD PTR [rcx+32]
+	xor r8, QWORD PTR [rcx]
+	mov r11, QWORD PTR [rcx+40]
+	mov r10, r8
+	mov rdx, QWORD PTR [rcx+56]
+	movd xmm3, rax
+	xor rdx, QWORD PTR [rcx+24]
+	xor r11, QWORD PTR [rcx+8]
+	mov rbx, QWORD PTR [rcx+224]
+	mov rax, QWORD PTR [r9+80]
+	xor rax, QWORD PTR [r9+64]
+	movd xmm0, rdx
+	mov rcx, QWORD PTR [rcx+88]
+	xor rcx, QWORD PTR [r9+72]
+	mov rdi, QWORD PTR [r9+104]
+	and r10d, 2097136
+	movaps XMMWORD PTR [rsp+48], xmm6
+	movd xmm4, rax
+	movaps XMMWORD PTR [rsp+32], xmm7
+	movaps XMMWORD PTR [rsp+16], xmm8
+	xorps xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movd xmm7, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movd xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa xmm5, XMMWORD PTR [r10+rbx]
+	movd xmm0, r11
+	movd xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea rdx, QWORD PTR [r10+rbx]
+	lea r9, QWORD PTR [rdi+rdi]
+	shl rdi, 32
+
+	mov ecx, r10d
+	mov eax, r10d
+	xor ecx, 16
+	xor eax, 32
+	xor r10d, 48
+	aesenc xmm5, xmm6
+	movdqa xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa xmm1, XMMWORD PTR [rax+rbx]
+	movdqa xmm0, XMMWORD PTR [r10+rbx]
+	paddq xmm2, xmm3
+	paddq xmm1, xmm6
+	paddq xmm0, xmm4
+	movdqa XMMWORD PTR [rcx+rbx], xmm0
+	movdqa XMMWORD PTR [rax+rbx], xmm2
+	movdqa XMMWORD PTR [r10+rbx], xmm1
+
+	movaps xmm1, xmm8
+	mov rsi, r15
+	xor rsi, rdi
+	movd r14, xmm5
+	movdqa xmm0, xmm5
+	pxor xmm0, xmm3
+	mov r10, r14
+	and r10d, 2097136
+	movdqa XMMWORD PTR [rdx], xmm0
+	xor rsi, QWORD PTR [r10+rbx]
+	lea r12, QWORD PTR [r10+rbx]
+	mov r13, QWORD PTR [r10+rbx+8]
+
+	add r9d, r14d
+	or r9d, -2147483647
+	xor edx, edx
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	movd rax, xmm0
+
+	div r9
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1
+	movd r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq xmm0, xmm7
+	sqrtsd xmm1, xmm0
+	movd rdi, xmm1
+	test rdi, 524287
+	je sqrt_fixup_ryzen
+	shr rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov rax, rsi
+	mul r14
+
+	mov r9d, r10d
+	mov ecx, r10d
+	xor r9d, 16
+	xor ecx, 32
+	xor r10d, 48
+	movdqa xmm0, XMMWORD PTR [r10+rbx]
+	movdqa xmm2, XMMWORD PTR [r9+rbx]
+	movdqa xmm1, XMMWORD PTR [rcx+rbx]
+	paddq xmm0, xmm4
+	paddq xmm2, xmm3
+	paddq xmm1, xmm6
+	movdqa XMMWORD PTR [r9+rbx], xmm0
+	movdqa XMMWORD PTR [rcx+rbx], xmm2
+	movdqa XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa xmm4, xmm3
+	add r8, rdx
+	add r11, rax
+	mov QWORD PTR [r12], r8
+	xor r8, rsi
+	mov QWORD PTR [r12+8], r11
+	mov r10, r8
+	xor r11, r13
+	and r10d, 2097136
+	movdqa xmm3, xmm5
+	dec ebp
+	jne main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps xmm6, XMMWORD PTR [rsp+48]
+	lea r11, QWORD PTR [rsp+64]
+	mov rbx, QWORD PTR [r11+56]
+	mov rbp, QWORD PTR [r11+64]
+	mov rsi, QWORD PTR [r11+72]
+	movaps xmm8, XMMWORD PTR [r11-48]
+	movaps xmm7, XMMWORD PTR [rsp+32]
+	mov rsp, r11
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rdi
+	jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+	movd r9, xmm2
+	dec rdi
+	mov rdx, 4389456576511
+	mov rax, rdi
+	shr rdi, 19
+	shr rax, 20
+	mov rcx, rdi
+	sub rcx, rax
+	sub rcx, rdx
+	mov rdx, -4389456576512
+	add rax, rdx
+	imul rcx, rax
+	sub rcx, r9
+	adc rdi, 0
+	jmp sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp: