From 13fbb8a541db75484af7a457b2c892e7e0b5cbca Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 17 Sep 2018 09:16:06 +0200
Subject: [PATCH] asm compiler compatibility

- add separate asm include files for win64 (MASM) and linux (GAS)
- add separate cmake branches: ASM_MASM for MSVC, ASM for all other compilers
---
 CMakeLists.txt                                |  40 ++--
 .../cpu/crypto/asm/cryptonigh_v8_main_loop.S  |  22 ++-
 .../crypto/asm/cryptonigh_v8_main_loop.asm    |   8 +-
 ...yptonigh_v8_main_loop_ivybridge_linux.inc} |  22 +--
 ...ryptonigh_v8_main_loop_ivybridge_win64.inc | 176 ++++++++++++++++++
 ...> cryptonigh_v8_main_loop_ryzen_linux.inc} |  22 +--
 .../cryptonigh_v8_main_loop_ryzen_win64.inc   | 174 +++++++++++++++++
 7 files changed, 415 insertions(+), 49 deletions(-)
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge.inc => cryptonigh_v8_main_loop_ivybridge_linux.inc} (91%)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen.inc => cryptonigh_v8_main_loop_ryzen_linux.inc} (92%)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf43922..b51eb2a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -445,6 +445,26 @@ if(CMAKE_LINK_STATIC)
     endif()
 endif()
 
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    # MSVC: assemble the asm optimized monero v8 main loop from the MASM source (.asm)
+    enable_language(ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY LANGUAGE ASM_MASM)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm"
+    )
+else()
+    # other compilers: build the asm optimized monero v8 main loop from the preprocessed GAS source (.S), compiled as C as before this change
+    enable_language(ASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+    )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
 # compile C files
 file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
 
@@ -456,17 +476,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
 if(MICROHTTPD_ENABLE)
     target_link_libraries(xmr-stak-c ${MHTD})
 endif()
-target_link_libraries(xmr-stak-c ${LIBS})
-
-enable_language(ASM)
-set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
-# asm optimized monero v8 code
-add_library(xmr-stak-asm
-    STATIC
-    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
-)
-set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
-
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -509,7 +519,7 @@ if(CUDA_FOUND)
         )
     endif()
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
-    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile AMD backend
@@ -522,7 +532,7 @@ if(OpenCL_FOUND)
         ${OPENCLSRCFILES}
     )
     target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
-    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile final binary
@@ -538,7 +548,7 @@ endif()
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
 
 ################################################################################
 # Install
@@ -569,4 +579,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
 else()
     # this rule is used if the install prefix is the build directory
     install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")")
-endif()
+endif()
\ No newline at end of file
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
index cd747f7..736dac7 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
@@ -1,21 +1,27 @@
 #define ALIGN .align
 .intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
 .section .text
-.global cryptonigh_v8_mainloop_ivybridge_asm
-.global cryptonigh_v8_mainloop_ryzen_asm
+#endif
+.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm)
 
-ALIGN 64
-cryptonigh_v8_mainloop_ivybridge_asm:
+ALIGN 8
+FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ivybridge.inc"
+        #include "cryptonigh_v8_main_loop_ivybridge_linux.inc"
 	add rsp, 48
 	ret 0
 
-ALIGN 64
-cryptonigh_v8_mainloop_ryzen_asm:
+ALIGN 8
+FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ryzen.inc"
+        #include "cryptonigh_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
index 2101a59..7f2d6a5 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
@@ -2,15 +2,15 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
 PUBLIC cryptonigh_v8_mainloop_ryzen_asm
 
-ALIGN 64
+ALIGN 8
 cryptonigh_v8_mainloop_ivybridge_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ivybridge.inc
+        INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ivybridge_asm ENDP
 
-ALIGN 64
+ALIGN 8
 cryptonigh_v8_mainloop_ryzen_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ryzen.inc
+        INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ryzen_asm ENDP
 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
similarity index 91%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
index 1cc20b3..23f6cc0 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
@@ -49,8 +49,8 @@
 	movq	 xmm0, rcx
 	punpcklqdq xmm5, xmm0
 
-	ALIGN 64
-$main_loop_ivybridge:
+	ALIGN 8
+main_loop_ivybridge:
 	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 	lea	 rdx, QWORD PTR [r10+rbx]
 	mov	 ecx, r10d
@@ -105,10 +105,10 @@ $main_loop_ivybridge:
 	sqrtsd	 xmm3, xmm0
 	movq	 rdx, xmm3
 	test	 rdx, 524287
-	je	 $sqrt_fixup_ivybridge
+	je	 sqrt_fixup_ivybridge
 	psrlq	 xmm3, 19
 	psubq	 xmm3, XMMWORD PTR [rsp+16]
-$sqrt_fixup_ivybridge_ret:
+sqrt_fixup_ivybridge_ret:
 
 	mov	 r9, r10
 	mov	 rax, rdi
@@ -138,7 +138,7 @@ $sqrt_fixup_ivybridge_ret:
 	and	 r10d, 2097136
 	xor	 r11, r12
 	dec rsi
-	jne	 $main_loop_ivybridge
+	jne	 main_loop_ivybridge
 
 	ldmxcsr DWORD PTR [rsp]
 	mov	 rbx, QWORD PTR [rsp+160]
@@ -153,24 +153,24 @@ $sqrt_fixup_ivybridge_ret:
 	pop	 rdi
 	pop	 rsi
 	pop	 rbp
-	jmp $cnv2_main_loop_ivybridge_endp
+	jmp cnv2_main_loop_ivybridge_endp
 
-$sqrt_fixup_ivybridge:
+sqrt_fixup_ivybridge:
 	dec	 rdx
-	movq	 r13, -4389456576512
+	movq r13, -4389456576512
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	movq	 r13, 4389456576511
+	movq r13, 4389456576511
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
 	sub	 rcx, r9
 	adc	 rdx, 0
 	movq	 xmm3, rdx
-	jmp	 $sqrt_fixup_ivybridge_ret
+	jmp	 sqrt_fixup_ivybridge_ret
 
-$cnv2_main_loop_ivybridge_endp:
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
new file mode 100644
index 0000000..ee7f317
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,176 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movd xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+
+	ALIGN 8
+main_loop_ivybridge:
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movd	 rcx, xmm3
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	mov	 rax, rcx
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	shl	 rax, 32
+	xor	 rdi, rax
+	movd	 rbp, xmm6
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	mov	 r10, rbp
+	and	 r10d, 2097136
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r10+rbx]
+	lea	 r14, QWORD PTR [r10+rbx]
+	mov	 r12, QWORD PTR [r10+rbx+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movd	 rax, xmm0
+	div	 r9
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	movd	 rdx, xmm3
+	test	 rdx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov	 r9, r10
+	mov	 rax, rdi
+	mul	 rbp
+
+	xor	 r9, 16
+	mov	 rcx, r10
+	xor	 rcx, 32
+	xor	 r10, 48
+	add	 r8, rdx
+	add	 r11, rax
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov	 r10, r8
+	mov	 QWORD PTR [r14+8], r11
+	and	 r10d, 2097136
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+	dec	 rdx
+	mov  r13, -4389456576512
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	mov  r13, 4389456576511
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movd	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
similarity index 92%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
index c564d89..551ee85 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
@@ -45,8 +45,8 @@
 	movq	xmm0, rcx
 	punpcklqdq xmm4, xmm0
 
-	ALIGN 64
-$main_loop_ryzen:
+	ALIGN 8
+main_loop_ryzen:
 	movdqa	xmm5, XMMWORD PTR [r10+rbx]
 	movq	xmm0, r11
 	movq	xmm6, r8
@@ -103,10 +103,10 @@ $main_loop_ryzen:
 	sqrtsd	xmm1, xmm0
 	movq	rdi, xmm1
 	test	rdi, 524287
-	je	$sqrt_fixup_ryzen
+	je	sqrt_fixup_ryzen
 	shr	rdi, 19
 
-$sqrt_fixup_ryzen_ret:
+sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
 
@@ -136,7 +136,7 @@ $sqrt_fixup_ryzen_ret:
 	and	r10d, 2097136
 	movdqa	xmm3, xmm5
 	dec	ebp
-	jne	$main_loop_ryzen
+	jne	main_loop_ryzen
 
 	ldmxcsr DWORD PTR [rsp]
 	movaps	xmm6, XMMWORD PTR [rsp+48]
@@ -152,23 +152,23 @@ $sqrt_fixup_ryzen_ret:
 	pop	r13
 	pop	r12
 	pop	rdi
-	jmp $cnv2_main_loop_ryzen_endp
+	jmp cnv2_main_loop_ryzen_endp
 
-$sqrt_fixup_ryzen:
+sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec	rdi
-	movq	rdx, 4389456576511
+	movq rdx, 4389456576511
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	mov	rcx, rdi
 	sub	rcx, rax
 	sub	rcx, rdx
-	movq	rdx, -4389456576512
+	movq rdx, -4389456576512
 	add	rax, rdx
 	imul	rcx, rax
 	sub	rcx, r9
 	adc	rdi, 0
-	jmp	$sqrt_fixup_ryzen_ret
+	jmp	sqrt_fixup_ryzen_ret
 
-$cnv2_main_loop_ryzen_endp:
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
new file mode 100644
index 0000000..f70dcce
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,174 @@
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movd xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movd	xmm0, r11
+	movd	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movd	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movd	rax, xmm0
+
+	div	r9
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1
+	movd r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movd	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	xmm0, xmm4
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm0
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+	movd r9, xmm2
+	dec	rdi
+	mov rdx, 4389456576511
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	sub	rcx, rdx
+	mov rdx, -4389456576512
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
-- 
GitLab