diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf439227fe394bb46421614b2c98a01506b073b0..b51eb2ae4838d6dd4f4a0a3aa97e788cd0843e17 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -445,6 +445,26 @@ if(CMAKE_LINK_STATIC)
     endif()
 endif()
 
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    # asm optimized monero v8 code: MSVC assembles the MASM (.asm) variant
+    enable_language(ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY LANGUAGE ASM_MASM)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm"
+    )
+else()
+    # asm optimized monero v8 code: every other compiler builds the GAS (.S) variant
+    enable_language(ASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)  # the .S file uses cpp directives, so hand it to the C compiler driver
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+    )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)  # name the linker language explicitly for this assembly-only target
+
 # compile C files
 file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
 
@@ -456,17 +476,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
 if(MICROHTTPD_ENABLE)
     target_link_libraries(xmr-stak-c ${MHTD})
 endif()
-target_link_libraries(xmr-stak-c ${LIBS})
-
-enable_language(ASM)
-set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
-# asm optimized monero v8 code
-add_library(xmr-stak-asm
-    STATIC
-    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
-)
-set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
-
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)  # also link the static asm library created by add_library(xmr-stak-asm ...) above
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -509,7 +519,7 @@ if(CUDA_FOUND)
         )
     endif()
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
-    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile AMD backend
@@ -522,7 +532,7 @@ if(OpenCL_FOUND)
         ${OPENCLSRCFILES}
     )
     target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
-    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile final binary
@@ -538,7 +548,7 @@ endif()
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)  # the final binary links the asm static library directly as well
 
 ################################################################################
 # Install
@@ -569,4 +579,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
 else()
     # this rule is used if the install prefix is the build directory
     install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")")
-endif()
+endif()
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
index cd747f7c56cc5aaebfdb64b9cf263005b68fa327..736dac7de0234af9601f3a184ed52478ddf9289b 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
@@ -1,21 +1,27 @@
 #define ALIGN .align
 .intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn /* Apple builds: paste a leading underscore onto the exported name */
+.text
+#else
+#   define FN_PREFIX(fn) fn /* all other builds: use the name unchanged */
 .section .text
-.global cryptonigh_v8_mainloop_ivybridge_asm
-.global cryptonigh_v8_mainloop_ryzen_asm
+#endif
+.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm) /* export both main-loop entry points under the platform-specific name */
+.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm)
 
-ALIGN 64
-cryptonigh_v8_mainloop_ivybridge_asm:
+.p2align 6 /* 64-byte entry alignment; .p2align always takes a power of two, unlike .align whose unit differs between ELF and Mach-O */
+FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm): /* rdi is copied to rcx, the register the included body reads its context pointer from */
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ivybridge.inc"
+	#include "cryptonigh_v8_main_loop_ivybridge_linux.inc"
 	add rsp, 48
 	ret 0
 
-ALIGN 64
-cryptonigh_v8_mainloop_ryzen_asm:
+.p2align 6 /* 64-byte entry alignment; .p2align always takes a power of two, unlike .align whose unit differs between ELF and Mach-O */
+FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm): /* rdi is copied to rcx, the register the included body reads its context pointer from */
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ryzen.inc"
+	#include "cryptonigh_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
index 2101a59ce88b53e4c75b3e632ad25d232c1ec929..7f2d6a584af330a5ae0dfa59f265b0c8323010b0 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
@@ -2,15 +2,15 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
 PUBLIC cryptonigh_v8_mainloop_ryzen_asm
 
-ALIGN 64
+ALIGN 8 ; 8-byte entry alignment. NOTE(review): the enclosing segment is declared PAGE, so a larger value such as 64 should also be accepted - confirm 8 is intended
 cryptonigh_v8_mainloop_ivybridge_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ivybridge.inc
+        INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ivybridge_asm ENDP
 
-ALIGN 64
+ALIGN 8 ; 8-byte entry alignment. NOTE(review): the enclosing segment is declared PAGE, so a larger value such as 64 should also be accepted - confirm 8 is intended
 cryptonigh_v8_mainloop_ryzen_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ryzen.inc
+        INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ryzen_asm ENDP
 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
similarity index 91%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
index 1cc20b35aed070f5d2fe04aa3251304a02adff9a..23f6cc06069e436731c96055584a0c94eebca570 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
@@ -49,8 +49,8 @@
 	movq	 xmm0, rcx
 	punpcklqdq xmm5, xmm0
 
-	ALIGN 64
-$main_loop_ivybridge:
+	.p2align 6 /* keep the loop head on a 64-byte boundary on both ELF and Mach-O */
+main_loop_ivybridge:
 	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 	lea	 rdx, QWORD PTR [r10+rbx]
 	mov	 ecx, r10d
@@ -105,10 +105,10 @@ $main_loop_ivybridge:
 	sqrtsd	 xmm3, xmm0
 	movq	 rdx, xmm3
 	test	 rdx, 524287
-	je	 $sqrt_fixup_ivybridge
+	je	 sqrt_fixup_ivybridge
 	psrlq	 xmm3, 19
 	psubq	 xmm3, XMMWORD PTR [rsp+16]
-$sqrt_fixup_ivybridge_ret:
+sqrt_fixup_ivybridge_ret:
 
 	mov	 r9, r10
 	mov	 rax, rdi
@@ -138,7 +138,7 @@ $sqrt_fixup_ivybridge_ret:
 	and	 r10d, 2097136
 	xor	 r11, r12
 	dec rsi
-	jne	 $main_loop_ivybridge
+	jne	 main_loop_ivybridge
 
 	ldmxcsr DWORD PTR [rsp]
 	mov	 rbx, QWORD PTR [rsp+160]
@@ -153,24 +153,24 @@ $sqrt_fixup_ivybridge_ret:
 	pop	 rdi
 	pop	 rsi
 	pop	 rbp
-	jmp $cnv2_main_loop_ivybridge_endp
+	jmp cnv2_main_loop_ivybridge_endp
 
-$sqrt_fixup_ivybridge:
+sqrt_fixup_ivybridge:
 	dec	 rdx
-	movq	 r13, -4389456576512
+	movq r13, -4389456576512
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	movq	 r13, 4389456576511
+	movq r13, 4389456576511
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
 	sub	 rcx, r9
 	adc	 rdx, 0
 	movq	 xmm3, rdx
-	jmp	 $sqrt_fixup_ivybridge_ret
+	jmp	 sqrt_fixup_ivybridge_ret
 
-$cnv2_main_loop_ivybridge_endp:
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ee7f3171633834ec1684a0908586da8381abbbb7
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,176 @@
+	mov	 QWORD PTR [rsp+24], rbx  ; stash rbx above the return address (presumably Win64 home space); reloaded from [rsp+160] = 24 + 7*8 + 80 in the epilogue
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80  ; locals: [rsp] saved MXCSR, [rsp+4] working MXCSR, [rsp+16] zeroed qword, [rsp+32]/[rsp+48]/[rsp+64] xmm8/xmm7/xmm6
+
+	stmxcsr DWORD PTR [rsp]  ; remember the caller's MXCSR; restored by ldmxcsr in the epilogue
+	mov DWORD PTR [rsp+4], 24448  ; 24448 = 0x5F80. NOTE(review): per the MXCSR layout this looks like all exceptions masked with rounding control 10b - confirm
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx  ; keep the context pointer in r9 because rcx is overwritten further down
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288  ; iteration counter, decremented by "dec rsi" at the loop tail
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647  ; 0x80000001, OR'ed into the divisor inside the loop
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]  ; base pointer for every [reg+rbx] access in the loop - presumably the scratchpad, verify against caller
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6  ; preserve xmm6-xmm8; reloaded in the epilogue
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136  ; mask 0x1FFFF0: a 16-byte-aligned offset below 2 MiB
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax  ; zero the qword later subtracted from xmm3 by psubq in the loop
+
+	mov ax, 1023
+	shl rax, 52  ; rax = 1023 << 52 = 0x3FF0000000000000
+	movd xmm8, rax  ; xmm8 holds that constant; it is added before every sqrtsd
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+
+	ALIGN 8
+main_loop_ivybridge:  ; one iteration per pass; runs until the esi counter loaded before the loop reaches zero
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16  ; ecx/eax/r10d = the current offset XOR 16/32/48
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0  ; xmm7 = r11:r8 (r8 in the low qword)
+	aesenc	 xmm6, xmm7  ; one AES round on the loaded 16 bytes with xmm7 as round key
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0  ; the three 16-byte chunks are written back to different slots than they were read from
+	movd	 rcx, xmm3
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	mov	 rax, rcx
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	shl	 rax, 32
+	xor	 rdi, rax
+	movd	 rbp, xmm6
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	mov	 r10, rbp
+	and	 r10d, 2097136  ; next offset: low qword of the AES result masked with 0x1FFFF0
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r10+rbx]
+	lea	 r14, QWORD PTR [r10+rbx]
+	mov	 r12, QWORD PTR [r10+rbx+8]
+	xor	 edx, edx  ; clear the high half of the dividend for the div below
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d  ; r13d = 0x80000001, so the 32-bit divisor is never zero
+	movd	 rax, xmm0
+	div	 r9  ; rdx:rax (rdx = 0, rax = high qword of xmm6) divided by r9
+	mov	 eax, eax  ; keep only the low 32 bits of the quotient
+	shl	 rdx, 32
+	add	 rdx, rax  ; rdx = (remainder << 32) + low 32 bits of the quotient
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	movd	 rdx, xmm3
+	test	 rdx, 524287  ; 524287 = 0x7FFFF: are the low 19 bits of the sqrt bit pattern all zero?
+	je	 sqrt_fixup_ivybridge  ; if so, take the out-of-line fixup path
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:  ; the fixup path jumps back here
+
+	mov	 r9, r10
+	mov	 rax, rdi
+	mul	 rbp  ; rdx:rax = rdi * rbp (unsigned 64x64 -> 128)
+
+	xor	 r9, 16
+	mov	 rcx, r10
+	xor	 rcx, 32
+	xor	 r10, 48
+	add	 r8, rdx
+	add	 r11, rax
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov	 r10, r8
+	mov	 QWORD PTR [r14+8], r11
+	and	 r10d, 2097136  ; offset for the next iteration, masked with 0x1FFFF0
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]  ; restore the MXCSR value saved by stmxcsr in the prologue
+	mov	 rbx, QWORD PTR [rsp+160]  ; rbx was stored at entry [rsp+24]; 24 + 7*8 + 80 = 160
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp  ; skip over the out-of-line fixup code below
+
+sqrt_fixup_ivybridge:  ; reached from the loop when the low 19 bits of rdx are zero; r9 still holds rdx+rbp from the lea before the sqrt
+	dec	 rdx
+	mov  r13, -4389456576512  ; = -(1022 << 32); r13 is used as a scratch register here
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	mov  r13, 4389456576511  ; = (1022 << 32) - 1
+	sub	 rcx, r13
+	mov	 r13d, -2147483647  ; put the 0x80000001 divisor mask back into r13d for the loop
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0  ; add the borrow from the subtraction above to rdx
+	movd	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:  ; end label; execution continues with whatever follows this include
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
similarity index 92%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
index c564d8949f783eafc3b55874cb4044140febe257..551ee85734e032268b8415fc889b74bb1fefcf3b 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
@@ -45,8 +45,8 @@
 	movq	xmm0, rcx
 	punpcklqdq xmm4, xmm0
 
-	ALIGN 64
-$main_loop_ryzen:
+	.p2align 6 /* keep the loop head on a 64-byte boundary on both ELF and Mach-O */
+main_loop_ryzen:
 	movdqa	xmm5, XMMWORD PTR [r10+rbx]
 	movq	xmm0, r11
 	movq	xmm6, r8
@@ -103,10 +103,10 @@ $main_loop_ryzen:
 	sqrtsd	xmm1, xmm0
 	movq	rdi, xmm1
 	test	rdi, 524287
-	je	$sqrt_fixup_ryzen
+	je	sqrt_fixup_ryzen
 	shr	rdi, 19
 
-$sqrt_fixup_ryzen_ret:
+sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
 
@@ -136,7 +136,7 @@ $sqrt_fixup_ryzen_ret:
 	and	r10d, 2097136
 	movdqa	xmm3, xmm5
 	dec	ebp
-	jne	$main_loop_ryzen
+	jne	main_loop_ryzen
 
 	ldmxcsr DWORD PTR [rsp]
 	movaps	xmm6, XMMWORD PTR [rsp+48]
@@ -152,23 +152,23 @@ $sqrt_fixup_ryzen_ret:
 	pop	r13
 	pop	r12
 	pop	rdi
-	jmp $cnv2_main_loop_ryzen_endp
+	jmp cnv2_main_loop_ryzen_endp
 
-$sqrt_fixup_ryzen:
+sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec	rdi
-	movq	rdx, 4389456576511
+	movq rdx, 4389456576511
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	mov	rcx, rdi
 	sub	rcx, rax
 	sub	rcx, rdx
-	movq	rdx, -4389456576512
+	movq rdx, -4389456576512
 	add	rax, rdx
 	imul	rcx, rax
 	sub	rcx, r9
 	adc	rdi, 0
-	jmp	$sqrt_fixup_ryzen_ret
+	jmp	sqrt_fixup_ryzen_ret
 
-$cnv2_main_loop_ryzen_endp:
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
new file mode 100644
index 0000000000000000000000000000000000000000..f70dccef80732cd97bf0e31967f0db50a015ed78
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,174 @@
+	mov	QWORD PTR [rsp+16], rbx  ; stash rbx/rbp/rsi above the return address (presumably Win64 home space); reloaded in the epilogue
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64  ; locals: [rsp] saved MXCSR, [rsp+4] working MXCSR, [rsp+16]/[rsp+32]/[rsp+48] xmm8/xmm7/xmm6
+
+	stmxcsr DWORD PTR [rsp]  ; remember the caller's MXCSR; restored by ldmxcsr in the epilogue
+	mov DWORD PTR [rsp+4], 24448  ; 24448 = 0x5F80. NOTE(review): per the MXCSR layout this looks like all exceptions masked with rounding control 10b - confirm
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx  ; keep the context pointer in r9 because rcx is overwritten further down
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288  ; iteration counter, decremented by "dec ebp" at the loop tail
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]  ; base pointer for every [reg+rbx] access in the loop - presumably the scratchpad, verify against caller
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136  ; mask 0x1FFFF0: a 16-byte-aligned offset below 2 MiB
+	movaps	XMMWORD PTR [rsp+48], xmm6  ; preserve xmm6-xmm8; reloaded in the epilogue
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8  ; xmm8 = 0 for the whole loop
+	mov ax, 1023
+	shl rax, 52  ; only the low 12 bits survive the shift, so rax = 1023 << 52 = 0x3FF0000000000000
+	movd xmm7, rax  ; xmm7 holds that constant; it is added before every sqrtsd
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:  ; one iteration per pass; runs until the ebp counter loaded before the loop reaches zero
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movd	xmm0, r11
+	movd	xmm6, r8
+	punpcklqdq xmm6, xmm0  ; xmm6 = r11:r8 (r8 in the low qword)
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16  ; ecx/eax/r10d = the current offset XOR 16/32/48
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6  ; one AES round on the loaded 16 bytes with xmm6 as round key
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0  ; the three 16-byte chunks are written back to different slots than they were read from
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movd	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136  ; next offset: low qword of the AES result masked with 0x1FFFF0
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647  ; OR with 0x80000001, so the 32-bit divisor is never zero
+	xor	edx, edx  ; clear the high half of the dividend for the div below
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movd	rax, xmm0
+
+	div	r9  ; rdx:rax (rdx = 0, rax = high qword of xmm5) divided by r9
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1  ; low qword of xmm0 = (low 32 bits of remainder << 32) | low 32 bits of quotient
+	movd r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0  ; keep the pre-shift sum; the fixup path reads it back from xmm2
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movd	rdi, xmm1
+	test	rdi, 524287  ; 524287 = 0x7FFFF: are the low 19 bits of the sqrt bit pattern all zero?
+	je	sqrt_fixup_ryzen  ; if so, take the out-of-line fixup path
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:  ; the fixup path jumps back here
+	mov	rax, rsi
+	mul	r14  ; rdx:rax = rsi * r14 (unsigned 64x64 -> 128)
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	xmm0, xmm4
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm0
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136  ; offset for the next iteration, masked with 0x1FFFF0
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]  ; restore the MXCSR value saved by stmxcsr in the prologue
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]  ; r11 = stack pointer value just above the 64-byte local area
+	mov	rbx, QWORD PTR [r11+56]  ; rbx/rbp/rsi were stored at entry [rsp+16]/[rsp+24]/[rsp+32]; 16 + 5*8 = 56
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]  ; [r11-48] is [rsp+16], where xmm8 was saved
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11  ; release the local area
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp  ; skip over the out-of-line fixup code below
+
+sqrt_fixup_ryzen:  ; reached from the loop when the low 19 bits of rdi are zero
+	movd r9, xmm2  ; xmm2 holds the pre-shift sum saved in the loop
+	dec	rdi
+	mov rdx, 4389456576511  ; = (1022 << 32) - 1
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	sub	rcx, rdx
+	mov rdx, -4389456576512  ; = -(1022 << 32)
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0  ; add the borrow from the subtraction above to rdi
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:  ; end label; execution continues with whatever follows this include