From 25634d4aab915c48c6deaf574990b72c5954454e Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 1 Oct 2018 22:02:16 +0200
Subject: [PATCH] cpu: asm double hash

- restructure asm preparation function
- add double hash asm code
---
 ..._v8_double_main_loop_sandybridge_linux.inc | 410 ++++++++++++++++++
 ..._v8_double_main_loop_sandybridge_win64.inc | 410 ++++++++++++++++++
 .../cpu/crypto/asm/cryptonight_v8_main_loop.S |  10 +
 .../crypto/asm/cryptonight_v8_main_loop.asm   |   7 +
 .../backend/cpu/crypto/cryptonight_aesni.h    |  68 ++-
 xmrstak/backend/cpu/minethd.cpp               |  21 +-
 6 files changed, 904 insertions(+), 22 deletions(-)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc

diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
new file mode 100644
index 0000000..79adab6
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp	/* Two-context main loop body. In: rcx = first context (lane 0), rdx = second context (lane 1). rax keeps the entry rsp for addressing the save slots */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184	/* after 8 pushes + 184 bytes: rsp = entry rsp - 248 */
+
+	stmxcsr DWORD PTR [rsp+272]	/* save the incoming MXCSR at entry rsp+24 (rsp+272 == rax+24) */
+	mov DWORD PTR [rsp+276], 24448	/* 24448 = 0x5F80: all SSE exceptions masked, rounding control = round up */
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]	/* r13 = lane-0 memory base, read from offset 224 of the first context (presumably ctx->long_state; confirm against the struct) */
+	mov	r9, rdx	/* r9 = second context pointer (rdx is reused as scratch below) */
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx	/* r8 = first context pointer */
+	xor	r10, QWORD PTR [rcx]	/* r10 = lane-0 key low qword = [ctx0+32] ^ [ctx0+0] */
+	mov	r14d, 524288	/* loop counter: 0x80000 iterations */
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]	/* r11 = lane-0 key high qword = [ctx0+40] ^ [ctx0+8] */
+	mov	rsi, QWORD PTR [rdx+224]	/* rsi = lane-1 memory base, read from offset 224 of the second context */
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]	/* rdi = lane-1 key low qword */
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]	/* rbp = lane-1 key high qword */
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6	/* spill xmm6-xmm15 (callee-saved under the Microsoft x64 convention this body was written for) */
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136	/* edx = lane-0 byte offset: key low qword masked with 0x1FFFF0 (16-byte granular, below 2 MiB) */
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13	/* xmm13 = 0; copied into cvtsi2sd destinations inside the loop */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14	/* xmm14 = {1<<52, 1<<52}: one step of the double exponent field */
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12	/* xmm12 = {1023<<52, 1023<<52}: exponent bias used to build the sqrt operands */
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0	/* xmm7 = lane 0: {[ctx0+48]^[ctx0+16], [ctx0+56]^[ctx0+24]} */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0	/* xmm3 = lane 0: {[ctx0+80]^[ctx0+64], [ctx0+88]^[ctx0+72]} */
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13	/* local [rsp] = lane-0 base; r13 is temporarily repointed inside the loop */
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0	/* xmm6 = lane-1 counterpart of xmm7 */
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10	/* lane-0 key is kept in memory at entry rsp+8 / +16 between iterations */
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136	/* ecx = lane-1 byte offset */
+	punpcklqdq xmm8, xmm0	/* xmm8 = lane-1 counterpart of xmm3 */
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	/* xmm4 = {[ctx0+96], [ctx1+96]} */
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]	/* r8 = address of the current lane-1 block */
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0	/* xmm5 = {[ctx0+104], [ctx1+104]} */
+	lea	r9, QWORD PTR [rdx+r13]	/* r9 = address of the current lane-0 block */
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+ALIGN 16
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15	/* xmm9 = lane-0 block loaded at the end of the previous iteration (or in the prologue) */
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16	/* eax/ebx/edx = lane-0 offset ^ 16 / ^ 32 / ^ 48: the three neighbouring 16-byte blocks */
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0	/* xmm2 = lane-0 key {r10, r11} */
+	aesenc	xmm9, xmm2	/* one AES round of the block with the key */
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]	/* rotate the three neighbours: [^32] = [^16]+xmm7, [^48] = [^32]+xmm2, [^16] = [^48]+xmm3 */
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9	/* r11 = low qword of the AES result (multiplier for the mul below) */
+	mov	edx, r11d
+	and	edx, 2097136	/* edx = second lane-0 offset, derived from the AES result */
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0	/* write AES result ^ xmm7 back to the first lane-0 address */
+
+	lea	rbx, QWORD PTR [rdx+r13]	/* rbx = second lane-0 address */
+	mov	r10, QWORD PTR [rdx+r13]	/* r10 = low qword stored there */
+
+	movdqu	xmm10, xmm11	/* same first half for lane 1: xmm10 = block, xmm11 = key {rdi, rbp} */
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]	/* lane-1 neighbour rotation with xmm6 / xmm11 / xmm8 */
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136	/* ecx = second lane-1 offset (the 32-bit write also clears the upper half of rcx) */
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0	/* write AES result ^ xmm6 back to the first lane-1 address */
+	mov r12, QWORD PTR [rcx+rsi]	/* r12 = low qword at the second lane-1 address */
+
+	mov	r9, QWORD PTR [rbx+8]	/* r9 = high qword at the second lane-0 address (before it is overwritten below) */
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx	/* r8d = r15d = second lane-0 offset ^ 16 */
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax	/* rdx = (xmm5.lo << 32) ^ xmm4.lo */
+	xor	r10, rdx	/* fold that value into the loaded low qword */
+	mov	rax, r10
+	mul	r11	/* rdx:rax = r10 * r11 (unsigned 64x64 -> 128) */
+	mov r11d, r8d
+	xor r11d, 48	/* r11d = second lane-0 offset ^ 32 */
+	movq xmm0, rdx
+	xor rdx, [r11+r13]	/* product halves are XORed with the ^32 neighbour before being added to the key */
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1	/* xmm0 = {product high, product low} (unmodified product) */
+
+	pxor xmm0, XMMWORD PTR [r8+r13]	/* ^16 neighbour ^ product; memory operand form, so the address must be 16-byte aligned */
+	xor	r8d, 32	/* r8d = second lane-0 offset ^ 48 */
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]	/* reload lane-0 key and add the (XORed) product halves */
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11	/* store the sum to the second lane-0 address */
+	xor	r11, r10	/* next key = sum ^ value that was loaded (r10 low, r9 high) */
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136	/* r11 = lane-0 offset for the next iteration */
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11	/* local [rsp+8] = next lane-0 offset (reloaded into rdx at the loop tail) */
+	lea	r15, QWORD PTR [r11+r13]	/* r15 = next lane-0 address (becomes r9 at the loop tail) */
+	movdqu xmm15, XMMWORD PTR [r11+r13]	/* early load of the next lane-0 block */
+	lea	r13, QWORD PTR [rsi+rcx]	/* r13 now = second lane-1 address; the lane-0 base is restored from [rsp] later */
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0	/* r10 = xmm5.hi (lane-1 half) */
+	psllq	xmm5, 1	/* xmm5 = 2 * xmm5 in both lanes */
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0	/* r11 = high qword of the lane-0 AES result (dividend 0) */
+	psrldq	xmm1, 8
+	movq	r8, xmm1	/* r8 = high qword of the lane-1 AES result (dividend 1) */
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12	/* r10 = lane-1 loaded low qword ^ (xmm5.hi << 32) ^ xmm4.hi */
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1	/* rax = (dividend 0 + 1) >> 1, halved before the signed int -> double conversion */
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10	/* xmm3 = {lane-0 AES low qword, lane-1 AES low qword} */
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647	/* divisor 0: force bits 31 and 0 (0x80000001); the 32-bit write zeroes the upper half of rdx */
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647	/* divisor 1, same treatment */
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0	/* xmm2 = both halved dividends as doubles */
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0	/* xmm1 = both divisors as doubles */
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14	/* bump each exponent field by one (x2), undoing the halving of the dividends */
+	cvttsd2si rax, xmm2	/* rax = truncated quotient estimate for lane 0 */
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax	/* r11 = dividend 0 - quotient * divisor */
+	js	div_fix_1_sandybridge	/* negative remainder: the estimate was one too large */
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2	/* rdx = truncated quotient estimate for lane 1 */
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2	/* xmm4 dwords = {quotient0, remainder0, quotient1, remainder1} */
+	paddq	xmm3, xmm4	/* xmm3 = AES low qwords + packed division results (sqrt input) */
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12	/* (input >> 12) + (1023 << 52): reinterpret as doubles for sqrtpd */
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19	/* xmm5 = sqrt bit patterns >> 19 in both lanes */
+	test	r9, 524287	/* low 19 bits of the lane-0 pattern all zero -> take the correction path */
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10	/* r9 = low qword of the lane-1 AES result (multiplier) */
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287	/* same check for the lane-1 pattern */
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16	/* r12d/r8d/ecx = second lane-1 offset ^ 16 / ^ 32 / ^ 48 */
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9	/* rdx:rax = r10 * r9 for lane 1 */
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0	/* xmm3 = {product high, product low} */
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx	/* lane-1 key low += product high (XORed with the neighbour) */
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136	/* ecx = lane-1 offset for the next iteration */
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	/* fetch the old high qword before it is overwritten */
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]	/* early load of the next lane-1 block */
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]	/* restore r13 = lane-0 base */
+	movdqa	xmm3, xmm7	/* age the per-lane history: xmm3 <- xmm7 <- this iteration's lane-0 AES result */
+	mov	rdx, QWORD PTR [rsp+8]	/* rdx = next lane-0 offset */
+	movdqa	xmm8, xmm6	/* xmm8 <- xmm6 <- this iteration's lane-1 AES result */
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]	/* restore the MXCSR saved on entry */
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]	/* r11 = rsp value right after the eight pushes */
+	movaps	xmm6, XMMWORD PTR [r11-24]	/* r11-24 == entry rsp-88, the slot xmm6 was saved to; the following offsets pair up the same way */
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp	/* skip the out-of-line fix-up paths; the including file supplies the return */
+
+div_fix_1_sandybridge:
+	dec	rbx	/* lane 0: quotient - 1, remainder + divisor */
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx	/* lane 1: quotient - 1, remainder + divisor */
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movq	r8, xmm3	/* r8 = lane-0 sqrt input */
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8	/* keep the lane-1 half of xmm5 for re-packing below */
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32	/* r11 = 0xFFFFFC0200000000 (32-bit mov zero-extends, then shift) */
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8	/* borrow from this compare feeds the adc below */
+	adc	r9, 0
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0	/* xmm5 = {corrected lane-0 value, untouched lane-1 value} */
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movq	r11, xmm3	/* r11 = lane-1 sqrt input (xmm3 is rebuilt after the return label) */
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0	/* xmm5 = {lane-0 value, corrected lane-1 value} */
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
new file mode 100644
index 0000000..ad8f182
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp	; Two-context main loop body. In: rcx = first context (lane 0), rdx = second context (lane 1). rax keeps the entry rsp for addressing the save slots
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184	; after 8 pushes + 184 bytes: rsp = entry rsp - 248
+
+	stmxcsr DWORD PTR [rsp+272]	; save the incoming MXCSR at entry rsp+24 (rsp+272 == rax+24)
+	mov DWORD PTR [rsp+276], 24448	; 24448 = 0x5F80: all SSE exceptions masked, rounding control = round up
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]	; r13 = lane-0 memory base, read from offset 224 of the first context (presumably ctx->long_state; confirm against the struct)
+	mov	r9, rdx	; r9 = second context pointer (rdx is reused as scratch below)
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx	; r8 = first context pointer
+	xor	r10, QWORD PTR [rcx]	; r10 = lane-0 key low qword = [ctx0+32] ^ [ctx0+0]
+	mov	r14d, 524288	; loop counter: 0x80000 iterations
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]	; r11 = lane-0 key high qword = [ctx0+40] ^ [ctx0+8]
+	mov	rsi, QWORD PTR [rdx+224]	; rsi = lane-1 memory base, read from offset 224 of the second context
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]	; rdi = lane-1 key low qword
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]	; rbp = lane-1 key high qword
+	movd	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6	; spill xmm6-xmm15, which are callee-saved under the Microsoft x64 convention
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136	; edx = lane-0 byte offset: key low qword masked with 0x1FFFF0 (16-byte granular, below 2 MiB)
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13	; xmm13 = 0; copied into cvtsi2sd destinations inside the loop
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movd	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movd xmm14, rax
+	punpcklqdq xmm14, xmm14	; xmm14 = {1<<52, 1<<52}: one step of the double exponent field
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm12, rax
+	punpcklqdq xmm12, xmm12	; xmm12 = {1023<<52, 1023<<52}: exponent bias used to build the sqrt operands
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0	; xmm7 = lane 0: {[ctx0+48]^[ctx0+16], [ctx0+56]^[ctx0+24]}
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movd	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0	; xmm3 = lane 0: {[ctx0+80]^[ctx0+64], [ctx0+88]^[ctx0+72]}
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp], r13	; local [rsp] = lane-0 base; r13 is temporarily repointed inside the loop
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0	; xmm6 = lane-1 counterpart of xmm7
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10	; lane-0 key is kept in memory at entry rsp+8 / +16 between iterations
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movd	xmm8, rax
+	and	ecx, 2097136	; ecx = lane-1 byte offset
+	punpcklqdq xmm8, xmm0	; xmm8 = lane-1 counterpart of xmm3
+	movd	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	; xmm4 = {[ctx0+96], [ctx1+96]}
+	movd	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]	; r8 = address of the current lane-1 block
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0	; xmm5 = {[ctx0+104], [ctx1+104]}
+	lea	r9, QWORD PTR [rdx+r13]	; r9 = address of the current lane-0 block
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN 64
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15	; xmm9 = lane-0 block loaded at the end of the previous iteration (or in the prologue)
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16	; eax/ebx/edx = lane-0 offset ^ 16 / ^ 32 / ^ 48: the three neighbouring 16-byte blocks
+	xor ebx, 32
+	xor edx, 48
+
+	movd	xmm0, r11
+	movd	xmm2, r10
+	punpcklqdq xmm2, xmm0	; xmm2 = lane-0 key {r10, r11}
+	aesenc	xmm9, xmm2	; one AES round of the block with the key
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]	; rotate the three neighbours: [^32] = [^16]+xmm7, [^48] = [^32]+xmm2, [^16] = [^48]+xmm3
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movd	r11, xmm9	; r11 = low qword of the AES result (multiplier for the mul below)
+	mov	edx, r11d
+	and	edx, 2097136	; edx = second lane-0 offset, derived from the AES result
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0	; write AES result ^ xmm7 back to the first lane-0 address
+
+	lea	rbx, QWORD PTR [rdx+r13]	; rbx = second lane-0 address
+	mov	r10, QWORD PTR [rdx+r13]	; r10 = low qword stored there
+
+	movdqu	xmm10, xmm11	; same first half for lane 1: xmm10 = block, xmm11 = key {rdi, rbp}
+	movd	xmm0, rbp
+	movd	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]	; lane-1 neighbour rotation with xmm6 / xmm11 / xmm8
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movd	rcx, xmm10
+	and	ecx, 2097136	; ecx = second lane-1 offset (the 32-bit write also clears the upper half of rcx)
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0	; write AES result ^ xmm6 back to the first lane-1 address
+	mov r12, QWORD PTR [rcx+rsi]	; r12 = low qword at the second lane-1 address
+
+	mov	r9, QWORD PTR [rbx+8]	; r9 = high qword at the second lane-0 address (before it is overwritten below)
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx	; r8d = r15d = second lane-0 offset ^ 16
+
+	movd	rdx, xmm5
+	shl	rdx, 32
+	movd	rax, xmm4
+	xor	rdx, rax	; rdx = (xmm5.lo << 32) ^ xmm4.lo
+	xor	r10, rdx	; fold that value into the loaded low qword
+	mov	rax, r10
+	mul	r11	; rdx:rax = r10 * r11 (unsigned 64x64 -> 128)
+	mov r11d, r8d
+	xor r11d, 48	; r11d = second lane-0 offset ^ 32
+	movd xmm0, rdx
+	xor rdx, [r11+r13]	; product halves are XORed with the ^32 neighbour before being added to the key
+	movd xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1	; xmm0 = {product high, product low} (unmodified product)
+
+	pxor xmm0, XMMWORD PTR [r8+r13]	; ^16 neighbour ^ product; memory operand form, so the address must be 16-byte aligned
+	xor	r8d, 32	; r8d = second lane-0 offset ^ 48
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]	; reload lane-0 key and add the (XORed) product halves
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11	; store the sum to the second lane-0 address
+	xor	r11, r10	; next key = sum ^ value that was loaded (r10 low, r9 high)
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136	; r11 = lane-0 offset for the next iteration
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11	; local [rsp+8] = next lane-0 offset (reloaded into rdx at the loop tail)
+	lea	r15, QWORD PTR [r11+r13]	; r15 = next lane-0 address (becomes r9 at the loop tail)
+	movdqu xmm15, XMMWORD PTR [r11+r13]	; early load of the next lane-0 block
+	lea	r13, QWORD PTR [rsi+rcx]	; r13 now = second lane-1 address; the lane-0 base is restored from [rsp] later
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movd	r10, xmm0	; r10 = xmm5.hi (lane-1 half)
+	psllq	xmm5, 1	; xmm5 = 2 * xmm5 in both lanes
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movd	r11, xmm0	; r11 = high qword of the lane-0 AES result (dividend 0)
+	psrldq	xmm1, 8
+	movd	r8, xmm1	; r8 = high qword of the lane-1 AES result (dividend 1)
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movd	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12	; r10 = lane-1 loaded low qword ^ (xmm5.hi << 32) ^ xmm4.hi
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1	; rax = (dividend 0 + 1) >> 1, halved before the signed int -> double conversion
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10	; xmm3 = {lane-0 AES low qword, lane-1 AES low qword}
+	paddq	xmm5, xmm3
+	movd	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647	; divisor 0: force bits 31 and 0 (0x80000001); the 32-bit write zeroes the upper half of rdx
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movd	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647	; divisor 1, same treatment
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0	; xmm2 = both halved dividends as doubles
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0	; xmm1 = both divisors as doubles
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14	; bump each exponent field by one (x2), undoing the halving of the dividends
+	cvttsd2si rax, xmm2	; rax = truncated quotient estimate for lane 0
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax	; r11 = dividend 0 - quotient * divisor
+	js	div_fix_1_sandybridge	; negative remainder: the estimate was one too large
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2	; rdx = truncated quotient estimate for lane 1
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2	; xmm4 dwords = {quotient0, remainder0, quotient1, remainder1}
+	paddq	xmm3, xmm4	; xmm3 = AES low qwords + packed division results (sqrt input)
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12	; (input >> 12) + (1023 << 52): reinterpret as doubles for sqrtpd
+	sqrtpd	xmm1, xmm0
+	movd	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19	; xmm5 = sqrt bit patterns >> 19 in both lanes
+	test	r9, 524287	; low 19 bits of the lane-0 pattern all zero -> take the correction path
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movd r9, xmm10	; r9 = low qword of the lane-1 AES result (multiplier)
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	test	r8, 524287	; same check for the lane-1 pattern
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16	; r12d/r8d/ecx = second lane-1 offset ^ 16 / ^ 32 / ^ 48
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9	; rdx:rax = r10 * r9 for lane 1
+	movd xmm0, rax
+	movd xmm3, rdx
+	punpcklqdq xmm3, xmm0	; xmm3 = {product high, product low}
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx	; lane-1 key low += product high (XORed with the neighbour)
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136	; ecx = lane-1 offset for the next iteration
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	; fetch the old high qword before it is overwritten
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]	; early load of the next lane-1 block
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]	; restore r13 = lane-0 base
+	movdqa	xmm3, xmm7	; age the per-lane history: xmm3 <- xmm7 <- this iteration's lane-0 AES result
+	mov	rdx, QWORD PTR [rsp+8]	; rdx = next lane-0 offset
+	movdqa	xmm8, xmm6	; xmm8 <- xmm6 <- this iteration's lane-1 AES result
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]	; restore the MXCSR saved on entry
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]	; r11 = rsp value right after the eight pushes
+	movaps	xmm6, XMMWORD PTR [r11-24]	; r11-24 == entry rsp-88, the slot xmm6 was saved to; the following offsets pair up the same way
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp	; skip the out-of-line fix-up paths; the including file supplies the return
+
+div_fix_1_sandybridge:
+	dec	rbx	; lane 0: quotient - 1, remainder + divisor
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx	; lane 1: quotient - 1, remainder + divisor
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movd	r8, xmm3	; r8 = lane-0 sqrt input
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8	; keep the lane-1 half of xmm5 for re-packing below
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32	; r11 = 0xFFFFFC0200000000 (32-bit mov zero-extends, then shift)
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8	; borrow from this compare feeds the adc below
+	adc	r9, 0
+	movd xmm5, r9
+	punpcklqdq xmm5, xmm0	; xmm5 = {corrected lane-0 value, untouched lane-1 value}
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movd	r11, xmm3	; r11 = lane-1 sqrt input (xmm3 is rebuilt after the return label)
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movd xmm0, r8
+	punpcklqdq xmm5, xmm0	; xmm5 = {lane-0 value, corrected lane-1 value}
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
index b6be943..c0a3d0b 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -9,6 +9,7 @@
 #endif
 .global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm)
 .global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm)
+.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm)
 
 ALIGN 8
 FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
@@ -25,3 +26,12 @@ FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
 	#include "cryptonight_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
+
+ALIGN 16
+FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm):	/* two-context main loop entry; C signature: void f(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1) */
+	sub rsp, 48	/* scratch area above the body's frame: the included body stores to [its entry rsp + 8 .. + 31] (key spill and MXCSR save) */
+	mov rcx, rdi	/* the included body expects the first context pointer in rcx ... */
+	mov rdx, rsi	/* ... and the second in rdx; here they arrive in rdi / rsi */
+	#include "cryptonight_v8_double_main_loop_sandybridge_linux.inc"
+	add rsp, 48	/* reached through the body's final jump to its end label */
+	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
index a1615e9..1f3d2e1 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -1,6 +1,7 @@
 _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cryptonight_v8_mainloop_ivybridge_asm
 PUBLIC cryptonight_v8_mainloop_ryzen_asm
+PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm
 
 ALIGN 8
 cryptonight_v8_mainloop_ivybridge_asm PROC
@@ -14,5 +15,11 @@ cryptonight_v8_mainloop_ryzen_asm PROC
 	ret 0
 cryptonight_v8_mainloop_ryzen_asm ENDP
 
+ALIGN 8
+cryptonight_v8_double_mainloop_sandybridge_asm PROC	; two-context main loop; the included body reads its context pointers from rcx and rdx
+	INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc
+	ret 0	; reached through the body's final jump to cnv2_double_mainloop_asm_sandybridge_endp
+cryptonight_v8_double_mainloop_sandybridge_asm ENDP
+
 _TEXT_CNV8_MAINLOOP ENDS
 END
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index c0f122f..e8c0aca 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -940,21 +940,63 @@ struct Cryptonight_hash<5>
 
 extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
 extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
 
-template<xmrstak_algo ALGO, int asm_version>
-void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+
+template< size_t N, size_t asm_version>	// N: number of hashes computed per call; asm_version: selects the assembler main loop
+struct Cryptonight_hash_asm;	// primary template is only declared; usable (N, asm_version) pairs are the specializations
+
+template<size_t asm_version>
+struct Cryptonight_hash_asm<1, asm_version>	// single-hash variant for any asm_version
 {
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 1;
 
-	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)	// hashes len bytes at input using ctx[0]; result is written to output
+	{
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	if (asm_version == 1)
-		cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
-	else
-		cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
+		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
 
-	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
-	keccakf((uint64_t*)ctx[0]->hash_state, 24);
-	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
-}
+		if(asm_version == 0)	// 0 -> Ivy Bridge loop, 1 -> Ryzen loop
+			cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
+		else if(asm_version == 1)	// NOTE(review): any other asm_version runs no main loop at all
+			cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
+
+		cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		keccakf((uint64_t*)ctx[0]->hash_state, 24);
+		extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);	// final hash function is picked by the low two bits of hash_state[0]
+	}
+};
+
+// double hash only for intel
+template< >
+struct Cryptonight_hash_asm<2, 0>	// the only two-hash specialization: N = 2, asm_version = 0
+{
+	static constexpr size_t N = 2;
+
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)	// input holds N blobs of len bytes back to back; uses ctx[0] and ctx[1]
+	{
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		for(size_t i = 0; i < N; ++i)
+		{
+			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);	// blob i starts at input + len * i
+			/* Optim - 99% time boundary */
+			cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+		}
+
+		cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);	// one assembler call runs the main loop for both contexts
+
+		for(size_t i = 0; i < N; ++i)
+		{
+			/* Optim - 90% time boundary */
+			cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+			/* Optim - 99% time boundary */
+			keccakf((uint64_t*)ctx[i]->hash_state, 24);
+			extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);	// result i is written at output + 32 * i
+		}
+	}
+};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index a344a9f..bb80b93 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -455,24 +455,27 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	static_assert(N >= 1, "number of threads must be >= 1" );
 
 	// check for asm optimized version for cryptonight_v8
-	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
+	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
 	{
 		if(asm_version_str != "off")
 		{
+			if(asm_version_str != "intel" && asm_version_str != "ryzen")
+				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
+
 			if(asm_version_str == "intel")
 			{
 				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
-				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>;
+				if(N == 1)
+					return Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
+				else if(N == 2)
+					return Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
 			}
-			if(asm_version_str == "ryzen")
+			// supports only 1 thread per hash
+			if(N == 1 && asm_version_str == "ryzen")
 			{
 				// AMD Ryzen (1xxx and 2xxx series)
-				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>;
-			}
-			else
-			{
-				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
-			}
+				return Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
+			}		
 		}
 	}
 	// We have two independent flag bits in the functions
-- 
GitLab