; -----------------------------------------------------------------------------
; Patch overview
;
; Adds four new files under xmrstak/backend/cpu/crypto/asm/:
;   cryptonigh_v8_main_loop.S              wrapper in ".intel_syntax noprefix"
;                                          form; pulls the bodies in with the
;                                          C preprocessor (#include)
;   cryptonigh_v8_main_loop.asm            wrapper in SEGMENT/PROC/ENDP form;
;                                          pulls the bodies in with INCLUDE
;   cryptonigh_v8_main_loop_ivybridge.inc  loop body, "ivybridge" variant
;   cryptonigh_v8_main_loop_ryzen.inc      loop body, "ryzen" variant
;
; Both wrappers export the same two symbols:
;   cryptonigh_v8_mainloop_ivybridge_asm
;   cryptonigh_v8_mainloop_ryzen_asm
;
; All commentary in this patch is placed outside the hunks (before each
; "diff --git" header) so that every "+" line and every "@@" line count stays
; exactly as generated.
;
; NOTE(review): "cryptonigh" (no trailing "t") is the spelling used in the file
; names and in the exported symbols. It is left untouched here because renaming
; would change the public interface.
; -----------------------------------------------------------------------------
;
; File: cryptonigh_v8_main_loop.S
;
; Each entry point does, in order:
;   sub rsp, 48    reserve 48 bytes of stack
;   mov rcx, rdi   copy rdi into rcx; the included body reads its input
;                  pointer from rcx
;   #include ...   the loop body
;   add rsp, 48    release the 48 bytes
;   ret 0
;
; The bodies store callee registers at [rsp+16], [rsp+24] and [rsp+32]
; (8 bytes each) BEFORE they adjust rsp themselves. With the 48 bytes reserved
; here, those stores land inside this function's own reservation.
; NOTE(review): presumably this wrapper adapts a System V caller (first
; argument in rdi) to bodies written for rcx - confirm against the C caller.
; Both entry points are preceded by ALIGN 64 (defined as .align).
;
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
new file mode 100644
index 0000000000000000000000000000000000000000..cd747f7c56cc5aaebfdb64b9cf263005b68fa327
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
@@ -0,0 +1,21 @@
+#define ALIGN .align
+.intel_syntax noprefix
+.section .text
+.global cryptonigh_v8_mainloop_ivybridge_asm
+.global cryptonigh_v8_mainloop_ryzen_asm
+
+ALIGN 64
+cryptonigh_v8_mainloop_ivybridge_asm:
+	sub rsp, 48
+	mov rcx, rdi
+	#include "cryptonigh_v8_main_loop_ivybridge.inc"
+	add rsp, 48
+	ret 0
+
+ALIGN 64
+cryptonigh_v8_mainloop_ryzen_asm:
+	sub rsp, 48
+	mov rcx, rdi
+	#include "cryptonigh_v8_main_loop_ryzen.inc"
+	add rsp, 48
+	ret 0
; -----------------------------------------------------------------------------
; File: cryptonigh_v8_main_loop.asm
;
; Same two entry points, declared PUBLIC inside the segment
; _TEXT_CNV8_MAINLOOP (PAGE READ EXECUTE), each preceded by ALIGN 64.
; Unlike the .S wrapper there is no stack reservation and no register copy:
; the included body runs with rcx and rsp exactly as they were on entry, and
; "ret 0" follows the body's final label directly.
;
; Consequence: the body's stores at [rsp+16], [rsp+24] and [rsp+32] are made
; relative to the entry rsp, i.e. above the return address.
; NOTE(review): presumably that is the caller-provided shadow space of the
; Microsoft x64 convention - confirm; nothing in this file reserves it.
; NOTE(review): the PROCs carry no unwind annotations although the bodies
; change rsp and non-volatile registers - confirm whether unwinding through
; these functions is ever required.
; -----------------------------------------------------------------------------
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
new file mode 100644
index 0000000000000000000000000000000000000000..2101a59ce88b53e4c75b3e632ad25d232c1ec929
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
@@ -0,0 +1,18 @@
+_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
+PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
+PUBLIC cryptonigh_v8_mainloop_ryzen_asm
+
+ALIGN 64
+cryptonigh_v8_mainloop_ivybridge_asm PROC
+	INCLUDE cryptonigh_v8_main_loop_ivybridge.inc
+	ret 0
+cryptonigh_v8_mainloop_ivybridge_asm ENDP
+
+ALIGN 64
+cryptonigh_v8_mainloop_ryzen_asm PROC
+	INCLUDE cryptonigh_v8_main_loop_ryzen.inc
+	ret 0
+cryptonigh_v8_mainloop_ryzen_asm ENDP
+
+_TEXT_CNV8_MAINLOOP ENDS
+END
; -----------------------------------------------------------------------------
; File: cryptonigh_v8_main_loop_ivybridge.inc
;
; This file is textually included by BOTH wrappers (#include in the .S file,
; INCLUDE in the .asm file). NOTE(review): no comment syntax accepted by both
; assemblers is evident, so the body is documented here rather than inline.
;
; Input
;   rcx  pointer to a block from which qwords at offsets 0..104 and 224 are
;        read. The qword at +224 becomes rbx, the base of every [reg+rbx]
;        access in the loop (presumably the scratchpad - confirm).
;
; Prologue / stack layout (offsets after "sub rsp, 80")
;   rbx is stored at entry [rsp+24]; rbp, rsi, rdi, r12-r15 are pushed
;   (7 x 8 = 56 bytes); 80 more bytes are reserved.
;   [rsp+0]   saved MXCSR (stmxcsr)
;   [rsp+4]   replacement MXCSR value 24448 (0x5F80), loaded with ldmxcsr
;   [rsp+16]  qword zero, written from rax after "xor eax, eax"
;   [rsp+32]  xmm8   [rsp+48]  xmm7   [rsp+64]  xmm6
;   rbx is reloaded from [rsp+160] = 56 + 80 + 24.
;
; Register roles inside $main_loop_ivybridge
;   rsi        iteration counter, starts at 524288 (0x80000), dec/jne
;   r10        current index, always masked with 2097136 (0x1FFFF0), so it is
;              a multiple of 16 below 2 MiB
;   r8, r11    low/high qword of the value combined into xmm7 for aesenc
;   xmm4, xmm5 two 128-bit values carried across iterations
;              (xmm5 <- xmm4, xmm4 <- xmm6 at the end of each iteration)
;   xmm3       carries the square-root result into the next iteration
;   r15        carries the division result into the next iteration
;   r13d       constant -2147483647 (0x80000001), OR-ed into the divisor so
;              that it can never be zero
;   xmm8       constant 1023 shl 52 (0x3FF0000000000000), added to the integer
;              before sqrtsd
;
; Per iteration, in order
;   1. Load 16 bytes at [r10+rbx] and aesenc them with xmm7.
;   2. Read the three 16-byte neighbours at index xor 16 / 32 / 48, add
;      xmm4 / xmm5 / xmm7 with paddq and store them back to permuted slots.
;   3. Store (aesenc result xor xmm4) to the original slot.
;   4. Derive the next index from the low qword of the aesenc result (rbp).
;   5. Divide the high qword of the aesenc result by r9 ("xor edx, edx" first);
;      pack the 32-bit quotient and remainder into rdx.
;   6. sqrtsd on ((rdx + rbp) shr 12) + xmm8. If the low 19 bits of the raw
;      result are all zero (test rdx, 524287), branch to $sqrt_fixup_ivybridge;
;      otherwise shift right by 19.
;   7. mul rbp, accumulate into r8/r11, repeat step 2 for the second index,
;      store r8/r11 to [r14], [r14+8].
;
; Notes on non-obvious instructions
;   * "psubq xmm3, XMMWORD PTR [rsp+16]" reads 16 bytes, but only the low
;     qword at [rsp+16] is zeroed by the visible code; [rsp+24..31] is never
;     written here. The high qword of xmm3 is therefore unspecified after this
;     instruction. The visible code only ever reads the low qword of xmm3
;     (movq), for which the subtraction of zero leaves the value unchanged.
;   * "lea r9d, DWORD PTR [ecx+ecx]" uses 32-bit address registers; only the
;     low 32 bits of the sum are kept.
;   * $sqrt_fixup_ivybridge temporarily overwrites r13 with 64-bit constants
;     and reloads r13d = -2147483647 before jumping back, so the loop's
;     divisor mask is intact on return. It expects r9 = rdx + rbp as set by
;     the "lea r9, QWORD PTR [rdx+rbp]" before the branch.
;
; Exit
;   MXCSR, rbx, xmm6-xmm8 and the pushed registers are restored, then control
;   jumps over the fixup code to $cnv2_main_loop_ivybridge_endp, the last line
;   of the file, and falls through into whatever the wrapper places after the
;   include.
; -----------------------------------------------------------------------------
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f799fd7b7f0769fbc1d6d6dbc06af1c63628d
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
@@ -0,0 +1,176 @@
+	mov QWORD PTR [rsp+24], rbx
+	push rbp
+	push rsi
+	push rdi
+	push r12
+	push r13
+	push r14
+	push r15
+	sub rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov rax, QWORD PTR [rcx+48]
+	mov r9, rcx
+	xor rax, QWORD PTR [rcx+16]
+	mov esi, 524288
+	mov r8, QWORD PTR [rcx+32]
+	mov r13d, -2147483647
+	xor r8, QWORD PTR [rcx]
+	mov r11, QWORD PTR [rcx+40]
+	mov r10, r8
+	mov rdx, QWORD PTR [rcx+56]
+	movq xmm4, rax
+	xor rdx, QWORD PTR [rcx+24]
+	xor r11, QWORD PTR [rcx+8]
+	mov rbx, QWORD PTR [rcx+224]
+	mov rax, QWORD PTR [r9+80]
+	xor rax, QWORD PTR [r9+64]
+	movq xmm0, rdx
+	mov rcx, QWORD PTR [rcx+88]
+	xor rcx, QWORD PTR [r9+72]
+	movq xmm3, QWORD PTR [r9+104]
+	movaps XMMWORD PTR [rsp+64], xmm6
+	movaps XMMWORD PTR [rsp+48], xmm7
+	movaps XMMWORD PTR [rsp+32], xmm8
+	and r10d, 2097136
+	movq xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq xmm0, rcx
+	punpcklqdq xmm5, xmm0
+
+	ALIGN 64
+$main_loop_ivybridge:
+	movdqu xmm6, XMMWORD PTR [r10+rbx]
+	lea rdx, QWORD PTR [r10+rbx]
+	mov ecx, r10d
+	mov eax, r10d
+	mov rdi, r15
+	xor ecx, 16
+	xor eax, 32
+	xor r10d, 48
+	movq xmm0, r11
+	movq xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc xmm6, xmm7
+	movdqu xmm1, XMMWORD PTR [rax+rbx]
+	movdqu xmm0, XMMWORD PTR [r10+rbx]
+	paddq xmm1, xmm7
+	movdqu xmm2, XMMWORD PTR [rcx+rbx]
+	paddq xmm0, xmm5
+	paddq xmm2, xmm4
+	movdqu XMMWORD PTR [rcx+rbx], xmm0
+	movq rcx, xmm3
+	movdqu XMMWORD PTR [rax+rbx], xmm2
+	mov rax, rcx
+	movdqu XMMWORD PTR [r10+rbx], xmm1
+	shl rax, 32
+	xor rdi, rax
+	movq rbp, xmm6
+	movdqa xmm0, xmm6
+	pxor xmm0, xmm4
+	mov r10, rbp
+	and r10d, 2097136
+	movdqu XMMWORD PTR [rdx], xmm0
+	xor rdi, QWORD PTR [r10+rbx]
+	lea r14, QWORD PTR [r10+rbx]
+	mov r12, QWORD PTR [r10+rbx+8]
+	xor edx, edx
+	lea r9d, DWORD PTR [ecx+ecx]
+	add r9d, ebp
+	movdqa xmm0, xmm6
+	psrldq xmm0, 8
+	or r9d, r13d
+	movq rax, xmm0
+	div r9
+	mov eax, eax
+	shl rdx, 32
+	add rdx, rax
+	lea r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov rax, r9
+	shr rax, 12
+	movq xmm0, rax
+	paddq xmm0, xmm8
+	sqrtsd xmm3, xmm0
+	movq rdx, xmm3
+	test rdx, 524287
+	je $sqrt_fixup_ivybridge
+	psrlq xmm3, 19
+	psubq xmm3, XMMWORD PTR [rsp+16]
+$sqrt_fixup_ivybridge_ret:
+
+	mov r9, r10
+	mov rax, rdi
+	mul rbp
+
+	xor r9, 16
+	mov rcx, r10
+	xor rcx, 32
+	xor r10, 48
+	add r8, rdx
+	add r11, rax
+	movdqu xmm0, XMMWORD PTR [r10+rbx]
+	movdqu xmm2, XMMWORD PTR [r9+rbx]
+	paddq xmm0, xmm5
+	movdqu xmm1, XMMWORD PTR [rcx+rbx]
+	paddq xmm2, xmm4
+	paddq xmm1, xmm7
+	movdqa xmm5, xmm4
+	movdqu XMMWORD PTR [r9+rbx], xmm0
+	movdqa xmm4, xmm6
+	movdqu XMMWORD PTR [rcx+rbx], xmm2
+	movdqu XMMWORD PTR [r10+rbx], xmm1
+	mov QWORD PTR [r14], r8
+	xor r8, rdi
+	mov r10, r8
+	mov QWORD PTR [r14+8], r11
+	and r10d, 2097136
+	xor r11, r12
+	dec rsi
+	jne $main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov rbx, QWORD PTR [rsp+160]
+	movaps xmm6, XMMWORD PTR [rsp+64]
+	movaps xmm7, XMMWORD PTR [rsp+48]
+	movaps xmm8, XMMWORD PTR [rsp+32]
+	add rsp, 80
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rdi
+	pop rsi
+	pop rbp
+	jmp $cnv2_main_loop_ivybridge_endp
+
+$sqrt_fixup_ivybridge:
+	dec rdx
+	mov r13, -4389456576512
+	mov rax, rdx
+	shr rdx, 19
+	shr rax, 20
+	mov rcx, rdx
+	sub rcx, rax
+	add rax, r13
+	mov r13, 4389456576511
+	sub rcx, r13
+	mov r13d, -2147483647
+	imul rcx, rax
+	sub rcx, r9
+	adc rdx, 0
+	movq xmm3, rdx
+	jmp $sqrt_fixup_ivybridge_ret
+
+$cnv2_main_loop_ivybridge_endp:
; -----------------------------------------------------------------------------
; File: cryptonigh_v8_main_loop_ryzen.inc
;
; Second variant of the same loop; like the ivybridge body it is included by
; both wrappers, so it is documented here rather than inline.
;
; Input
;   rcx  pointer to the same kind of block: qwords at offsets 0..104 and 224
;        are read, and the qword at +224 becomes rbx, the base of every
;        [reg+rbx] access.
;
; Prologue / stack layout (offsets after "sub rsp, 64")
;   rbx, rbp, rsi are stored at entry [rsp+16], [rsp+24], [rsp+32];
;   rdi, r12-r15 are pushed (5 x 8 = 40 bytes); 64 more bytes are reserved.
;   [rsp+0]   saved MXCSR
;   [rsp+4]   replacement MXCSR value 24448 (0x5F80)
;   [rsp+16]  xmm8   [rsp+32]  xmm7   [rsp+48]  xmm6
;
; Differences from the ivybridge body that are visible in the code
;   * The iteration counter is ebp (524288), decremented with "dec ebp".
;   * All 16-byte accesses through rbx use movdqa instead of movdqu. Indices
;     are multiples of 16 (mask 2097136), so the base held in rbx has to be
;     16-byte aligned for these instructions.
;   * The square-root result lives in the general-purpose register rdi
;     (shifted with "shr rdi, 19") instead of xmm3; there is no psubq.
;   * The divisor mask -2147483647 is an immediate ("or r9d, -2147483647"),
;     so r13 is free and is used to hold the qword at [r10+rbx+8].
;   * Quotient and remainder are packed with punpckldq in xmm0 rather than
;     with shl/add in integer registers.
;   * xmm7 holds the constant 1023 shl 52; xmm8 is zeroed with xorps and
;     copied into xmm1 once per iteration.
;
; Notes on non-obvious instructions
;   * "mov ax, 1023" is NOT preceded by a zeroing of rax here, so bits 16..63
;     of rax still hold the previous value. The following "shl rax, 52"
;     discards every bit above bit 11, and bits 12..15 of 1023 are zero, so
;     the result is 1023 shl 52 regardless of the stale upper bits.
;   * $sqrt_fixup_ryzen takes its reference value from xmm2, which was copied
;     from xmm0 before the "psrlq xmm0, 12". It clobbers rax, rcx, rdx and r9;
;     rax and rdx are overwritten again by "mov rax, rsi / mul r14" right
;     after the return label.
;
; Epilogue
;   r11 = rsp + 64 is used as the frame anchor: rbx, rbp, rsi are reloaded
;   from [r11+56], [r11+64], [r11+72] (= entry rsp + 16/24/32), xmm8 from
;   [r11-48] (= [rsp+16]), then "mov rsp, r11" drops the 64-byte area before
;   the five pops. Control then jumps to $cnv2_main_loop_ryzen_endp, the last
;   line of the file, and falls through into the wrapper.
; -----------------------------------------------------------------------------
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
new file mode 100644
index 0000000000000000000000000000000000000000..5797f5497ad21972d49b0b9f714b4595bcbb570e
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
@@ -0,0 +1,174 @@
+	mov QWORD PTR [rsp+16], rbx
+	mov QWORD PTR [rsp+24], rbp
+	mov QWORD PTR [rsp+32], rsi
+	push rdi
+	push r12
+	push r13
+	push r14
+	push r15
+	sub rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov rax, QWORD PTR [rcx+48]
+	mov r9, rcx
+	xor rax, QWORD PTR [rcx+16]
+	mov ebp, 524288
+	mov r8, QWORD PTR [rcx+32]
+	xor r8, QWORD PTR [rcx]
+	mov r11, QWORD PTR [rcx+40]
+	mov r10, r8
+	mov rdx, QWORD PTR [rcx+56]
+	movq xmm3, rax
+	xor rdx, QWORD PTR [rcx+24]
+	xor r11, QWORD PTR [rcx+8]
+	mov rbx, QWORD PTR [rcx+224]
+	mov rax, QWORD PTR [r9+80]
+	xor rax, QWORD PTR [r9+64]
+	movq xmm0, rdx
+	mov rcx, QWORD PTR [rcx+88]
+	xor rcx, QWORD PTR [r9+72]
+	mov rdi, QWORD PTR [r9+104]
+	and r10d, 2097136
+	movaps XMMWORD PTR [rsp+48], xmm6
+	movq xmm4, rax
+	movaps XMMWORD PTR [rsp+32], xmm7
+	movaps XMMWORD PTR [rsp+16], xmm8
+	xorps xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movq xmm7, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movq xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 64
+$main_loop_ryzen:
+	movdqa xmm5, XMMWORD PTR [r10+rbx]
+	movq xmm0, r11
+	movq xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea rdx, QWORD PTR [r10+rbx]
+	lea r9, QWORD PTR [rdi+rdi]
+	shl rdi, 32
+
+	mov ecx, r10d
+	mov eax, r10d
+	xor ecx, 16
+	xor eax, 32
+	xor r10d, 48
+	aesenc xmm5, xmm6
+	movdqa xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa xmm1, XMMWORD PTR [rax+rbx]
+	movdqa xmm0, XMMWORD PTR [r10+rbx]
+	paddq xmm2, xmm3
+	paddq xmm1, xmm6
+	paddq xmm0, xmm4
+	movdqa XMMWORD PTR [rcx+rbx], xmm0
+	movdqa XMMWORD PTR [rax+rbx], xmm2
+	movdqa XMMWORD PTR [r10+rbx], xmm1
+
+	movaps xmm1, xmm8
+	mov rsi, r15
+	xor rsi, rdi
+	movq r14, xmm5
+	movdqa xmm0, xmm5
+	pxor xmm0, xmm3
+	mov r10, r14
+	and r10d, 2097136
+	movdqa XMMWORD PTR [rdx], xmm0
+	xor rsi, QWORD PTR [r10+rbx]
+	lea r12, QWORD PTR [r10+rbx]
+	mov r13, QWORD PTR [r10+rbx+8]
+
+	add r9d, r14d
+	or r9d, -2147483647
+	xor edx, edx
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	movq rax, xmm0
+
+	div r9
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1
+	movq r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq xmm0, xmm7
+	sqrtsd xmm1, xmm0
+	movq rdi, xmm1
+	test rdi, 524287
+	je $sqrt_fixup_ryzen
+	shr rdi, 19
+
+$sqrt_fixup_ryzen_ret:
+	mov rax, rsi
+	mul r14
+
+	mov r9d, r10d
+	mov ecx, r10d
+	xor r9d, 16
+	xor ecx, 32
+	xor r10d, 48
+	movdqa xmm0, XMMWORD PTR [r10+rbx]
+	movdqa xmm2, XMMWORD PTR [r9+rbx]
+	movdqa xmm1, XMMWORD PTR [rcx+rbx]
+	paddq xmm0, xmm4
+	paddq xmm2, xmm3
+	paddq xmm1, xmm6
+	movdqa XMMWORD PTR [r9+rbx], xmm0
+	movdqa XMMWORD PTR [rcx+rbx], xmm2
+	movdqa XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa xmm4, xmm3
+	add r8, rdx
+	add r11, rax
+	mov QWORD PTR [r12], r8
+	xor r8, rsi
+	mov QWORD PTR [r12+8], r11
+	mov r10, r8
+	xor r11, r13
+	and r10d, 2097136
+	movdqa xmm3, xmm5
+	dec ebp
+	jne $main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps xmm6, XMMWORD PTR [rsp+48]
+	lea r11, QWORD PTR [rsp+64]
+	mov rbx, QWORD PTR [r11+56]
+	mov rbp, QWORD PTR [r11+64]
+	mov rsi, QWORD PTR [r11+72]
+	movaps xmm8, XMMWORD PTR [r11-48]
+	movaps xmm7, XMMWORD PTR [rsp+32]
+	mov rsp, r11
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rdi
+	jmp $cnv2_main_loop_ryzen_endp
+
+$sqrt_fixup_ryzen:
+	movq r9, xmm2
+	dec rdi
+	mov rdx, 4389456576511
+	mov rax, rdi
+	shr rdi, 19
+	shr rax, 20
+	mov rcx, rdi
+	sub rcx, rax
+	sub rcx, rdx
+	mov rdx, -4389456576512
+	add rax, rdx
+	imul rcx, rax
+	sub rcx, r9
+	adc rdi, 0
+	jmp $sqrt_fixup_ryzen_ret
+
+$cnv2_main_loop_ryzen_endp: