diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc new file mode 100644 index 0000000000000000000000000000000000000000..79adab671e1bbb0f0be1178764282817267049fb --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + +ALIGN 16 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc new file mode 100644 index 0000000000000000000000000000000000000000..ad8f1823309c2cc2f77391e87de6e6de8b36de84 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movd xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movd xmm7, rax + + mov eax, 1 + shl rax, 52 + movd xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movd xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movd xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movd xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movd xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movd xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 64 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movd xmm0, r11 + movd xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movd r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movd xmm0, rbp + movd xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movd rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movd rdx, xmm5 + shl rdx, 32 + movd rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movd xmm0, rdx + xor rdx, [r11+r13] + movd xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movd r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movd r11, xmm0 + psrldq xmm1, 8 + movd r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movd rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movd rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movd r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movd r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movd r9, xmm10 + psrldq xmm1, 8 + movd r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movd r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movd xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movd r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movd xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S index b6be9438f65c19d9e305419982184d845d9c81c3..c0a3d0b41498ee419deca860ea1c307407a5d0ab 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S @@ -9,6 +9,7 @@ #endif .global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm) .global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm) +.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm) ALIGN 8 FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm): @@ -25,3 +26,12 @@ FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm): #include "cryptonight_v8_main_loop_ryzen_linux.inc" add rsp, 48 ret 0 + +ALIGN 16 +FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cryptonight_v8_double_main_loop_sandybridge_linux.inc" + add rsp, 48 + ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm index a1615e9bd5a630e403582f43bbadea86633b6ef2..1f3d2e15cebb3268ce566f62921f40ce9eafafc7 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm @@ -1,6 +1,7 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE PUBLIC cryptonight_v8_mainloop_ivybridge_asm PUBLIC cryptonight_v8_mainloop_ryzen_asm +PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm ALIGN 8 cryptonight_v8_mainloop_ivybridge_asm PROC @@ -14,5 +15,11 @@ cryptonight_v8_mainloop_ryzen_asm PROC ret 0 cryptonight_v8_mainloop_ryzen_asm ENDP +ALIGN 8 +cryptonight_v8_double_mainloop_sandybridge_asm PROC + INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc + ret 0 +cryptonight_v8_double_mainloop_sandybridge_asm ENDP + _TEXT_CNV8_MAINLOOP ENDS END diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index c0f122fd6f94bc658a746e85af170c4835a54837..e8c0aca2bd532e9a2eaa7a351086bca25107d405 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -940,21 +940,63 @@ struct Cryptonight_hash<5> extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); -template<xmrstak_algo ALGO, int asm_version> -void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + +template< size_t N, size_t asm_version> +struct Cryptonight_hash_asm; + +template<size_t asm_version> +struct Cryptonight_hash_asm<1, asm_version> { - constexpr size_t MEM = cn_select_memory<ALGO>(); + static constexpr size_t N = 1; - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + template<xmrstak_algo ALGO> + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MEM = cn_select_memory<ALGO>(); - if (asm_version == 1) - cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); - else - cryptonight_v8_mainloop_ryzen_asm(ctx[0]); + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); - cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); - keccakf((uint64_t*)ctx[0]->hash_state, 24); - extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); -} + if(asm_version == 0) + cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); + else if(asm_version == 1) + cryptonight_v8_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); + } +}; + +// double hash only for intel +template< > +struct Cryptonight_hash_asm<2, 0> +{ + static constexpr size_t N = 2; + + template<xmrstak_algo ALGO> + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MEM = cn_select_memory<ALGO>(); + + for(size_t i = 0; i < N; ++i) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + /* Optim - 99% time boundary */ + cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + + for(size_t i = 0; i < N; ++i) + { + /* Optim - 90% time boundary */ + cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + /* Optim - 99% time boundary */ + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } + } +}; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index a344a9ffe74a17f584313ae7e28940c0d6567359..bb80b938f7a9cbda6546964059e9dd8710bb5471 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -455,24 +455,27 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc static_assert(N >= 1, "number of threads must be >= 1" ); // check for asm optimized version for cryptonight_v8 - if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) { if(asm_version_str != "off") { + if(asm_version_str != "intel" && asm_version_str != "ryzen") + printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); + if(asm_version_str == "intel") { // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) - return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>; + if(N == 1) + return Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>; + else if(N == 2) + return Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>; } - if(asm_version_str == "ryzen") + // supports only 1 thread per hash + if(N == 1 && asm_version_str == "ryzen") { // AMD Ryzen (1xxx and 2xxx series) - return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>; - } - else - { - printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); - } + return Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>; + } } } // We have two independent flag bits in the functions