From 69186f2470f5291bf3b479794d35bdba12d27bcd Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 19 Apr 2021 12:29:44 +0200 Subject: [PATCH] Optimized cn/upx for Zen3 0.9% faster --- .../asm/cn2/cnv2_upx_double_mainloop_zen3.inc | 18 ++++++++--------- .../cn2/cnv2_upx_double_mainloop_zen3.inc | 20 +++++++++---------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc index e0710086f..4f6b70a04 100644 --- a/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc +++ b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc @@ -7,7 +7,7 @@ push r13 push r14 push r15 - sub rsp, 216 + sub rsp, 232 mov rdi, QWORD PTR [rcx+8] @@ -84,6 +84,8 @@ movq xmm10, rax mov rax, 4389456576511 mov QWORD PTR [rsp+16], rax + mov rax, -4389456576512 + mov QWORD PTR [rsp+216], rax punpcklqdq xmm10, xmm0 ALIGN(64) @@ -170,14 +172,13 @@ upx2_main_loop: movq xmm0, rax paddq xmm0, xmm11 sqrtsd xmm1, xmm0 - mov r13, -4389456576512 movq rdx, xmm1 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax - add rax, r13 + add rax, QWORD PTR [rsp+216] sub rcx, QWORD PTR [rsp+16] mov r13, QWORD PTR [rsp] imul rcx, rax @@ -251,12 +252,10 @@ upx2_main_loop: shr rdx, 19 mov rcx, rdx sub rcx, rax - mov rbx, 4389456576511 - sub rcx, rbx + sub rcx, QWORD PTR [rsp+16] movdqa xmm9, xmm7 - mov rbx, -4389456576512 movdqa xmm7, xmm6 - add rax, rbx + add rax, QWORD PTR [rsp+216] imul rcx, rax mov rax, r9 sub rcx, r8 @@ -264,10 +263,9 @@ upx2_main_loop: adc rdx, 0 xor rcx, 32 and ecx, 131056 - movq xmm0, rdx + mov QWORD PTR [rsp+32], rdx movdqu xmm1, XMMWORD PTR [rcx+r13] mul rdi - movdqa XMMWORD PTR [rsp+32], xmm0 paddq xmm1, xmm5 mov r8, rax xor r8, QWORD PTR [rcx+r13+8] @@ -301,7 +299,7 @@ upx2_main_loop: ldmxcsr DWORD PTR [rsp+24] movaps xmm13, XMMWORD PTR [rsp+80] - lea r11, QWORD PTR [rsp+216] + lea r11, QWORD PTR [rsp+232] movaps xmm6, XMMWORD PTR [r11-24] movaps xmm7, XMMWORD PTR [r11-40] movaps xmm8, XMMWORD PTR [r11-56] diff --git a/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc index 284956fee..854fbf111 100644 --- a/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc +++ b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc @@ -7,7 +7,7 @@ push r13 push r14 push r15 - sub rsp, 216 + sub rsp, 232 mov rdi, QWORD PTR [rcx+8] @@ -84,6 +84,8 @@ movd xmm10, rax mov rax, 4389456576511 mov QWORD PTR [rsp+16], rax + mov rax, -4389456576512 + mov QWORD PTR [rsp+216], rax punpcklqdq xmm10, xmm0 ALIGN(64) @@ -170,14 +172,13 @@ upx2_main_loop: movd xmm0, rax paddq xmm0, xmm11 sqrtsd xmm1, xmm0 - mov r13, -4389456576512 movd rdx, xmm1 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax - add rax, r13 + add rax, QWORD PTR [rsp+216] sub rcx, QWORD PTR [rsp+16] mov r13, QWORD PTR [rsp] imul rcx, rax @@ -251,12 +252,10 @@ upx2_main_loop: shr rdx, 19 mov rcx, rdx sub rcx, rax - mov rbx, 4389456576511 - sub rcx, rbx + sub rcx, QWORD PTR [rsp+16] movdqa xmm9, xmm7 - mov rbx, -4389456576512 movdqa xmm7, xmm6 - add rax, rbx + add rax, QWORD PTR [rsp+216] imul rcx, rax mov rax, r9 sub rcx, r8 @@ -264,10 +263,9 @@ upx2_main_loop: adc rdx, 0 xor rcx, 32 and ecx, 131056 - movd xmm0, rdx + mov QWORD PTR [rsp+32], rdx movdqu xmm1, XMMWORD PTR [rcx+r13] mul rdi - movdqa XMMWORD PTR [rsp+32], xmm0 paddq xmm1, xmm5 mov r8, rax xor r8, QWORD PTR [rcx+r13+8] @@ -298,10 +296,10 @@ upx2_main_loop: sub QWORD PTR [rsp+8], 1 jne upx2_main_loop - ldmxcsr DWORD PTR [rsp+28] + ldmxcsr DWORD PTR [rsp+24] movaps xmm13, XMMWORD PTR [rsp+80] - lea r11, QWORD PTR [rsp+216] + lea r11, QWORD PTR [rsp+232] movaps xmm6, XMMWORD PTR [r11-24] movaps xmm7, XMMWORD PTR [r11-40] movaps xmm8, XMMWORD PTR [r11-56]