mirror of
https://github.com/xmrig/xmrig.git
synced 2024-12-23 12:09:22 +00:00
Optimized cn/upx2 for Ryzen CPUs
This commit is contained in:
parent
ed456b02cf
commit
16fe462cad
8 changed files with 682 additions and 7 deletions
|
@ -293,12 +293,6 @@ xmrig::CnHash::CnHash()
|
||||||
# ifdef XMRIG_ALGO_CN_FEMTO
|
# ifdef XMRIG_ALGO_CN_FEMTO
|
||||||
ADD_FN(Algorithm::CN_UPX2);
|
ADD_FN(Algorithm::CN_UPX2);
|
||||||
ADD_FN_ASM(Algorithm::CN_UPX2);
|
ADD_FN_ASM(Algorithm::CN_UPX2);
|
||||||
|
|
||||||
# if defined(_MSC_VER) && defined(XMRIG_FEATURE_ASM)
|
|
||||||
// This is somehow faster on Ryzen
|
|
||||||
m_map[Algorithm::CN_UPX2][AV_DOUBLE][Assembly::RYZEN] = cryptonight_double_hash<Algorithm::CN_UPX2, false>;
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
# ifdef XMRIG_ALGO_ARGON2
|
# ifdef XMRIG_ALGO_ARGON2
|
||||||
|
|
|
@ -789,6 +789,7 @@ extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx);
|
||||||
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx);
|
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx);
|
||||||
extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx);
|
extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx);
|
||||||
extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx);
|
extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx);
|
||||||
|
extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx);
|
||||||
|
|
||||||
|
|
||||||
namespace xmrig {
|
namespace xmrig {
|
||||||
|
@ -986,7 +987,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
|
||||||
# endif
|
# endif
|
||||||
# ifdef XMRIG_ALGO_CN_FEMTO
|
# ifdef XMRIG_ALGO_CN_FEMTO
|
||||||
else if (ALGO == Algorithm::CN_UPX2) {
|
else if (ALGO == Algorithm::CN_UPX2) {
|
||||||
cn_upx2_double_mainloop_asm(ctx);
|
if (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) {
|
||||||
|
cnv2_upx_double_mainloop_zen3_asm(ctx);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
cn_upx2_double_mainloop_asm(ctx);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
# endif
|
# endif
|
||||||
else if (ALGO == Algorithm::CN_RWZ) {
|
else if (ALGO == Algorithm::CN_RWZ) {
|
||||||
|
|
322
src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc
Normal file
322
src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
mov rax, rsp
|
||||||
|
push rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 216
|
||||||
|
|
||||||
|
mov rdi, QWORD PTR [rcx+8]
|
||||||
|
|
||||||
|
mov edx, 768
|
||||||
|
mov rbx, QWORD PTR [rcx]
|
||||||
|
mov ecx, 256
|
||||||
|
movaps XMMWORD PTR [rax-88], xmm6
|
||||||
|
movaps XMMWORD PTR [rax-104], xmm7
|
||||||
|
mov r13, QWORD PTR [rdi+224]
|
||||||
|
movq xmm0, QWORD PTR [rdi+104]
|
||||||
|
mov r12, QWORD PTR [rbx+224]
|
||||||
|
movaps XMMWORD PTR [rax-120], xmm8
|
||||||
|
movaps XMMWORD PTR [rax-136], xmm9
|
||||||
|
movaps XMMWORD PTR [rax-152], xmm10
|
||||||
|
movaps XMMWORD PTR [rsp+112], xmm11
|
||||||
|
movaps XMMWORD PTR [rsp+96], xmm12
|
||||||
|
movaps XMMWORD PTR [rsp+80], xmm13
|
||||||
|
movq xmm13, QWORD PTR [rbx+96]
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm14
|
||||||
|
movq xmm14, QWORD PTR [rbx+104]
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm15
|
||||||
|
movq xmm15, QWORD PTR [rdi+96]
|
||||||
|
mov QWORD PTR [rsp], r13
|
||||||
|
movdqa XMMWORD PTR [rsp+32], xmm0
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp+24]
|
||||||
|
mov DWORD PTR [rsp+28], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+28]
|
||||||
|
|
||||||
|
mov rcx, QWORD PTR [rbx+56]
|
||||||
|
xorps xmm12, xmm12
|
||||||
|
xor rcx, QWORD PTR [rbx+24]
|
||||||
|
mov rax, QWORD PTR [rbx+48]
|
||||||
|
xor rax, QWORD PTR [rbx+16]
|
||||||
|
mov rsi, QWORD PTR [rbx+32]
|
||||||
|
mov rbp, QWORD PTR [rdi+32]
|
||||||
|
movq xmm0, rcx
|
||||||
|
|
||||||
|
mov rcx, QWORD PTR [rbx+88]
|
||||||
|
xor rcx, QWORD PTR [rbx+72]
|
||||||
|
movq xmm7, rax
|
||||||
|
mov rax, QWORD PTR [rbx+80]
|
||||||
|
xor rax, QWORD PTR [rbx+64]
|
||||||
|
mov r14, QWORD PTR [rbx+40]
|
||||||
|
mov r15, QWORD PTR [rdi+40]
|
||||||
|
xor rsi, QWORD PTR [rbx]
|
||||||
|
xor rbp, QWORD PTR [rdi]
|
||||||
|
movq xmm9, rax
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rdi+48]
|
||||||
|
xor rax, QWORD PTR [rdi+16]
|
||||||
|
xor r14, QWORD PTR [rbx+8]
|
||||||
|
xor r15, QWORD PTR [rdi+8]
|
||||||
|
movq xmm8, rax
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
|
||||||
|
mov eax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm11, rax
|
||||||
|
punpcklqdq xmm11, xmm11
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rdi+80]
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [rdi+56]
|
||||||
|
xor rcx, QWORD PTR [rdi+24]
|
||||||
|
punpcklqdq xmm9, xmm0
|
||||||
|
mov QWORD PTR [rsp+8], 16384
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [rdi+88]
|
||||||
|
xor rcx, QWORD PTR [rdi+72]
|
||||||
|
xor rax, QWORD PTR [rdi+64]
|
||||||
|
punpcklqdq xmm8, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
movq xmm10, rax
|
||||||
|
mov rax, 4389456576511
|
||||||
|
mov QWORD PTR [rsp+16], rax
|
||||||
|
punpcklqdq xmm10, xmm0
|
||||||
|
|
||||||
|
ALIGN(64)
|
||||||
|
upx2_main_loop:
|
||||||
|
mov rdx, rsi
|
||||||
|
mov r9, rbp
|
||||||
|
and edx, 131056
|
||||||
|
and r9d, 131056
|
||||||
|
movdqu xmm6, XMMWORD PTR [rdx+r12]
|
||||||
|
lea r8, QWORD PTR [rdx+r12]
|
||||||
|
movdqu xmm4, XMMWORD PTR [r9+r13]
|
||||||
|
lea r10, QWORD PTR [r9+r13]
|
||||||
|
mov ecx, edx
|
||||||
|
mov eax, edx
|
||||||
|
xor rax, 32
|
||||||
|
xor rcx, 48
|
||||||
|
xor rdx, 16
|
||||||
|
movq xmm0, r14
|
||||||
|
movq xmm3, rsi
|
||||||
|
movq xmm5, rbp
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movq xmm0, r15
|
||||||
|
movdqu xmm2, XMMWORD PTR [rax+r12]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r12]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
aesenc xmm6, xmm3
|
||||||
|
aesenc xmm4, xmm5
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
movq rdi, xmm4
|
||||||
|
paddq xmm0, XMMWORD PTR [rdx+r12]
|
||||||
|
movdqu XMMWORD PTR [rdx+r12], xmm0
|
||||||
|
xor edx, edx
|
||||||
|
movdqu XMMWORD PTR [rax+r12], xmm1
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
movdqu XMMWORD PTR [rcx+r12], xmm2
|
||||||
|
pxor xmm0, xmm7
|
||||||
|
movdqu XMMWORD PTR [r8], xmm0
|
||||||
|
mov ecx, r9d
|
||||||
|
xor rcx, 48
|
||||||
|
mov eax, r9d
|
||||||
|
xor rax, 32
|
||||||
|
xor r9, 16
|
||||||
|
movdqa xmm0, xmm10
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r13]
|
||||||
|
movdqu xmm2, XMMWORD PTR [rax+r13]
|
||||||
|
paddq xmm1, xmm8
|
||||||
|
paddq xmm0, XMMWORD PTR [r9+r13]
|
||||||
|
paddq xmm2, xmm5
|
||||||
|
movdqu XMMWORD PTR [r9+r13], xmm0
|
||||||
|
movq r9, xmm6
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm1
|
||||||
|
movdqa xmm0, xmm4
|
||||||
|
movdqu XMMWORD PTR [rcx+r13], xmm2
|
||||||
|
pxor xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [r10], xmm0
|
||||||
|
movq rcx, xmm14
|
||||||
|
mov rax, rcx
|
||||||
|
movq r10, xmm13
|
||||||
|
shl rax, 32
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
xor r10, rax
|
||||||
|
psrldq xmm0, 8
|
||||||
|
lea r8, QWORD PTR [rcx+rcx]
|
||||||
|
movq rax, xmm0
|
||||||
|
add r8d, r9d
|
||||||
|
mov ecx, -2147483647
|
||||||
|
or r8, rcx
|
||||||
|
mov r11, r9
|
||||||
|
div r8
|
||||||
|
and r11d, 131056
|
||||||
|
movaps xmm1, xmm12
|
||||||
|
mov eax, eax
|
||||||
|
add r11, r12
|
||||||
|
shl rdx, 32
|
||||||
|
add rdx, rax
|
||||||
|
xor r10, QWORD PTR [r11]
|
||||||
|
mov rbx, QWORD PTR [r11+8]
|
||||||
|
lea r8, QWORD PTR [rdx+r9]
|
||||||
|
movq xmm13, rdx
|
||||||
|
mov rax, r8
|
||||||
|
shr rax, 12
|
||||||
|
movq xmm0, rax
|
||||||
|
paddq xmm0, xmm11
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
mov r13, -4389456576512
|
||||||
|
movq rdx, xmm1
|
||||||
|
mov rax, rdx
|
||||||
|
shr rdx, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
add rax, r13
|
||||||
|
sub rcx, QWORD PTR [rsp+16]
|
||||||
|
mov r13, QWORD PTR [rsp]
|
||||||
|
imul rcx, rax
|
||||||
|
mov rax, r10
|
||||||
|
sub rcx, r8
|
||||||
|
mov rcx, r9
|
||||||
|
adc rdx, 0
|
||||||
|
xor rcx, 32
|
||||||
|
and ecx, 131056
|
||||||
|
movq xmm14, rdx
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r12]
|
||||||
|
mul r9
|
||||||
|
paddq xmm1, xmm3
|
||||||
|
mov r8, rax
|
||||||
|
xor r8, QWORD PTR [rcx+r12+8]
|
||||||
|
add r14, r8
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm2, rdx
|
||||||
|
xor rdx, QWORD PTR [rcx+r12]
|
||||||
|
mov rax, r9
|
||||||
|
xor rax, 48
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
and eax, 131056
|
||||||
|
add rsi, rdx
|
||||||
|
xor r9, 16
|
||||||
|
xor edx, edx
|
||||||
|
and r9d, 131056
|
||||||
|
movdqu xmm0, XMMWORD PTR [rax+r12]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
pxor xmm2, XMMWORD PTR [r9+r12]
|
||||||
|
paddq xmm2, xmm9
|
||||||
|
movdqu XMMWORD PTR [r9+r12], xmm2
|
||||||
|
movq r9, xmm15
|
||||||
|
movdqu XMMWORD PTR [rcx+r12], xmm0
|
||||||
|
movdqa xmm0, xmm4
|
||||||
|
mov rcx, QWORD PTR [rsp+32]
|
||||||
|
movdqu XMMWORD PTR [rax+r12], xmm1
|
||||||
|
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
movaps xmm1, xmm12
|
||||||
|
xor r9, rax
|
||||||
|
psrldq xmm0, 8
|
||||||
|
lea r8, QWORD PTR [rcx+rcx]
|
||||||
|
mov QWORD PTR [r11], rsi
|
||||||
|
add r8d, edi
|
||||||
|
mov QWORD PTR [r11+8], r14
|
||||||
|
movq rax, xmm0
|
||||||
|
mov ecx, -2147483647
|
||||||
|
or r8, rcx
|
||||||
|
xor rsi, r10
|
||||||
|
div r8
|
||||||
|
mov r10, rdi
|
||||||
|
xor r14, rbx
|
||||||
|
mov eax, eax
|
||||||
|
and r10d, 131056
|
||||||
|
shl rdx, 32
|
||||||
|
add r10, r13
|
||||||
|
add rdx, rax
|
||||||
|
xor r9, QWORD PTR [r10]
|
||||||
|
mov r11, QWORD PTR [r10+8]
|
||||||
|
lea r8, QWORD PTR [rdx+rdi]
|
||||||
|
mov rax, r8
|
||||||
|
movq xmm15, rdx
|
||||||
|
shr rax, 12
|
||||||
|
movq xmm0, rax
|
||||||
|
paddq xmm0, xmm11
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movq rdx, xmm1
|
||||||
|
mov rax, rdx
|
||||||
|
shr rax, 20
|
||||||
|
shr rdx, 19
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
mov rbx, 4389456576511
|
||||||
|
sub rcx, rbx
|
||||||
|
movdqa xmm9, xmm7
|
||||||
|
mov rbx, -4389456576512
|
||||||
|
movdqa xmm7, xmm6
|
||||||
|
add rax, rbx
|
||||||
|
imul rcx, rax
|
||||||
|
mov rax, r9
|
||||||
|
sub rcx, r8
|
||||||
|
mov rcx, rdi
|
||||||
|
adc rdx, 0
|
||||||
|
xor rcx, 32
|
||||||
|
and ecx, 131056
|
||||||
|
movq xmm0, rdx
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r13]
|
||||||
|
mul rdi
|
||||||
|
movdqa XMMWORD PTR [rsp+32], xmm0
|
||||||
|
paddq xmm1, xmm5
|
||||||
|
mov r8, rax
|
||||||
|
xor r8, QWORD PTR [rcx+r13+8]
|
||||||
|
add r15, r8
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm2, rdx
|
||||||
|
xor rdx, QWORD PTR [rcx+r13]
|
||||||
|
mov rax, rdi
|
||||||
|
xor rdi, 16
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
xor rax, 48
|
||||||
|
and edi, 131056
|
||||||
|
and eax, 131056
|
||||||
|
add rbp, rdx
|
||||||
|
pxor xmm2, XMMWORD PTR [rdi+r13]
|
||||||
|
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||||
|
paddq xmm2, xmm10
|
||||||
|
movdqu XMMWORD PTR [rdi+r13], xmm2
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [rcx+r13], xmm0
|
||||||
|
movdqa xmm10, xmm8
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm1
|
||||||
|
movdqa xmm8, xmm4
|
||||||
|
mov QWORD PTR [r10], rbp
|
||||||
|
xor rbp, r9
|
||||||
|
mov QWORD PTR [r10+8], r15
|
||||||
|
xor r15, r11
|
||||||
|
sub QWORD PTR [rsp+8], 1
|
||||||
|
jne upx2_main_loop
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp+28]
|
||||||
|
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+80]
|
||||||
|
lea r11, QWORD PTR [rsp+216]
|
||||||
|
movaps xmm6, XMMWORD PTR [r11-24]
|
||||||
|
movaps xmm7, XMMWORD PTR [r11-40]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-56]
|
||||||
|
movaps xmm9, XMMWORD PTR [r11-72]
|
||||||
|
movaps xmm10, XMMWORD PTR [r11-88]
|
||||||
|
movaps xmm11, XMMWORD PTR [r11-104]
|
||||||
|
movaps xmm12, XMMWORD PTR [r11-120]
|
||||||
|
movaps xmm14, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm15, XMMWORD PTR [rsp+48]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
|
@ -17,6 +17,7 @@
|
||||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||||
.global FN_PREFIX(cnv2_rwz_mainloop_asm)
|
.global FN_PREFIX(cnv2_rwz_mainloop_asm)
|
||||||
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
|
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
|
||||||
|
.global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm)
|
||||||
|
|
||||||
ALIGN(64)
|
ALIGN(64)
|
||||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||||
|
@ -72,6 +73,15 @@ FN_PREFIX(cnv2_rwz_double_mainloop_asm):
|
||||||
ret 0
|
ret 0
|
||||||
mov eax, 3735929054
|
mov eax, 3735929054
|
||||||
|
|
||||||
|
ALIGN(64)
|
||||||
|
FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm):
|
||||||
|
sub rsp, 48
|
||||||
|
mov rcx, rdi
|
||||||
|
#include "cn2/cnv2_upx_double_mainloop_zen3.inc"
|
||||||
|
add rsp, 48
|
||||||
|
ret 0
|
||||||
|
mov eax, 3735929054
|
||||||
|
|
||||||
#if defined(__linux__) && defined(__ELF__)
|
#if defined(__linux__) && defined(__ELF__)
|
||||||
.section .note.GNU-stack,"",%progbits
|
.section .note.GNU-stack,"",%progbits
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -48,5 +48,12 @@ cnv2_rwz_double_mainloop_asm PROC
|
||||||
mov eax, 3735929054
|
mov eax, 3735929054
|
||||||
cnv2_rwz_double_mainloop_asm ENDP
|
cnv2_rwz_double_mainloop_asm ENDP
|
||||||
|
|
||||||
|
ALIGN(64)
|
||||||
|
cnv2_upx_double_mainloop_zen3_asm PROC
|
||||||
|
INCLUDE cn2/cnv2_upx_double_mainloop_zen3.inc
|
||||||
|
ret 0
|
||||||
|
mov eax, 3735929054
|
||||||
|
cnv2_upx_double_mainloop_zen3_asm ENDP
|
||||||
|
|
||||||
_TEXT_CNV2_MAINLOOP ENDS
|
_TEXT_CNV2_MAINLOOP ENDS
|
||||||
END
|
END
|
||||||
|
|
322
src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc
Normal file
322
src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
mov rax, rsp
|
||||||
|
push rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 216
|
||||||
|
|
||||||
|
mov rdi, QWORD PTR [rcx+8]
|
||||||
|
|
||||||
|
mov edx, 768
|
||||||
|
mov rbx, QWORD PTR [rcx]
|
||||||
|
mov ecx, 256
|
||||||
|
movaps XMMWORD PTR [rax-88], xmm6
|
||||||
|
movaps XMMWORD PTR [rax-104], xmm7
|
||||||
|
mov r13, QWORD PTR [rdi+224]
|
||||||
|
movd xmm0, QWORD PTR [rdi+104]
|
||||||
|
mov r12, QWORD PTR [rbx+224]
|
||||||
|
movaps XMMWORD PTR [rax-120], xmm8
|
||||||
|
movaps XMMWORD PTR [rax-136], xmm9
|
||||||
|
movaps XMMWORD PTR [rax-152], xmm10
|
||||||
|
movaps XMMWORD PTR [rsp+112], xmm11
|
||||||
|
movaps XMMWORD PTR [rsp+96], xmm12
|
||||||
|
movaps XMMWORD PTR [rsp+80], xmm13
|
||||||
|
movd xmm13, QWORD PTR [rbx+96]
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm14
|
||||||
|
movd xmm14, QWORD PTR [rbx+104]
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm15
|
||||||
|
movd xmm15, QWORD PTR [rdi+96]
|
||||||
|
mov QWORD PTR [rsp], r13
|
||||||
|
movdqa XMMWORD PTR [rsp+32], xmm0
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp+24]
|
||||||
|
mov DWORD PTR [rsp+28], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+28]
|
||||||
|
|
||||||
|
mov rcx, QWORD PTR [rbx+56]
|
||||||
|
xorps xmm12, xmm12
|
||||||
|
xor rcx, QWORD PTR [rbx+24]
|
||||||
|
mov rax, QWORD PTR [rbx+48]
|
||||||
|
xor rax, QWORD PTR [rbx+16]
|
||||||
|
mov rsi, QWORD PTR [rbx+32]
|
||||||
|
mov rbp, QWORD PTR [rdi+32]
|
||||||
|
movd xmm0, rcx
|
||||||
|
|
||||||
|
mov rcx, QWORD PTR [rbx+88]
|
||||||
|
xor rcx, QWORD PTR [rbx+72]
|
||||||
|
movd xmm7, rax
|
||||||
|
mov rax, QWORD PTR [rbx+80]
|
||||||
|
xor rax, QWORD PTR [rbx+64]
|
||||||
|
mov r14, QWORD PTR [rbx+40]
|
||||||
|
mov r15, QWORD PTR [rdi+40]
|
||||||
|
xor rsi, QWORD PTR [rbx]
|
||||||
|
xor rbp, QWORD PTR [rdi]
|
||||||
|
movd xmm9, rax
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rdi+48]
|
||||||
|
xor rax, QWORD PTR [rdi+16]
|
||||||
|
xor r14, QWORD PTR [rbx+8]
|
||||||
|
xor r15, QWORD PTR [rdi+8]
|
||||||
|
movd xmm8, rax
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
|
||||||
|
mov eax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movd xmm11, rax
|
||||||
|
punpcklqdq xmm11, xmm11
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rdi+80]
|
||||||
|
movd xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [rdi+56]
|
||||||
|
xor rcx, QWORD PTR [rdi+24]
|
||||||
|
punpcklqdq xmm9, xmm0
|
||||||
|
mov QWORD PTR [rsp+8], 16384
|
||||||
|
movd xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [rdi+88]
|
||||||
|
xor rcx, QWORD PTR [rdi+72]
|
||||||
|
xor rax, QWORD PTR [rdi+64]
|
||||||
|
punpcklqdq xmm8, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
movd xmm10, rax
|
||||||
|
mov rax, 4389456576511
|
||||||
|
mov QWORD PTR [rsp+16], rax
|
||||||
|
punpcklqdq xmm10, xmm0
|
||||||
|
|
||||||
|
ALIGN(64)
|
||||||
|
upx2_main_loop:
|
||||||
|
mov rdx, rsi
|
||||||
|
mov r9, rbp
|
||||||
|
and edx, 131056
|
||||||
|
and r9d, 131056
|
||||||
|
movdqu xmm6, XMMWORD PTR [rdx+r12]
|
||||||
|
lea r8, QWORD PTR [rdx+r12]
|
||||||
|
movdqu xmm4, XMMWORD PTR [r9+r13]
|
||||||
|
lea r10, QWORD PTR [r9+r13]
|
||||||
|
mov ecx, edx
|
||||||
|
mov eax, edx
|
||||||
|
xor rax, 32
|
||||||
|
xor rcx, 48
|
||||||
|
xor rdx, 16
|
||||||
|
movd xmm0, r14
|
||||||
|
movd xmm3, rsi
|
||||||
|
movd xmm5, rbp
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movd xmm0, r15
|
||||||
|
movdqu xmm2, XMMWORD PTR [rax+r12]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r12]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
aesenc xmm6, xmm3
|
||||||
|
aesenc xmm4, xmm5
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
movd rdi, xmm4
|
||||||
|
paddq xmm0, XMMWORD PTR [rdx+r12]
|
||||||
|
movdqu XMMWORD PTR [rdx+r12], xmm0
|
||||||
|
xor edx, edx
|
||||||
|
movdqu XMMWORD PTR [rax+r12], xmm1
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
movdqu XMMWORD PTR [rcx+r12], xmm2
|
||||||
|
pxor xmm0, xmm7
|
||||||
|
movdqu XMMWORD PTR [r8], xmm0
|
||||||
|
mov ecx, r9d
|
||||||
|
xor rcx, 48
|
||||||
|
mov eax, r9d
|
||||||
|
xor rax, 32
|
||||||
|
xor r9, 16
|
||||||
|
movdqa xmm0, xmm10
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r13]
|
||||||
|
movdqu xmm2, XMMWORD PTR [rax+r13]
|
||||||
|
paddq xmm1, xmm8
|
||||||
|
paddq xmm0, XMMWORD PTR [r9+r13]
|
||||||
|
paddq xmm2, xmm5
|
||||||
|
movdqu XMMWORD PTR [r9+r13], xmm0
|
||||||
|
movd r9, xmm6
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm1
|
||||||
|
movdqa xmm0, xmm4
|
||||||
|
movdqu XMMWORD PTR [rcx+r13], xmm2
|
||||||
|
pxor xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [r10], xmm0
|
||||||
|
movd rcx, xmm14
|
||||||
|
mov rax, rcx
|
||||||
|
movd r10, xmm13
|
||||||
|
shl rax, 32
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
xor r10, rax
|
||||||
|
psrldq xmm0, 8
|
||||||
|
lea r8, QWORD PTR [rcx+rcx]
|
||||||
|
movd rax, xmm0
|
||||||
|
add r8d, r9d
|
||||||
|
mov ecx, -2147483647
|
||||||
|
or r8, rcx
|
||||||
|
mov r11, r9
|
||||||
|
div r8
|
||||||
|
and r11d, 131056
|
||||||
|
movaps xmm1, xmm12
|
||||||
|
mov eax, eax
|
||||||
|
add r11, r12
|
||||||
|
shl rdx, 32
|
||||||
|
add rdx, rax
|
||||||
|
xor r10, QWORD PTR [r11]
|
||||||
|
mov rbx, QWORD PTR [r11+8]
|
||||||
|
lea r8, QWORD PTR [rdx+r9]
|
||||||
|
movd xmm13, rdx
|
||||||
|
mov rax, r8
|
||||||
|
shr rax, 12
|
||||||
|
movd xmm0, rax
|
||||||
|
paddq xmm0, xmm11
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
mov r13, -4389456576512
|
||||||
|
movd rdx, xmm1
|
||||||
|
mov rax, rdx
|
||||||
|
shr rdx, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
add rax, r13
|
||||||
|
sub rcx, QWORD PTR [rsp+16]
|
||||||
|
mov r13, QWORD PTR [rsp]
|
||||||
|
imul rcx, rax
|
||||||
|
mov rax, r10
|
||||||
|
sub rcx, r8
|
||||||
|
mov rcx, r9
|
||||||
|
adc rdx, 0
|
||||||
|
xor rcx, 32
|
||||||
|
and ecx, 131056
|
||||||
|
movd xmm14, rdx
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r12]
|
||||||
|
mul r9
|
||||||
|
paddq xmm1, xmm3
|
||||||
|
mov r8, rax
|
||||||
|
xor r8, QWORD PTR [rcx+r12+8]
|
||||||
|
add r14, r8
|
||||||
|
movd xmm0, rax
|
||||||
|
movd xmm2, rdx
|
||||||
|
xor rdx, QWORD PTR [rcx+r12]
|
||||||
|
mov rax, r9
|
||||||
|
xor rax, 48
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
and eax, 131056
|
||||||
|
add rsi, rdx
|
||||||
|
xor r9, 16
|
||||||
|
xor edx, edx
|
||||||
|
and r9d, 131056
|
||||||
|
movdqu xmm0, XMMWORD PTR [rax+r12]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
pxor xmm2, XMMWORD PTR [r9+r12]
|
||||||
|
paddq xmm2, xmm9
|
||||||
|
movdqu XMMWORD PTR [r9+r12], xmm2
|
||||||
|
movd r9, xmm15
|
||||||
|
movdqu XMMWORD PTR [rcx+r12], xmm0
|
||||||
|
movdqa xmm0, xmm4
|
||||||
|
mov rcx, QWORD PTR [rsp+32]
|
||||||
|
movdqu XMMWORD PTR [rax+r12], xmm1
|
||||||
|
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
movaps xmm1, xmm12
|
||||||
|
xor r9, rax
|
||||||
|
psrldq xmm0, 8
|
||||||
|
lea r8, QWORD PTR [rcx+rcx]
|
||||||
|
mov QWORD PTR [r11], rsi
|
||||||
|
add r8d, edi
|
||||||
|
mov QWORD PTR [r11+8], r14
|
||||||
|
movd rax, xmm0
|
||||||
|
mov ecx, -2147483647
|
||||||
|
or r8, rcx
|
||||||
|
xor rsi, r10
|
||||||
|
div r8
|
||||||
|
mov r10, rdi
|
||||||
|
xor r14, rbx
|
||||||
|
mov eax, eax
|
||||||
|
and r10d, 131056
|
||||||
|
shl rdx, 32
|
||||||
|
add r10, r13
|
||||||
|
add rdx, rax
|
||||||
|
xor r9, QWORD PTR [r10]
|
||||||
|
mov r11, QWORD PTR [r10+8]
|
||||||
|
lea r8, QWORD PTR [rdx+rdi]
|
||||||
|
mov rax, r8
|
||||||
|
movd xmm15, rdx
|
||||||
|
shr rax, 12
|
||||||
|
movd xmm0, rax
|
||||||
|
paddq xmm0, xmm11
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movd rdx, xmm1
|
||||||
|
mov rax, rdx
|
||||||
|
shr rax, 20
|
||||||
|
shr rdx, 19
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
mov rbx, 4389456576511
|
||||||
|
sub rcx, rbx
|
||||||
|
movdqa xmm9, xmm7
|
||||||
|
mov rbx, -4389456576512
|
||||||
|
movdqa xmm7, xmm6
|
||||||
|
add rax, rbx
|
||||||
|
imul rcx, rax
|
||||||
|
mov rax, r9
|
||||||
|
sub rcx, r8
|
||||||
|
mov rcx, rdi
|
||||||
|
adc rdx, 0
|
||||||
|
xor rcx, 32
|
||||||
|
and ecx, 131056
|
||||||
|
movd xmm0, rdx
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r13]
|
||||||
|
mul rdi
|
||||||
|
movdqa XMMWORD PTR [rsp+32], xmm0
|
||||||
|
paddq xmm1, xmm5
|
||||||
|
mov r8, rax
|
||||||
|
xor r8, QWORD PTR [rcx+r13+8]
|
||||||
|
add r15, r8
|
||||||
|
movd xmm0, rax
|
||||||
|
movd xmm2, rdx
|
||||||
|
xor rdx, QWORD PTR [rcx+r13]
|
||||||
|
mov rax, rdi
|
||||||
|
xor rdi, 16
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
xor rax, 48
|
||||||
|
and edi, 131056
|
||||||
|
and eax, 131056
|
||||||
|
add rbp, rdx
|
||||||
|
pxor xmm2, XMMWORD PTR [rdi+r13]
|
||||||
|
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||||
|
paddq xmm2, xmm10
|
||||||
|
movdqu XMMWORD PTR [rdi+r13], xmm2
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [rcx+r13], xmm0
|
||||||
|
movdqa xmm10, xmm8
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm1
|
||||||
|
movdqa xmm8, xmm4
|
||||||
|
mov QWORD PTR [r10], rbp
|
||||||
|
xor rbp, r9
|
||||||
|
mov QWORD PTR [r10+8], r15
|
||||||
|
xor r15, r11
|
||||||
|
sub QWORD PTR [rsp+8], 1
|
||||||
|
jne upx2_main_loop
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp+28]
|
||||||
|
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+80]
|
||||||
|
lea r11, QWORD PTR [rsp+216]
|
||||||
|
movaps xmm6, XMMWORD PTR [r11-24]
|
||||||
|
movaps xmm7, XMMWORD PTR [r11-40]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-56]
|
||||||
|
movaps xmm9, XMMWORD PTR [r11-72]
|
||||||
|
movaps xmm10, XMMWORD PTR [r11-88]
|
||||||
|
movaps xmm11, XMMWORD PTR [r11-104]
|
||||||
|
movaps xmm12, XMMWORD PTR [r11-120]
|
||||||
|
movaps xmm14, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm15, XMMWORD PTR [rsp+48]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
|
@ -7,6 +7,7 @@
|
||||||
.global cnv2_double_mainloop_sandybridge_asm
|
.global cnv2_double_mainloop_sandybridge_asm
|
||||||
.global cnv2_rwz_mainloop_asm
|
.global cnv2_rwz_mainloop_asm
|
||||||
.global cnv2_rwz_double_mainloop_asm
|
.global cnv2_rwz_double_mainloop_asm
|
||||||
|
.global cnv2_upx_double_mainloop_zen3_asm
|
||||||
|
|
||||||
ALIGN(64)
|
ALIGN(64)
|
||||||
cnv2_mainloop_ivybridge_asm:
|
cnv2_mainloop_ivybridge_asm:
|
||||||
|
@ -43,3 +44,9 @@ cnv2_rwz_double_mainloop_asm:
|
||||||
#include "cn2/cnv2_rwz_double_main_loop.inc"
|
#include "cn2/cnv2_rwz_double_main_loop.inc"
|
||||||
ret 0
|
ret 0
|
||||||
mov eax, 3735929054
|
mov eax, 3735929054
|
||||||
|
|
||||||
|
ALIGN(64)
|
||||||
|
cnv2_upx_double_mainloop_zen3_asm:
|
||||||
|
#include "cn2/cnv2_upx_double_mainloop_zen3.inc"
|
||||||
|
ret 0
|
||||||
|
mov eax, 3735929054
|
||||||
|
|
|
@ -48,5 +48,12 @@ cnv2_rwz_double_mainloop_asm PROC
|
||||||
mov eax, 3735929054
|
mov eax, 3735929054
|
||||||
cnv2_rwz_double_mainloop_asm ENDP
|
cnv2_rwz_double_mainloop_asm ENDP
|
||||||
|
|
||||||
|
ALIGN(64)
|
||||||
|
cnv2_upx_double_mainloop_zen3_asm PROC
|
||||||
|
INCLUDE cn2/cnv2_upx_double_mainloop_zen3.inc
|
||||||
|
ret 0
|
||||||
|
mov eax, 3735929054
|
||||||
|
cnv2_upx_double_mainloop_zen3_asm ENDP
|
||||||
|
|
||||||
_TEXT_CNV2_MAINLOOP ENDS
|
_TEXT_CNV2_MAINLOOP ENDS
|
||||||
END
|
END
|
||||||
|
|
Loading…
Reference in a new issue