mirror of
https://github.com/xmrig/xmrig.git
synced 2025-01-08 20:09:52 +00:00
#768 Fixed build error with MSVC 2015 and enabled ASM code.
This commit is contained in:
parent
c2f6c70044
commit
152c4f2f1b
6 changed files with 811 additions and 5 deletions
|
@ -3,13 +3,19 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||||
|
|
||||||
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
||||||
enable_language(ASM_MASM)
|
enable_language(ASM_MASM)
|
||||||
set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop.asm")
|
|
||||||
|
if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
|
||||||
|
set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop.asm")
|
||||||
|
else()
|
||||||
|
set(XMRIG_ASM_FILE "src/crypto/asm/win64/cnv2_main_loop.asm")
|
||||||
|
endif()
|
||||||
|
|
||||||
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
|
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
|
||||||
else()
|
else()
|
||||||
enable_language(ASM)
|
enable_language(ASM)
|
||||||
|
|
||||||
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
|
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
|
||||||
set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop_win.S")
|
set(XMRIG_ASM_FILE "src/crypto/asm/win64/cnv2_main_loop.S")
|
||||||
else()
|
else()
|
||||||
set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop.S")
|
set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop.S")
|
||||||
endif()
|
endif()
|
||||||
|
|
410
src/crypto/asm/win64/cnv2_double_main_loop_sandybridge.inc
Normal file
410
src/crypto/asm/win64/cnv2_double_main_loop_sandybridge.inc
Normal file
|
@ -0,0 +1,410 @@
|
||||||
|
mov rax, rsp
|
||||||
|
push rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 184
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp+272]
|
||||||
|
mov DWORD PTR [rsp+276], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+276]
|
||||||
|
|
||||||
|
mov r13, QWORD PTR [rcx+224]
|
||||||
|
mov r9, rdx
|
||||||
|
mov r10, QWORD PTR [rcx+32]
|
||||||
|
mov r8, rcx
|
||||||
|
xor r10, QWORD PTR [rcx]
|
||||||
|
mov r14d, 524288
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rsi, QWORD PTR [rdx+224]
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
mov rdi, QWORD PTR [r9+32]
|
||||||
|
xor rdi, QWORD PTR [r9]
|
||||||
|
mov rbp, QWORD PTR [r9+40]
|
||||||
|
xor rbp, QWORD PTR [r9+8]
|
||||||
|
movd xmm0, rdx
|
||||||
|
movaps XMMWORD PTR [rax-88], xmm6
|
||||||
|
movaps XMMWORD PTR [rax-104], xmm7
|
||||||
|
movaps XMMWORD PTR [rax-120], xmm8
|
||||||
|
movaps XMMWORD PTR [rsp+112], xmm9
|
||||||
|
movaps XMMWORD PTR [rsp+96], xmm10
|
||||||
|
movaps XMMWORD PTR [rsp+80], xmm11
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm12
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm13
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm14
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm15
|
||||||
|
mov rdx, r10
|
||||||
|
movd xmm4, QWORD PTR [r8+96]
|
||||||
|
and edx, 2097136
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
xorps xmm13, xmm13
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r8+72]
|
||||||
|
movd xmm5, QWORD PTR [r8+104]
|
||||||
|
movd xmm7, rax
|
||||||
|
|
||||||
|
mov eax, 1
|
||||||
|
shl rax, 52
|
||||||
|
movd xmm14, rax
|
||||||
|
punpcklqdq xmm14, xmm14
|
||||||
|
|
||||||
|
mov eax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movd xmm12, rax
|
||||||
|
punpcklqdq xmm12, xmm12
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [r8+80]
|
||||||
|
xor rax, QWORD PTR [r8+64]
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [r9+56]
|
||||||
|
xor rcx, QWORD PTR [r9+24]
|
||||||
|
movd xmm3, rax
|
||||||
|
mov rax, QWORD PTR [r9+48]
|
||||||
|
xor rax, QWORD PTR [r9+16]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
mov QWORD PTR [rsp], r13
|
||||||
|
mov rcx, QWORD PTR [r9+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
movd xmm6, rax
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
punpcklqdq xmm6, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
mov QWORD PTR [rsp+256], r10
|
||||||
|
mov rcx, rdi
|
||||||
|
mov QWORD PTR [rsp+264], r11
|
||||||
|
movd xmm8, rax
|
||||||
|
and ecx, 2097136
|
||||||
|
punpcklqdq xmm8, xmm0
|
||||||
|
movd xmm0, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
movd xmm0, QWORD PTR [r9+104]
|
||||||
|
lea r8, QWORD PTR [rcx+rsi]
|
||||||
|
movdqu xmm11, XMMWORD PTR [r8]
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
lea r9, QWORD PTR [rdx+r13]
|
||||||
|
movdqu xmm15, XMMWORD PTR [r9]
|
||||||
|
|
||||||
|
ALIGN 16
|
||||||
|
main_loop_double_sandybridge:
|
||||||
|
movdqu xmm9, xmm15
|
||||||
|
mov eax, edx
|
||||||
|
mov ebx, edx
|
||||||
|
xor eax, 16
|
||||||
|
xor ebx, 32
|
||||||
|
xor edx, 48
|
||||||
|
|
||||||
|
movd xmm0, r11
|
||||||
|
movd xmm2, r10
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
aesenc xmm9, xmm2
|
||||||
|
|
||||||
|
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
paddq xmm1, xmm2
|
||||||
|
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||||
|
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||||
|
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||||
|
paddq xmm0, xmm3
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||||
|
|
||||||
|
movd r11, xmm9
|
||||||
|
mov edx, r11d
|
||||||
|
and edx, 2097136
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
pxor xmm0, xmm7
|
||||||
|
movdqu XMMWORD PTR [r9], xmm0
|
||||||
|
|
||||||
|
lea rbx, QWORD PTR [rdx+r13]
|
||||||
|
mov r10, QWORD PTR [rdx+r13]
|
||||||
|
|
||||||
|
movdqu xmm10, xmm11
|
||||||
|
movd xmm0, rbp
|
||||||
|
movd xmm11, rdi
|
||||||
|
punpcklqdq xmm11, xmm0
|
||||||
|
aesenc xmm10, xmm11
|
||||||
|
|
||||||
|
mov eax, ecx
|
||||||
|
mov r12d, ecx
|
||||||
|
xor eax, 16
|
||||||
|
xor r12d, 32
|
||||||
|
xor ecx, 48
|
||||||
|
|
||||||
|
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||||
|
paddq xmm0, xmm6
|
||||||
|
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||||
|
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||||
|
paddq xmm1, xmm11
|
||||||
|
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||||
|
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||||
|
|
||||||
|
movd rcx, xmm10
|
||||||
|
and ecx, 2097136
|
||||||
|
|
||||||
|
movdqa xmm0, xmm10
|
||||||
|
pxor xmm0, xmm6
|
||||||
|
movdqu XMMWORD PTR [r8], xmm0
|
||||||
|
mov r12, QWORD PTR [rcx+rsi]
|
||||||
|
|
||||||
|
mov r9, QWORD PTR [rbx+8]
|
||||||
|
|
||||||
|
xor edx, 16
|
||||||
|
mov r8d, edx
|
||||||
|
mov r15d, edx
|
||||||
|
|
||||||
|
movd rdx, xmm5
|
||||||
|
shl rdx, 32
|
||||||
|
movd rax, xmm4
|
||||||
|
xor rdx, rax
|
||||||
|
xor r10, rdx
|
||||||
|
mov rax, r10
|
||||||
|
mul r11
|
||||||
|
mov r11d, r8d
|
||||||
|
xor r11d, 48
|
||||||
|
movd xmm0, rdx
|
||||||
|
xor rdx, [r11+r13]
|
||||||
|
movd xmm1, rax
|
||||||
|
xor rax, [r11+r13+8]
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||||
|
xor r8d, 32
|
||||||
|
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
paddq xmm1, xmm2
|
||||||
|
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||||
|
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||||
|
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||||
|
paddq xmm0, xmm3
|
||||||
|
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||||
|
|
||||||
|
mov r11, QWORD PTR [rsp+256]
|
||||||
|
add r11, rdx
|
||||||
|
mov rdx, QWORD PTR [rsp+264]
|
||||||
|
add rdx, rax
|
||||||
|
mov QWORD PTR [rbx], r11
|
||||||
|
xor r11, r10
|
||||||
|
mov QWORD PTR [rbx+8], rdx
|
||||||
|
xor rdx, r9
|
||||||
|
mov QWORD PTR [rsp+256], r11
|
||||||
|
and r11d, 2097136
|
||||||
|
mov QWORD PTR [rsp+264], rdx
|
||||||
|
mov QWORD PTR [rsp+8], r11
|
||||||
|
lea r15, QWORD PTR [r11+r13]
|
||||||
|
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||||
|
lea r13, QWORD PTR [rsi+rcx]
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movaps xmm2, xmm13
|
||||||
|
movd r10, xmm0
|
||||||
|
psllq xmm5, 1
|
||||||
|
shl r10, 32
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movdqa xmm1, xmm10
|
||||||
|
movd r11, xmm0
|
||||||
|
psrldq xmm1, 8
|
||||||
|
movd r8, xmm1
|
||||||
|
psrldq xmm4, 8
|
||||||
|
movaps xmm0, xmm13
|
||||||
|
movd rax, xmm4
|
||||||
|
xor r10, rax
|
||||||
|
movaps xmm1, xmm13
|
||||||
|
xor r10, r12
|
||||||
|
lea rax, QWORD PTR [r11+1]
|
||||||
|
shr rax, 1
|
||||||
|
movdqa xmm3, xmm9
|
||||||
|
punpcklqdq xmm3, xmm10
|
||||||
|
paddq xmm5, xmm3
|
||||||
|
movd rdx, xmm5
|
||||||
|
psrldq xmm5, 8
|
||||||
|
cvtsi2sd xmm2, rax
|
||||||
|
or edx, -2147483647
|
||||||
|
lea rax, QWORD PTR [r8+1]
|
||||||
|
shr rax, 1
|
||||||
|
movd r9, xmm5
|
||||||
|
cvtsi2sd xmm0, rax
|
||||||
|
or r9d, -2147483647
|
||||||
|
cvtsi2sd xmm1, rdx
|
||||||
|
unpcklpd xmm2, xmm0
|
||||||
|
movaps xmm0, xmm13
|
||||||
|
cvtsi2sd xmm0, r9
|
||||||
|
unpcklpd xmm1, xmm0
|
||||||
|
divpd xmm2, xmm1
|
||||||
|
paddq xmm2, xmm14
|
||||||
|
cvttsd2si rax, xmm2
|
||||||
|
psrldq xmm2, 8
|
||||||
|
mov rbx, rax
|
||||||
|
imul rax, rdx
|
||||||
|
sub r11, rax
|
||||||
|
js div_fix_1_sandybridge
|
||||||
|
div_fix_1_ret_sandybridge:
|
||||||
|
|
||||||
|
cvttsd2si rdx, xmm2
|
||||||
|
mov rax, rdx
|
||||||
|
imul rax, r9
|
||||||
|
movd xmm2, r11d
|
||||||
|
movd xmm4, ebx
|
||||||
|
sub r8, rax
|
||||||
|
js div_fix_2_sandybridge
|
||||||
|
div_fix_2_ret_sandybridge:
|
||||||
|
|
||||||
|
movd xmm1, r8d
|
||||||
|
movd xmm0, edx
|
||||||
|
punpckldq xmm2, xmm1
|
||||||
|
punpckldq xmm4, xmm0
|
||||||
|
punpckldq xmm4, xmm2
|
||||||
|
paddq xmm3, xmm4
|
||||||
|
movdqa xmm0, xmm3
|
||||||
|
psrlq xmm0, 12
|
||||||
|
paddq xmm0, xmm12
|
||||||
|
sqrtpd xmm1, xmm0
|
||||||
|
movd r9, xmm1
|
||||||
|
movdqa xmm5, xmm1
|
||||||
|
psrlq xmm5, 19
|
||||||
|
test r9, 524287
|
||||||
|
je sqrt_fix_1_sandybridge
|
||||||
|
sqrt_fix_1_ret_sandybridge:
|
||||||
|
|
||||||
|
movd r9, xmm10
|
||||||
|
psrldq xmm1, 8
|
||||||
|
movd r8, xmm1
|
||||||
|
test r8, 524287
|
||||||
|
je sqrt_fix_2_sandybridge
|
||||||
|
sqrt_fix_2_ret_sandybridge:
|
||||||
|
|
||||||
|
mov r12d, ecx
|
||||||
|
mov r8d, ecx
|
||||||
|
xor r12d, 16
|
||||||
|
xor r8d, 32
|
||||||
|
xor ecx, 48
|
||||||
|
mov rax, r10
|
||||||
|
mul r9
|
||||||
|
movd xmm0, rax
|
||||||
|
movd xmm3, rdx
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
|
||||||
|
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||||
|
xor rdx, [r8+rsi]
|
||||||
|
xor rax, [r8+rsi+8]
|
||||||
|
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||||
|
paddq xmm0, xmm6
|
||||||
|
paddq xmm1, xmm11
|
||||||
|
paddq xmm3, xmm8
|
||||||
|
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||||
|
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||||
|
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||||
|
|
||||||
|
add rdi, rdx
|
||||||
|
mov QWORD PTR [r13], rdi
|
||||||
|
xor rdi, r10
|
||||||
|
mov ecx, edi
|
||||||
|
and ecx, 2097136
|
||||||
|
lea r8, QWORD PTR [rcx+rsi]
|
||||||
|
|
||||||
|
mov rdx, QWORD PTR [r13+8]
|
||||||
|
add rbp, rax
|
||||||
|
mov QWORD PTR [r13+8], rbp
|
||||||
|
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||||
|
xor rbp, rdx
|
||||||
|
mov r13, QWORD PTR [rsp]
|
||||||
|
movdqa xmm3, xmm7
|
||||||
|
mov rdx, QWORD PTR [rsp+8]
|
||||||
|
movdqa xmm8, xmm6
|
||||||
|
mov r10, QWORD PTR [rsp+256]
|
||||||
|
movdqa xmm7, xmm9
|
||||||
|
mov r11, QWORD PTR [rsp+264]
|
||||||
|
movdqa xmm6, xmm10
|
||||||
|
mov r9, r15
|
||||||
|
dec r14d
|
||||||
|
jne main_loop_double_sandybridge
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp+272]
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||||
|
lea r11, QWORD PTR [rsp+184]
|
||||||
|
movaps xmm6, XMMWORD PTR [r11-24]
|
||||||
|
movaps xmm7, XMMWORD PTR [r11-40]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-56]
|
||||||
|
movaps xmm9, XMMWORD PTR [r11-72]
|
||||||
|
movaps xmm10, XMMWORD PTR [r11-88]
|
||||||
|
movaps xmm11, XMMWORD PTR [r11-104]
|
||||||
|
movaps xmm12, XMMWORD PTR [r11-120]
|
||||||
|
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||||
|
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
||||||
|
jmp cnv2_double_mainloop_asm_sandybridge_endp
|
||||||
|
|
||||||
|
div_fix_1_sandybridge:
|
||||||
|
dec rbx
|
||||||
|
add r11, rdx
|
||||||
|
jmp div_fix_1_ret_sandybridge
|
||||||
|
|
||||||
|
div_fix_2_sandybridge:
|
||||||
|
dec rdx
|
||||||
|
add r8, r9
|
||||||
|
jmp div_fix_2_ret_sandybridge
|
||||||
|
|
||||||
|
sqrt_fix_1_sandybridge:
|
||||||
|
movd r8, xmm3
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
dec r9
|
||||||
|
mov r11d, -1022
|
||||||
|
shl r11, 32
|
||||||
|
mov rax, r9
|
||||||
|
shr r9, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rdx, r9
|
||||||
|
sub rdx, rax
|
||||||
|
lea rdx, [rdx+r11+1]
|
||||||
|
add rax, r11
|
||||||
|
imul rdx, rax
|
||||||
|
sub rdx, r8
|
||||||
|
adc r9, 0
|
||||||
|
movd xmm5, r9
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
jmp sqrt_fix_1_ret_sandybridge
|
||||||
|
|
||||||
|
sqrt_fix_2_sandybridge:
|
||||||
|
psrldq xmm3, 8
|
||||||
|
movd r11, xmm3
|
||||||
|
dec r8
|
||||||
|
mov ebx, -1022
|
||||||
|
shl rbx, 32
|
||||||
|
mov rax, r8
|
||||||
|
shr r8, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rdx, r8
|
||||||
|
sub rdx, rax
|
||||||
|
lea rdx, [rdx+rbx+1]
|
||||||
|
add rax, rbx
|
||||||
|
imul rdx, rax
|
||||||
|
sub rdx, r11
|
||||||
|
adc r8, 0
|
||||||
|
movd xmm0, r8
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
jmp sqrt_fix_2_ret_sandybridge
|
||||||
|
|
||||||
|
cnv2_double_mainloop_asm_sandybridge_endp:
|
|
@ -7,15 +7,15 @@
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_mainloop_ivybridge_asm:
|
cnv2_mainloop_ivybridge_asm:
|
||||||
#include "cnv2_main_loop_ivybridge.inc"
|
#include "../cnv2_main_loop_ivybridge.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_mainloop_ryzen_asm:
|
cnv2_mainloop_ryzen_asm:
|
||||||
#include "cnv2_main_loop_ryzen.inc"
|
#include "../cnv2_main_loop_ryzen.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_double_mainloop_sandybridge_asm:
|
cnv2_double_mainloop_sandybridge_asm:
|
||||||
#include "cnv2_double_main_loop_sandybridge.inc"
|
#include "../cnv2_double_main_loop_sandybridge.inc"
|
||||||
ret 0
|
ret 0
|
25
src/crypto/asm/win64/cnv2_main_loop.asm
Normal file
25
src/crypto/asm/win64/cnv2_main_loop.asm
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||||
|
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||||
|
PUBLIC cnv2_mainloop_ryzen_asm
|
||||||
|
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
cnv2_mainloop_ivybridge_asm PROC
|
||||||
|
INCLUDE cnv2_main_loop_ivybridge.inc
|
||||||
|
ret 0
|
||||||
|
cnv2_mainloop_ivybridge_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
cnv2_mainloop_ryzen_asm PROC
|
||||||
|
INCLUDE cnv2_main_loop_ryzen.inc
|
||||||
|
ret 0
|
||||||
|
cnv2_mainloop_ryzen_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
cnv2_double_mainloop_sandybridge_asm PROC
|
||||||
|
INCLUDE cnv2_double_main_loop_sandybridge.inc
|
||||||
|
ret 0
|
||||||
|
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||||
|
|
||||||
|
_TEXT_CNV2_MAINLOOP ENDS
|
||||||
|
END
|
186
src/crypto/asm/win64/cnv2_main_loop_ivybridge.inc
Normal file
186
src/crypto/asm/win64/cnv2_main_loop_ivybridge.inc
Normal file
|
@ -0,0 +1,186 @@
|
||||||
|
mov QWORD PTR [rsp+24], rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 80
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov esi, 524288
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
mov r13d, -2147483647
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movd xmm4, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movd xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
movd xmm3, QWORD PTR [r9+104]
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm6
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm8
|
||||||
|
and r10d, 2097136
|
||||||
|
movd xmm5, rax
|
||||||
|
|
||||||
|
xor eax, eax
|
||||||
|
mov QWORD PTR [rsp+16], rax
|
||||||
|
|
||||||
|
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movd xmm8, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||||
|
|
||||||
|
ALIGN 16
|
||||||
|
main_loop_ivybridge:
|
||||||
|
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
mov rdi, r15
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movd xmm0, r11
|
||||||
|
movd xmm7, r8
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
aesenc xmm6, xmm7
|
||||||
|
movd rbp, xmm6
|
||||||
|
mov r9, rbp
|
||||||
|
and r9d, 2097136
|
||||||
|
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
mov r10, r9
|
||||||
|
xor r10d, 32
|
||||||
|
movd rcx, xmm3
|
||||||
|
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
xor rdi, rax
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
pxor xmm0, xmm4
|
||||||
|
movdqu XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rdi, QWORD PTR [r9+rbx]
|
||||||
|
lea r14, QWORD PTR [r9+rbx]
|
||||||
|
mov r12, QWORD PTR [r14+8]
|
||||||
|
xor edx, edx
|
||||||
|
lea r9d, DWORD PTR [ecx+ecx]
|
||||||
|
add r9d, ebp
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
psrldq xmm0, 8
|
||||||
|
or r9d, r13d
|
||||||
|
movd rax, xmm0
|
||||||
|
div r9
|
||||||
|
xorps xmm3, xmm3
|
||||||
|
mov eax, eax
|
||||||
|
shl rdx, 32
|
||||||
|
add rdx, rax
|
||||||
|
lea r9, QWORD PTR [rdx+rbp]
|
||||||
|
mov r15, rdx
|
||||||
|
mov rax, r9
|
||||||
|
shr rax, 12
|
||||||
|
movd xmm0, rax
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
sqrtsd xmm3, xmm0
|
||||||
|
psubq xmm3, XMMWORD PTR [rsp+16]
|
||||||
|
movd rdx, xmm3
|
||||||
|
test edx, 524287
|
||||||
|
je sqrt_fixup_ivybridge
|
||||||
|
psrlq xmm3, 19
|
||||||
|
sqrt_fixup_ivybridge_ret:
|
||||||
|
|
||||||
|
mov ecx, r10d
|
||||||
|
mov rax, rdi
|
||||||
|
mul rbp
|
||||||
|
movd xmm2, rdx
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
add r8, rdx
|
||||||
|
mov QWORD PTR [r14], r8
|
||||||
|
xor r8, rdi
|
||||||
|
mov edi, r8d
|
||||||
|
and edi, 2097136
|
||||||
|
movd xmm0, rax
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r14+8], r11
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
|
||||||
|
mov r9d, r10d
|
||||||
|
xor r9d, 48
|
||||||
|
xor r10d, 16
|
||||||
|
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movdqa xmm5, xmm4
|
||||||
|
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||||
|
movdqa xmm4, xmm6
|
||||||
|
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
movdqu xmm6, [rdi+rbx]
|
||||||
|
mov r10d, edi
|
||||||
|
xor r11, r12
|
||||||
|
dec rsi
|
||||||
|
jne main_loop_ivybridge
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp]
|
||||||
|
mov rbx, QWORD PTR [rsp+160]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||||
|
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||||
|
add rsp, 80
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
jmp cnv2_main_loop_ivybridge_endp
|
||||||
|
|
||||||
|
sqrt_fixup_ivybridge:
|
||||||
|
dec rdx
|
||||||
|
mov r13d, -1022
|
||||||
|
shl r13, 32
|
||||||
|
mov rax, rdx
|
||||||
|
shr rdx, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
add rax, r13
|
||||||
|
not r13
|
||||||
|
sub rcx, r13
|
||||||
|
mov r13d, -2147483647
|
||||||
|
imul rcx, rax
|
||||||
|
sub rcx, r9
|
||||||
|
adc rdx, 0
|
||||||
|
movd xmm3, rdx
|
||||||
|
jmp sqrt_fixup_ivybridge_ret
|
||||||
|
|
||||||
|
cnv2_main_loop_ivybridge_endp:
|
179
src/crypto/asm/win64/cnv2_main_loop_ryzen.inc
Normal file
179
src/crypto/asm/win64/cnv2_main_loop_ryzen.inc
Normal file
|
@ -0,0 +1,179 @@
|
||||||
|
mov QWORD PTR [rsp+16], rbx
|
||||||
|
mov QWORD PTR [rsp+24], rbp
|
||||||
|
mov QWORD PTR [rsp+32], rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 64
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov ebp, 524288
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movd xmm3, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movd xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
mov rdi, QWORD PTR [r9+104]
|
||||||
|
and r10d, 2097136
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm6
|
||||||
|
movd xmm4, rax
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm8
|
||||||
|
xorps xmm8, xmm8
|
||||||
|
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movd xmm7, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
|
||||||
|
ALIGN 16
|
||||||
|
main_loop_ryzen:
|
||||||
|
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||||
|
movd xmm0, r11
|
||||||
|
movd xmm6, r8
|
||||||
|
punpcklqdq xmm6, xmm0
|
||||||
|
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
lea r9, QWORD PTR [rdi+rdi]
|
||||||
|
shl rdi, 32
|
||||||
|
|
||||||
|
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
aesenc xmm5, xmm6
|
||||||
|
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
paddq xmm0, xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
movaps xmm1, xmm8
|
||||||
|
mov rsi, r15
|
||||||
|
xor rsi, rdi
|
||||||
|
movd r14, xmm5
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
mov r10, r14
|
||||||
|
and r10d, 2097136
|
||||||
|
movdqa XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rsi, QWORD PTR [r10+rbx]
|
||||||
|
lea r12, QWORD PTR [r10+rbx]
|
||||||
|
mov r13, QWORD PTR [r10+rbx+8]
|
||||||
|
|
||||||
|
add r9d, r14d
|
||||||
|
or r9d, -2147483647
|
||||||
|
xor edx, edx
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movd rax, xmm0
|
||||||
|
|
||||||
|
div r9
|
||||||
|
movd xmm0, rax
|
||||||
|
movd xmm1, rdx
|
||||||
|
punpckldq xmm0, xmm1
|
||||||
|
movd r15, xmm0
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqa xmm2, xmm0
|
||||||
|
psrlq xmm0, 12
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movd rdi, xmm1
|
||||||
|
test rdi, 524287
|
||||||
|
je sqrt_fixup_ryzen
|
||||||
|
shr rdi, 19
|
||||||
|
|
||||||
|
sqrt_fixup_ryzen_ret:
|
||||||
|
mov rax, rsi
|
||||||
|
mul r14
|
||||||
|
movd xmm1, rax
|
||||||
|
movd xmm0, rdx
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
mov r9d, r10d
|
||||||
|
mov ecx, r10d
|
||||||
|
xor r9d, 16
|
||||||
|
xor ecx, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
pxor xmm2, xmm0
|
||||||
|
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
movdqa xmm4, xmm3
|
||||||
|
add r8, rdx
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r12], r8
|
||||||
|
xor r8, rsi
|
||||||
|
mov QWORD PTR [r12+8], r11
|
||||||
|
mov r10, r8
|
||||||
|
xor r11, r13
|
||||||
|
and r10d, 2097136
|
||||||
|
movdqa xmm3, xmm5
|
||||||
|
dec ebp
|
||||||
|
jne main_loop_ryzen
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||||
|
lea r11, QWORD PTR [rsp+64]
|
||||||
|
mov rbx, QWORD PTR [r11+56]
|
||||||
|
mov rbp, QWORD PTR [r11+64]
|
||||||
|
mov rsi, QWORD PTR [r11+72]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-48]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
jmp cnv2_main_loop_ryzen_endp
|
||||||
|
|
||||||
|
sqrt_fixup_ryzen:
|
||||||
|
movd r9, xmm2
|
||||||
|
dec rdi
|
||||||
|
mov edx, -1022
|
||||||
|
shl rdx, 32
|
||||||
|
mov rax, rdi
|
||||||
|
shr rdi, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdi
|
||||||
|
sub rcx, rax
|
||||||
|
lea rcx, [rcx+rdx+1]
|
||||||
|
add rax, rdx
|
||||||
|
imul rcx, rax
|
||||||
|
sub rcx, r9
|
||||||
|
adc rdi, 0
|
||||||
|
jmp sqrt_fixup_ryzen_ret
|
||||||
|
|
||||||
|
cnv2_main_loop_ryzen_endp:
|
Loading…
Reference in a new issue