Optimized CryptonightR (cn/r) asm code

Average over 100 block heights:
Coffee Lake +0.1%
Ryzen +0.4%
Sandy Bridge +1.5%
This commit is contained in:
SChernykh 2019-03-04 17:39:49 +01:00
parent 4ebfc135e0
commit 9f2d821970
4 changed files with 76 additions and 68 deletions

View file

@ -70,29 +70,30 @@ FN_PREFIX(CryptonightR_template_mainloop):
aesenc xmm5, xmm4 aesenc xmm5, xmm4
mov r12d, r9d mov r13d, r9d
mov eax, r9d mov eax, r9d
xor r9d, 48 xor r9d, 48
xor r12d, 16 xor r13d, 16
xor eax, 32 xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11] movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0 movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11] movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2 pxor xmm0, xmm2
pxor xmm5, xmm1 pxor xmm5, xmm1
pxor xmm5, xmm0 pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5 movq r12, xmm5
movd r10d, xmm5 movd r10d, xmm5
and r10d, 2097136 and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm6 pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0 movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ FN_PREFIX(CryptonightR_template_mainloop):
shl rdx, 32 shl rdx, 32
or r13, rdx or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6 movd eax, xmm6
movd edx, xmm7 movd edx, xmm7
pextrd r9d, xmm7, 2 pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
FN_PREFIX(CryptonightR_template_part2): FN_PREFIX(CryptonightR_template_part2):
lea rcx, [r10+r11]
mov eax, edi mov eax, edi
mov edx, ebp mov edx, ebp
shl rdx, 32 shl rdx, 32
@ -124,6 +127,8 @@ FN_PREFIX(CryptonightR_template_part2):
mov rax, r13 mov rax, r13
mul r12 mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d mov r9d, r10d
mov r12d, r10d mov r12d, r10d
@ -145,13 +150,10 @@ FN_PREFIX(CryptonightR_template_part2):
movdqu XMMWORD PTR [r10+r11], xmm3 movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6 movdqa xmm7, xmm6
add r15, rax mov QWORD PTR [rcx], rsp
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13 xor rsp, r13
mov r9d, esp mov r9d, esp
mov QWORD PTR [r10+r11+8], r15 mov QWORD PTR [rcx+8], r15
and r9d, 2097136 and r9d, 2097136
xor r15, r14 xor r15, r14
movdqa xmm6, xmm5 movdqa xmm6, xmm5

View file

@ -70,29 +70,30 @@ CryptonightR_template_mainloop:
aesenc xmm5, xmm4 aesenc xmm5, xmm4
mov r12d, r9d mov r13d, r9d
mov eax, r9d mov eax, r9d
xor r9d, 48 xor r9d, 48
xor r12d, 16 xor r13d, 16
xor eax, 32 xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11] movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0 movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11] movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2 pxor xmm0, xmm2
pxor xmm5, xmm1 pxor xmm5, xmm1
pxor xmm5, xmm0 pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5 movq r12, xmm5
movd r10d, xmm5 movd r10d, xmm5
and r10d, 2097136 and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm6 pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0 movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ CryptonightR_template_mainloop:
shl rdx, 32 shl rdx, 32
or r13, rdx or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6 movd eax, xmm6
movd edx, xmm7 movd edx, xmm7
pextrd r9d, xmm7, 2 pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
CryptonightR_template_part2: CryptonightR_template_part2:
lea rcx, [r10+r11]
mov eax, edi mov eax, edi
mov edx, ebp mov edx, ebp
shl rdx, 32 shl rdx, 32
@ -124,6 +127,8 @@ CryptonightR_template_part2:
mov rax, r13 mov rax, r13
mul r12 mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d mov r9d, r10d
mov r12d, r10d mov r12d, r10d
@ -145,13 +150,10 @@ CryptonightR_template_part2:
movdqu XMMWORD PTR [r10+r11], xmm3 movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6 movdqa xmm7, xmm6
add r15, rax mov QWORD PTR [rcx], rsp
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13 xor rsp, r13
mov r9d, esp mov r9d, esp
mov QWORD PTR [r10+r11+8], r15 mov QWORD PTR [rcx+8], r15
and r9d, 2097136 and r9d, 2097136
xor r15, r14 xor r15, r14
movdqa xmm6, xmm5 movdqa xmm6, xmm5

View file

@ -70,29 +70,30 @@ FN_PREFIX(CryptonightR_template_mainloop):
aesenc xmm5, xmm4 aesenc xmm5, xmm4
mov r12d, r9d mov r13d, r9d
mov eax, r9d mov eax, r9d
xor r9d, 48 xor r9d, 48
xor r12d, 16 xor r13d, 16
xor eax, 32 xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11] movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0 movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11] movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2 pxor xmm0, xmm2
pxor xmm5, xmm1 pxor xmm5, xmm1
pxor xmm5, xmm0 pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movd r12, xmm5 movd r12, xmm5
movd r10d, xmm5 movd r10d, xmm5
and r10d, 2097136 and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm6 pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0 movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ FN_PREFIX(CryptonightR_template_mainloop):
shl rdx, 32 shl rdx, 32
or r13, rdx or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6 movd eax, xmm6
movd edx, xmm7 movd edx, xmm7
pextrd r9d, xmm7, 2 pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
FN_PREFIX(CryptonightR_template_part2): FN_PREFIX(CryptonightR_template_part2):
lea rcx, [r10+r11]
mov eax, edi mov eax, edi
mov edx, ebp mov edx, ebp
shl rdx, 32 shl rdx, 32
@ -124,6 +127,8 @@ FN_PREFIX(CryptonightR_template_part2):
mov rax, r13 mov rax, r13
mul r12 mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d mov r9d, r10d
mov r12d, r10d mov r12d, r10d
@ -145,13 +150,10 @@ FN_PREFIX(CryptonightR_template_part2):
movdqu XMMWORD PTR [r10+r11], xmm3 movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6 movdqa xmm7, xmm6
add r15, rax mov QWORD PTR [rcx], rsp
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13 xor rsp, r13
mov r9d, esp mov r9d, esp
mov QWORD PTR [r10+r11+8], r15 mov QWORD PTR [rcx+8], r15
and r9d, 2097136 and r9d, 2097136
xor r15, r14 xor r15, r14
movdqa xmm6, xmm5 movdqa xmm6, xmm5

View file

@ -70,29 +70,30 @@ CryptonightR_template_mainloop:
aesenc xmm5, xmm4 aesenc xmm5, xmm4
mov r12d, r9d mov r13d, r9d
mov eax, r9d mov eax, r9d
xor r9d, 48 xor r9d, 48
xor r12d, 16 xor r13d, 16
xor eax, 32 xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11] movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0 movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11] movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2 pxor xmm0, xmm2
pxor xmm5, xmm1 pxor xmm5, xmm1
pxor xmm5, xmm0 pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movd r12, xmm5 movd r12, xmm5
movd r10d, xmm5 movd r10d, xmm5
and r10d, 2097136 and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm6 pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0 movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ CryptonightR_template_mainloop:
shl rdx, 32 shl rdx, 32
or r13, rdx or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6 movd eax, xmm6
movd edx, xmm7 movd edx, xmm7
pextrd r9d, xmm7, 2 pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
CryptonightR_template_part2: CryptonightR_template_part2:
lea rcx, [r10+r11]
mov eax, edi mov eax, edi
mov edx, ebp mov edx, ebp
shl rdx, 32 shl rdx, 32
@ -124,6 +127,8 @@ CryptonightR_template_part2:
mov rax, r13 mov rax, r13
mul r12 mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d mov r9d, r10d
mov r12d, r10d mov r12d, r10d
@ -145,13 +150,10 @@ CryptonightR_template_part2:
movdqu XMMWORD PTR [r10+r11], xmm3 movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6 movdqa xmm7, xmm6
add r15, rax mov QWORD PTR [rcx], rsp
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13 xor rsp, r13
mov r9d, esp mov r9d, esp
mov QWORD PTR [r10+r11+8], r15 mov QWORD PTR [rcx+8], r15
and r9d, 2097136 and r9d, 2097136
xor r15, r14 xor r15, r14
movdqa xmm6, xmm5 movdqa xmm6, xmm5