Optimized cn/r asm code

Average over 100 block heights:
Coffee Lake +0.1%
Ryzen +0.4%
Sandy Bridge +1.5%
This commit is contained in:
SChernykh 2019-03-04 17:39:49 +01:00
parent 4ebfc135e0
commit 9f2d821970
4 changed files with 76 additions and 68 deletions

View file

@ -70,29 +70,30 @@ FN_PREFIX(CryptonightR_template_mainloop):
aesenc xmm5, xmm4
mov r12d, r9d
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ FN_PREFIX(CryptonightR_template_mainloop):
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
FN_PREFIX(CryptonightR_template_part2):
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
@ -124,6 +127,8 @@ FN_PREFIX(CryptonightR_template_part2):
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
@ -145,13 +150,10 @@ FN_PREFIX(CryptonightR_template_part2):
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5

View file

@ -70,29 +70,30 @@ CryptonightR_template_mainloop:
aesenc xmm5, xmm4
mov r12d, r9d
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ CryptonightR_template_mainloop:
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
CryptonightR_template_part2:
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
@ -124,6 +127,8 @@ CryptonightR_template_part2:
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
@ -145,13 +150,10 @@ CryptonightR_template_part2:
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5

View file

@ -70,29 +70,30 @@ FN_PREFIX(CryptonightR_template_mainloop):
aesenc xmm5, xmm4
mov r12d, r9d
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movd r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ FN_PREFIX(CryptonightR_template_mainloop):
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
FN_PREFIX(CryptonightR_template_part2):
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
@ -124,6 +127,8 @@ FN_PREFIX(CryptonightR_template_part2):
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
@ -145,13 +150,10 @@ FN_PREFIX(CryptonightR_template_part2):
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5

View file

@ -70,29 +70,30 @@ CryptonightR_template_mainloop:
aesenc xmm5, xmm4
mov r12d, r9d
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movd r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -102,14 +103,16 @@ CryptonightR_template_mainloop:
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
CryptonightR_template_part2:
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
@ -124,6 +127,8 @@ CryptonightR_template_part2:
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
@ -145,13 +150,10 @@ CryptonightR_template_part2:
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5