From 9f2d82197082be92c986320690612056e8c8111e Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 4 Mar 2019 17:39:49 +0100 Subject: [PATCH] Optimized cn/r asm code Average over 100 block heights: Coffee Lake +0.1% Ryzen +0.4% Sandy Bridge +1.5% --- src/crypto/asm/CryptonightR_template.inc | 36 ++++++++++--------- src/crypto/asm/CryptonightR_template_win.inc | 36 ++++++++++--------- .../asm/win64/CryptonightR_template.inc | 36 ++++++++++--------- .../asm/win64/CryptonightR_template_win.inc | 36 ++++++++++--------- 4 files changed, 76 insertions(+), 68 deletions(-) diff --git a/src/crypto/asm/CryptonightR_template.inc b/src/crypto/asm/CryptonightR_template.inc index b54486a57..8ecab7247 100644 --- a/src/crypto/asm/CryptonightR_template.inc +++ b/src/crypto/asm/CryptonightR_template.inc @@ -70,29 +70,30 @@ FN_PREFIX(CryptonightR_template_mainloop): aesenc xmm5, xmm4 - mov r12d, r9d + mov r13d, r9d mov eax, r9d xor r9d, 48 - xor r12d, 16 + xor r13d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] movaps xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm2, XMMWORD PTR [r13+r11] movdqu xmm1, XMMWORD PTR [rax+r11] pxor xmm0, xmm2 pxor xmm5, xmm1 pxor xmm5, xmm0 - paddq xmm3, xmm7 - paddq xmm2, xmm6 - paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [rax+r11], xmm2 - movdqu XMMWORD PTR [r9+r11], xmm1 movq r12, xmm5 movd r10d, xmm5 and r10d, 2097136 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r13+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -102,14 +103,16 @@ FN_PREFIX(CryptonightR_template_mainloop): shl rdx, 32 or r13, rdx - xor r13, QWORD PTR [r10+r11] - mov r14, QWORD PTR [r10+r11+8] - movd eax, xmm6 movd edx, xmm7 pextrd r9d, xmm7, 2 + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + FN_PREFIX(CryptonightR_template_part2): + lea rcx, [r10+r11] + mov eax, edi mov edx, ebp shl rdx, 32 @@ -124,6 +127,8 @@ FN_PREFIX(CryptonightR_template_part2): mov rax, r13 mul r12 + add r15, rax + add rsp, rdx mov r9d, r10d mov r12d, r10d @@ -145,13 +150,10 @@ FN_PREFIX(CryptonightR_template_part2): movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 - add r15, rax - add rsp, rdx - xor r10, 48 - mov QWORD PTR [r10+r11], rsp + mov QWORD PTR [rcx], rsp xor rsp, r13 mov r9d, esp - mov QWORD PTR [r10+r11+8], r15 + mov QWORD PTR [rcx+8], r15 and r9d, 2097136 xor r15, r14 movdqa xmm6, xmm5 diff --git a/src/crypto/asm/CryptonightR_template_win.inc b/src/crypto/asm/CryptonightR_template_win.inc index 150bb0e35..a170f2d2b 100644 --- a/src/crypto/asm/CryptonightR_template_win.inc +++ b/src/crypto/asm/CryptonightR_template_win.inc @@ -70,29 +70,30 @@ CryptonightR_template_mainloop: aesenc xmm5, xmm4 - mov r12d, r9d + mov r13d, r9d mov eax, r9d xor r9d, 48 - xor r12d, 16 + xor r13d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] movaps xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm2, XMMWORD PTR [r13+r11] movdqu xmm1, XMMWORD PTR [rax+r11] pxor xmm0, xmm2 pxor xmm5, xmm1 pxor xmm5, xmm0 - paddq xmm3, xmm7 - paddq xmm2, xmm6 - paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [rax+r11], xmm2 - movdqu XMMWORD PTR [r9+r11], xmm1 movq r12, xmm5 movd r10d, xmm5 and r10d, 2097136 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r13+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -102,14 +103,16 @@ CryptonightR_template_mainloop: shl rdx, 32 or r13, rdx - xor r13, QWORD PTR [r10+r11] - mov r14, QWORD PTR [r10+r11+8] - movd eax, xmm6 movd edx, xmm7 pextrd r9d, xmm7, 2 + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + CryptonightR_template_part2: + lea rcx, [r10+r11] + mov eax, edi mov edx, ebp shl rdx, 32 @@ -124,6 +127,8 @@ CryptonightR_template_part2: mov rax, r13 mul r12 + add r15, rax + add rsp, rdx mov r9d, r10d mov r12d, r10d @@ -145,13 +150,10 @@ CryptonightR_template_part2: movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 - add r15, rax - add rsp, rdx - xor r10, 48 - mov QWORD PTR [r10+r11], rsp + mov QWORD PTR [rcx], rsp xor rsp, r13 mov r9d, esp - mov QWORD PTR [r10+r11+8], r15 + mov QWORD PTR [rcx+8], r15 and r9d, 2097136 xor r15, r14 movdqa xmm6, xmm5 diff --git a/src/crypto/asm/win64/CryptonightR_template.inc b/src/crypto/asm/win64/CryptonightR_template.inc index 1dae434a2..97ba5347b 100644 --- a/src/crypto/asm/win64/CryptonightR_template.inc +++ b/src/crypto/asm/win64/CryptonightR_template.inc @@ -70,29 +70,30 @@ FN_PREFIX(CryptonightR_template_mainloop): aesenc xmm5, xmm4 - mov r12d, r9d + mov r13d, r9d mov eax, r9d xor r9d, 48 - xor r12d, 16 + xor r13d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] movaps xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm2, XMMWORD PTR [r13+r11] movdqu xmm1, XMMWORD PTR [rax+r11] pxor xmm0, xmm2 pxor xmm5, xmm1 pxor xmm5, xmm0 - paddq xmm3, xmm7 - paddq xmm2, xmm6 - paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [rax+r11], xmm2 - movdqu XMMWORD PTR [r9+r11], xmm1 movd r12, xmm5 movd r10d, xmm5 and r10d, 2097136 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r13+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -102,14 +103,16 @@ FN_PREFIX(CryptonightR_template_mainloop): shl rdx, 32 or r13, rdx - xor r13, QWORD PTR [r10+r11] - mov r14, QWORD PTR [r10+r11+8] - movd eax, xmm6 movd edx, xmm7 pextrd r9d, xmm7, 2 + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + FN_PREFIX(CryptonightR_template_part2): + lea rcx, [r10+r11] + mov eax, edi mov edx, ebp shl rdx, 32 @@ -124,6 +127,8 @@ FN_PREFIX(CryptonightR_template_part2): mov rax, r13 mul r12 + add r15, rax + add rsp, rdx mov r9d, r10d mov r12d, r10d @@ -145,13 +150,10 @@ FN_PREFIX(CryptonightR_template_part2): movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 - add r15, rax - add rsp, rdx - xor r10, 48 - mov QWORD PTR [r10+r11], rsp + mov QWORD PTR [rcx], rsp xor rsp, r13 mov r9d, esp - mov QWORD PTR [r10+r11+8], r15 + mov QWORD PTR [rcx+8], r15 and r9d, 2097136 xor r15, r14 movdqa xmm6, xmm5 diff --git a/src/crypto/asm/win64/CryptonightR_template_win.inc b/src/crypto/asm/win64/CryptonightR_template_win.inc index 2f2d71a25..60ee3441b 100644 --- a/src/crypto/asm/win64/CryptonightR_template_win.inc +++ b/src/crypto/asm/win64/CryptonightR_template_win.inc @@ -70,29 +70,30 @@ CryptonightR_template_mainloop: aesenc xmm5, xmm4 - mov r12d, r9d + mov r13d, r9d mov eax, r9d xor r9d, 48 - xor r12d, 16 + xor r13d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] movaps xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm2, XMMWORD PTR [r13+r11] movdqu xmm1, XMMWORD PTR [rax+r11] pxor xmm0, xmm2 pxor xmm5, xmm1 pxor xmm5, xmm0 - paddq xmm3, xmm7 - paddq xmm2, xmm6 - paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [rax+r11], xmm2 - movdqu XMMWORD PTR [r9+r11], xmm1 movd r12, xmm5 movd r10d, xmm5 and r10d, 2097136 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r13+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -102,14 +103,16 @@ CryptonightR_template_mainloop: shl rdx, 32 or r13, rdx - xor r13, QWORD PTR [r10+r11] - mov r14, QWORD PTR [r10+r11+8] - movd eax, xmm6 movd edx, xmm7 pextrd r9d, xmm7, 2 + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + CryptonightR_template_part2: + lea rcx, [r10+r11] + mov eax, edi mov edx, ebp shl rdx, 32 @@ -124,6 +127,8 @@ CryptonightR_template_part2: mov rax, r13 mul r12 + add r15, rax + add rsp, rdx mov r9d, r10d mov r12d, r10d @@ -145,13 +150,10 @@ CryptonightR_template_part2: movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 - add r15, rax - add rsp, rdx - xor r10, 48 - mov QWORD PTR [r10+r11], rsp + mov QWORD PTR [rcx], rsp xor rsp, r13 mov r9d, esp - mov QWORD PTR [r10+r11+8], r15 + mov QWORD PTR [rcx+8], r15 and r9d, 2097136 xor r15, r14 movdqa xmm6, xmm5