Another AVX2 dataset init speedup (3.8% faster on Zen 3)

This commit is contained in:
SChernykh 2020-12-19 19:46:31 +01:00
parent 633aaccd9c
commit 5efd00abec
4 changed files with 8 additions and 10 deletions

View file

@@ -1,4 +1,4 @@
add rsp, 32
add rsp, 40
pop r9
movdqu xmm0, xmmword ptr [rsp]

View file

@@ -34,5 +34,5 @@
add rbp, 5
add rsi, 320
cmp rbp, qword ptr [rsp+32]
cmp rbp, qword ptr [rsp+40]
db 15, 130, 0, 0, 0, 0 ;# jb rel32

View file

@@ -212,7 +212,7 @@ DECL(randomx_dataset_init_avx2_prologue):
mov rbp, rdx ;# block index
push rcx ;# max. block index
#endif
sub rsp, 32
sub rsp, 40
jmp randomx_dataset_init_avx2_prologue_loop_begin
#include "asm/program_sshash_avx2_constants.inc"
@@ -240,9 +240,8 @@ randomx_dataset_init_avx2_prologue_loop_begin:
xor r15, r8
;# init AVX registers (lanes 1-4)
vpxor ymm0, ymm0, ymm0
movq xmm0, rbp
vpbroadcastq ymm0, xmm0
mov qword ptr [rsp+32], rbp
vbroadcastsd ymm0, qword ptr [rsp+32]
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip]
;# ymm0 *= r0_avx2_mul

View file

@@ -195,7 +195,7 @@ randomx_dataset_init_avx2_prologue PROC
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
sub rsp, 32
sub rsp, 40
jmp loop_begin
include asm/program_sshash_avx2_constants.inc
@@ -223,9 +223,8 @@ loop_begin:
xor r15, r8
;# init AVX registers (lanes 1-4)
vpxor ymm0, ymm0, ymm0
movq xmm0, rbp
vpbroadcastq ymm0, xmm0
mov qword ptr [rsp+32], rbp
vbroadcastsd ymm0, qword ptr [rsp+32]
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments]
;# ymm0 *= r0_avx2_mul