RandomX ARMv8: optimized dataset read

Break the dependency on readReg2 and readReg3. This should run faster on superscalar and out-of-order CPUs, e.g. Apple M1.
2025-01-10 21:04:37 +00:00 · 2021-05-20 21:24:28 +02:00 · 2021-05-20 21:24:28 +02:00 · 94fecb5e92
commit 94fecb5e92
parent 3bfa5ea038
1 changed file with 3 additions and 2 deletions
--- a/src/crypto/randomx/jit_compiler_a64_static.S
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -304,6 +304,9 @@ literal_v14: .fill 2,8,0
 literal_v15: .fill 2,8,0

 DECL(randomx_program_aarch64_vm_instructions_end):
+	# Calculate dataset pointer for dataset read
+	# Do it here to break false dependency from readReg2 and readReg3 (see next line)
+	lsr	x10, x9, 32

 	# mx ^= r[readReg2] ^ r[readReg3];
 	eor	x9, x9, x18
@@ -321,8 +324,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
 	# mx <-> ma
 	ror	x9, x9, 32

-	# Calculate dataset pointer for dataset read
-	mov	w10, w9
 DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
 	and	x10, x10, 1