From 94fecb5e927c2baeecabee1879fbba5b775f0c45 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Thu, 20 May 2021 21:24:28 +0200
Subject: [PATCH] RandomX ARMv8: optimized dataset read

Break the dependency on readReg2 and readReg3. This should run faster on superscalar and out-of-order CPUs, e.g. Apple M1.
---
 src/crypto/randomx/jit_compiler_a64_static.S | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S
index 37c044c8c..95a5c92c9 100644
--- a/src/crypto/randomx/jit_compiler_a64_static.S
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -304,6 +304,9 @@ literal_v14: .fill 2,8,0
 literal_v15: .fill 2,8,0
 
 DECL(randomx_program_aarch64_vm_instructions_end):
+	# Calculate dataset pointer for dataset read
+	# Do it here, before x9 is modified, to break the dependency on readReg2 and readReg3 (see the eor below)
+	lsr	x10, x9, 32
 
 	# mx ^= r[readReg2] ^ r[readReg3];
 	eor	x9, x9, x18
@@ -321,8 +324,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
 	# mx <-> ma
 	ror	x9, x9, 32
 
-	# Calculate dataset pointer for dataset read
-	mov	w10, w9
 DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
 	and	x10, x10, 1