diff --git a/CMakeLists.txt b/CMakeLists.txt
index 07654d040..850ddef8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,7 +143,7 @@ else()
     endif()
 endif()
 
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
     EXECUTE_PROCESS(COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
     if (OPERATING_SYSTEM MATCHES "Android")
         set(EXTRA_LIBS ${EXTRA_LIBS} log)
diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake
index d0d892b96..27b6e5e64 100644
--- a/cmake/randomx.cmake
+++ b/cmake/randomx.cmake
@@ -51,6 +51,13 @@ if (WITH_RANDOMX)
             )
         # cheat because cmake and ccache hate each other
         set_property(SOURCE src/crypto/randomx/jit_compiler_x86_static.S PROPERTY LANGUAGE C)
+    elseif (XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+        list(APPEND SOURCES_CRYPTO
+             src/crypto/randomx/jit_compiler_a64_static.S
+             src/crypto/randomx/jit_compiler_a64.cpp
+            )
+        # cheat because cmake and ccache hate each other
+        set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
     endif()
 
     if (CMAKE_CXX_COMPILER_ID MATCHES Clang)
diff --git a/src/crypto/randomx/common.hpp b/src/crypto/randomx/common.hpp
index da36f2c5b..48f31bac2 100644
--- a/src/crypto/randomx/common.hpp
+++ b/src/crypto/randomx/common.hpp
@@ -108,7 +108,7 @@ namespace randomx {
 	class JitCompilerX86;
 	using JitCompiler = JitCompilerX86;
 #elif defined(__aarch64__)
-	#define RANDOMX_HAVE_COMPILER 0
+	#define RANDOMX_HAVE_COMPILER 1
 	class JitCompilerA64;
 	using JitCompiler = JitCompilerA64;
 #else
diff --git a/src/crypto/randomx/instructions_portable.cpp b/src/crypto/randomx/instructions_portable.cpp
index b28203a9c..d08ee5870 100644
--- a/src/crypto/randomx/instructions_portable.cpp
+++ b/src/crypto/randomx/instructions_portable.cpp
@@ -82,6 +82,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	#define HAVE_SETROUNDMODE_IMPL
 #endif
 
+#ifndef HAVE_SETROUNDMODE_IMPL
+	static void setRoundMode_(uint32_t mode) {
+		fesetround(mode);
+	}
+#endif
+
 #ifndef HAVE_ROTR64
 	uint64_t rotr64(uint64_t a, unsigned int b) {
 		return (a >> b) | (a << (-b & 63));
@@ -127,12 +133,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef RANDOMX_DEFAULT_FENV
 
-#	ifndef HAVE_SETROUNDMODE_IMPL
-	static void setRoundMode_(uint32_t mode) {
-		fesetround(mode);
-	}
-#	endif
-
 void rx_reset_float_state() {
 	setRoundMode_(FE_TONEAREST);
 	rx_set_double_precision(); //set precision to 53 bits if needed by the platform
diff --git a/src/crypto/randomx/intrin_portable.h b/src/crypto/randomx/intrin_portable.h
index e49160967..346c433ae 100644
--- a/src/crypto/randomx/intrin_portable.h
+++ b/src/crypto/randomx/intrin_portable.h
@@ -376,11 +376,138 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 
 #define RANDOMX_DEFAULT_FENV
 
-void rx_reset_float_state();
+#elif defined(__aarch64__)
 
-void rx_set_rounding_mode(uint32_t mode);
+#include <stdlib.h>
+#include <arm_neon.h>
+#include <arm_acle.h>
 
-#else //end altivec
+typedef uint8x16_t rx_vec_i128;
+typedef float64x2_t rx_vec_f128;
+
+inline void* rx_aligned_alloc(size_t size, size_t align) {
+	void* p;
+	if (posix_memalign(&p, align, size) == 0)
+		return p;
+
+	return 0;
+};
+
+#define rx_aligned_free(a) free(a)
+
+inline void rx_prefetch_nta(void* ptr) {
+	asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
+}
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+	return vld1q_f64((const float64_t*)pd);
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 val) {
+	vst1q_f64((float64_t*)mem_addr, val);
+}
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	float64x2_t temp;
+	temp = vcopyq_laneq_f64(temp, 1, a, 1);
+	a = vcopyq_laneq_f64(a, 1, a, 0);
+	return vcopyq_laneq_f64(a, 0, temp, 1);
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	uint64x2_t temp0 = vdupq_n_u64(x0);
+	uint64x2_t temp1 = vdupq_n_u64(x1);
+	return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return vreinterpretq_f64_u64(vdupq_n_u64(x));
+}
+
+#define rx_add_vec_f128 vaddq_f64
+#define rx_sub_vec_f128 vsubq_f64
+#define rx_mul_vec_f128 vmulq_f64
+#define rx_div_vec_f128 vdivq_f64
+#define rx_sqrt_vec_f128 vsqrtq_f64
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+
+
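+// ARMv8 AESE/AESD xor the round key before SubBytes, while x86 AESENC/AESDEC apply it last,
+// so the hardware instruction is given a zero key and the real round key is xored in afterwards.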
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
+	const uint8x16_t zero = { 0 };
+	return vaesmcq_u8(vaeseq_u8(a, zero)) ^ key;
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
+	const uint8x16_t zero = { 0 };
+	return vaesimcq_u8(vaesdq_u8(a, zero)) ^ key;
+}
+
+#define HAVE_AES
+
+#endif
+
+#define rx_xor_vec_i128 veorq_u8
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 0);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 1);
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 2);
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 3);
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	int32_t data[4];
+	data[0] = _I0;
+	data[1] = _I1;
+	data[2] = _I2;
+	data[3] = _I3;
+	return vreinterpretq_u8_s32(vld1q_s32(data));
+};
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(const rx_vec_i128* mem_addr) {
+	return vld1q_u8((const uint8_t*)mem_addr);
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* mem_addr, rx_vec_i128 val) {
+	vst1q_u8((uint8_t*)mem_addr, val);
+}
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	double lo = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	double hi = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	rx_vec_f128 x;
+	x = vsetq_lane_f64(lo, x, 0);
+	x = vsetq_lane_f64(hi, x, 1);
+	return x;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+#else //portable fallback
 
 #include <cstdint>
 #include <stdexcept>
@@ -487,7 +614,6 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
 	return v;
 }
 
-
 FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
@@ -578,10 +704,6 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 
 #define RANDOMX_DEFAULT_FENV
 
-void rx_reset_float_state();
-
-void rx_set_rounding_mode(uint32_t mode);
-
 #endif
 
 #ifndef HAVE_AES
@@ -598,6 +720,14 @@ FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 }
 #endif
 
+#ifdef RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state();
+
+void rx_set_rounding_mode(uint32_t mode);
+
+#endif
+
 double loadDoublePortable(const void* addr);
 uint64_t mulh(uint64_t, uint64_t);
 int64_t smulh(int64_t, int64_t);
diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp
new file mode 100644
index 000000000..08f84f1ce
--- /dev/null
+++ b/src/crypto/randomx/jit_compiler_a64.cpp
@@ -0,0 +1,1020 @@
+/*
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "crypto/randomx/jit_compiler_a64.hpp"
+#include "crypto/randomx/superscalar.hpp"
+#include "crypto/randomx/program.hpp"
+#include "crypto/randomx/reciprocal.h"
+#include "crypto/randomx/virtual_memory.hpp"
+
+namespace ARMV8A {
+
+constexpr uint32_t B           = 0x14000000;
+constexpr uint32_t EOR         = 0xCA000000;
+constexpr uint32_t EOR32       = 0x4A000000;
+constexpr uint32_t ADD         = 0x8B000000;
+constexpr uint32_t SUB         = 0xCB000000;
+constexpr uint32_t MUL         = 0x9B007C00;
+constexpr uint32_t UMULH       = 0x9BC07C00;
+constexpr uint32_t SMULH       = 0x9B407C00;
+constexpr uint32_t MOVZ        = 0xD2800000;
+constexpr uint32_t MOVN        = 0x92800000;
+constexpr uint32_t MOVK        = 0xF2800000;
+constexpr uint32_t ADD_IMM_LO  = 0x91000000;
+constexpr uint32_t ADD_IMM_HI  = 0x91400000;
+constexpr uint32_t LDR_LITERAL = 0x58000000;
+constexpr uint32_t ROR         = 0x9AC02C00;
+constexpr uint32_t ROR_IMM     = 0x93C00000;
+constexpr uint32_t MOV_REG     = 0xAA0003E0;
+constexpr uint32_t MOV_VREG_EL = 0x6E080400;
+constexpr uint32_t FADD        = 0x4E60D400;
+constexpr uint32_t FSUB        = 0x4EE0D400;
+constexpr uint32_t FEOR        = 0x6E201C00;
+constexpr uint32_t FMUL        = 0x6E60DC00;
+constexpr uint32_t FDIV        = 0x6E60FC00;
+constexpr uint32_t FSQRT       = 0x6EE1F800;
+
+}
+
+namespace randomx {
+
+static const size_t CodeSize = ((uint8_t*)randomx_init_dataset_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
+static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
+static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
+static const size_t ImulRcpLiteralsEnd = ((uint8_t*)randomx_program_aarch64_imul_rcp_literals_end) - ((uint8_t*)randomx_program_aarch64);
+
+static size_t CalcDatasetItemSize()
+{
+	return
+	// Prologue
+	((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch - (uint8_t*)randomx_calc_dataset_item_aarch64) + 
+	// Main loop
+	RandomX_CurrentConfig.CacheAccesses * (
+		// Main loop prologue
+		((uint8_t*)randomx_calc_dataset_item_aarch64_mix - ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch)) + 4 +
+		// Inner main loop (instructions)
+		((RandomX_CurrentConfig.SuperscalarLatency * 3) + 2) * 16 +
+		// Main loop epilogue
+		((uint8_t*)randomx_calc_dataset_item_aarch64_store_result - (uint8_t*)randomx_calc_dataset_item_aarch64_mix) + 4
+	) + 
+	// Epilogue
+	((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result);
+}
+
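+// RandomX integer registers "r0"-"r7" live in x4-x7 and x12-x15 (see the register allocation
+// table in jit_compiler_a64_static.S); IntRegMap translates instr.src/dst indices to them.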
+constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
+
+JitCompilerA64::JitCompilerA64()
+	: code((uint8_t*) allocExecutableMemory(CodeSize + CalcDatasetItemSize()))
+	, literalPos(ImulRcpLiteralsEnd)
+	, num32bitLiterals(0)
+{
+	memset(reg_changed_offset, 0, sizeof(reg_changed_offset));
+	memcpy(code, (void*) randomx_program_aarch64, CodeSize);
+}
+
+JitCompilerA64::~JitCompilerA64()
+{
+	freePagedMemory(code, CodeSize + CalcDatasetItemSize());
+}
+
+void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
+{
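+	// MainLoopBegin + 4 skips the "lsr x18, x10, 32" at randomx_program_aarch64_main_loop;
+	// the two placeholder "and" instructions that follow are patched with the real scratchpad masks.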
+	uint32_t codePos = MainLoopBegin + 4;
+
+	// and w16, w10, ScratchpadL3Mask64
+	emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
+
+	// and w17, w18, ScratchpadL3Mask64
+	emit32(0x121A0000 | 17 | (18 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
+
+	codePos = PrologueSize;
+	literalPos = ImulRcpLiteralsEnd;
+	num32bitLiterals = 0;
+
+	for (uint32_t i = 0; i < RegistersCount; ++i)
+		reg_changed_offset[i] = codePos;
+
+	for (uint32_t i = 0; i < program.getSize(); ++i)
+	{
+		Instruction& instr = program(i);
+		instr.src %= RegistersCount;
+		instr.dst %= RegistersCount;
+		(this->*engine[instr.opcode])(instr, codePos);
+	}
+
+	// Update spMix2
+	// eor w18, config.readReg2, config.readReg3
+	emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
+
+	// Jump back to the main loop
+	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
+	emit32(ARMV8A::B | (offset / 4), code, codePos);
+
+	// and w18, w18, CacheLineAlignMask
+	codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
+	emit32(0x121A0000 | 18 | (18 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
+
+	// and w10, w10, CacheLineAlignMask
+	codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
+	emit32(0x121A0000 | 10 | (10 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
+
+	// Update spMix1
+	// eor x10, config.readReg0, config.readReg1
+	codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
+	emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
+
+#ifdef __GNUC__
+	__builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#endif
+}
+
+void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config, uint32_t datasetOffset)
+{
+	uint32_t codePos = MainLoopBegin + 4;
+
+	// and w16, w10, ScratchpadL3Mask64
+	emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
+
+	// and w17, w18, ScratchpadL3Mask64
+	emit32(0x121A0000 | 17 | (18 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
+
+	codePos = PrologueSize;
+	literalPos = ImulRcpLiteralsEnd;
+	num32bitLiterals = 0;
+
+	for (uint32_t i = 0; i < RegistersCount; ++i)
+		reg_changed_offset[i] = codePos;
+
+	for (uint32_t i = 0; i < program.getSize(); ++i)
+	{
+		Instruction& instr = program(i);
+		instr.src %= RegistersCount;
+		instr.dst %= RegistersCount;
+		(this->*engine[instr.opcode])(instr, codePos);
+	}
+
+	// Update spMix2
+	// eor w18, config.readReg2, config.readReg3
+	emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
+
+	// Jump back to the main loop
+	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos;
+	emit32(ARMV8A::B | (offset / 4), code, codePos);
+
+	// and w2, w9, CacheLineAlignMask
+	codePos = (((uint8_t*)randomx_program_aarch64_light_cacheline_align_mask) - ((uint8_t*)randomx_program_aarch64));
+	emit32(0x121A0000 | 2 | (9 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
+
+	// Update spMix1
+	// eor x10, config.readReg0, config.readReg1
+	codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
+	emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
+
+	// Apply dataset offset
+	codePos = ((uint8_t*)randomx_program_aarch64_light_dataset_offset) - ((uint8_t*)randomx_program_aarch64);
+
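+	// The offset is converted to dataset items (cache lines) and split into two 12-bit
+	// immediates that fill the two "add x2, x2, 0" placeholders in the static code.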
+	datasetOffset /= CacheLineSize;
+	const uint32_t imm_lo = datasetOffset & ((1 << 12) - 1);
+	const uint32_t imm_hi = datasetOffset >> 12;
+
+	emit32(ARMV8A::ADD_IMM_LO | 2 | (2 << 5) | (imm_lo << 10), code, codePos);
+	emit32(ARMV8A::ADD_IMM_HI | 2 | (2 << 5) | (imm_hi << 10), code, codePos);
+
+#ifdef __GNUC__
+	__builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#endif
+}
+
+template<size_t N>
+void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache)
+{
+	uint32_t codePos = CodeSize;
+
+	uint8_t* p1 = (uint8_t*)randomx_calc_dataset_item_aarch64;
+	uint8_t* p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_prefetch;
+	memcpy(code + codePos, p1, p2 - p1);
+	codePos += p2 - p1;
+
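+	// Disable the 32-bit vector-literal path of emitMovImmediate here: the v0-v15 literal
+	// registers are only loaded by the VM program prologue, not by the dataset item code.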
+	num32bitLiterals = 64;
+	constexpr uint32_t tmp_reg = 12;
+
+	for (size_t i = 0; i < RandomX_CurrentConfig.CacheAccesses; ++i)
+	{
+		// and x11, x10, CacheSize / CacheLineSize - 1
+		emit32(0x92400000 | 11 | (10 << 5) | ((RandomX_CurrentConfig.Log2_CacheSize - 1) << 10), code, codePos);
+
+		p1 = ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch) + 4;
+		p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix;
+		memcpy(code + codePos, p1, p2 - p1);
+		codePos += p2 - p1;
+
+		SuperscalarProgram& prog = programs[i];
+		const size_t progSize = prog.getSize();
+
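+		// Reserve 4 bytes for a branch over the literal pool; it is patched below once the pool size is known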
+		uint32_t jmp_pos = codePos;
+		codePos += 4;
+
+		// Fill in literal pool
+		for (size_t j = 0; j < progSize; ++j)
+		{
+			const Instruction& instr = prog(j);
+			if (static_cast<SuperscalarInstructionType>(instr.opcode) == randomx::SuperscalarInstructionType::IMUL_RCP)
+				emit64(reciprocalCache[instr.getImm32()], code, codePos);
+		}
+
+		// Jump over literal pool
+		uint32_t literal_pos = jmp_pos;
+		emit32(ARMV8A::B | ((codePos - jmp_pos) / 4), code, literal_pos);
+
+		for (size_t j = 0; j < progSize; ++j)
+		{
+			const Instruction& instr = prog(j);
+			const uint32_t src = instr.src;
+			const uint32_t dst = instr.dst;
+
+			switch (static_cast<SuperscalarInstructionType>(instr.opcode))
+			{
+			case randomx::SuperscalarInstructionType::ISUB_R:
+				emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IXOR_R:
+				emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IADD_RS:
+				emit32(ARMV8A::ADD | dst | (dst << 5) | (instr.getModShift() << 10) | (src << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IMUL_R:
+				emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IROR_C:
+				emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IADD_C7:
+			case randomx::SuperscalarInstructionType::IADD_C8:
+			case randomx::SuperscalarInstructionType::IADD_C9:
+				emitAddImmediate(dst, dst, instr.getImm32(), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IXOR_C7:
+			case randomx::SuperscalarInstructionType::IXOR_C8:
+			case randomx::SuperscalarInstructionType::IXOR_C9:
+				emitMovImmediate(tmp_reg, instr.getImm32(), code, codePos);
+				emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IMULH_R:
+				emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::ISMULH_R:
+				emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, codePos);
+				break;
+			case randomx::SuperscalarInstructionType::IMUL_RCP:
+				{
+					int32_t offset = (literal_pos - codePos) / 4;
+					offset &= (1 << 19) - 1;
+					literal_pos += 8;
+
+					// ldr tmp_reg, reciprocal
+					emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, codePos);
+
+					// mul dst, dst, tmp_reg
+					emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, codePos);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+
+		p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix;
+		p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result;
+		memcpy(code + codePos, p1, p2 - p1);
+		codePos += p2 - p1;
+
+		// Update registerValue
+		emit32(ARMV8A::MOV_REG | 10 | (prog.getAddressRegister() << 16), code, codePos);
+	}
+
+	p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result;
+	p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_end;
+	memcpy(code + codePos, p1, p2 - p1);
+	codePos += p2 - p1;
+
+#ifdef __GNUC__
+	__builtin___clear_cache(reinterpret_cast<char*>(code + CodeSize), reinterpret_cast<char*>(code + codePos));
+#endif
+}
+
+template void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES], std::vector<uint64_t> &reciprocalCache);
+
+DatasetInitFunc* JitCompilerA64::getDatasetInitFunc()
+{
+	return (DatasetInitFunc*)(code + (((uint8_t*)randomx_init_dataset_aarch64) - ((uint8_t*)randomx_program_aarch64)));
+}
+
+size_t JitCompilerA64::getCodeSize()
+{
+	return CodeSize;
+}
+
+void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	if (imm < (1 << 16))
+	{
+		// movz tmp_reg, imm32 (16 low bits)
+		emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k);
+	}
+	else
+	{
+		if (num32bitLiterals < 64)
+		{
+			if (static_cast<int32_t>(imm) < 0)
+			{
+				// smov dst, vN.s[M]
+				emit32(0x4E042C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k);
+			}
+			else
+			{
+				// umov dst, vN.s[M]
+				emit32(0x0E043C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k);
+			}
+
+			((uint32_t*)(code + ImulRcpLiteralsEnd))[num32bitLiterals] = imm;
+			++num32bitLiterals;
+		}
+		else
+		{
+			if (static_cast<int32_t>(imm) < 0)
+			{
+				// movn tmp_reg, ~imm32 (16 high bits)
+				emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k);
+			}
+			else
+			{
+				// movz tmp_reg, imm32 (16 high bits)
+				emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k);
+			}
+
+			// movk tmp_reg, imm32 (16 low bits)
+			emit32(ARMV8A::MOVK | dst | ((imm & 0xFFFF) << 5), code, k);
+		}
+	}
+
+	codePos = k;
+}
+
+void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	if (imm < (1 << 24))
+	{
+		const uint32_t imm_lo = imm & ((1 << 12) - 1);
+		const uint32_t imm_hi = imm >> 12;
+
+		if (imm_lo && imm_hi)
+		{
+			emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
+			emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
+		}
+		else if (imm_lo)
+		{
+			emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
+		}
+		else
+		{
+			emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k);
+		}
+	}
+	else
+	{
+		constexpr uint32_t tmp_reg = 18;
+		emitMovImmediate(tmp_reg, imm, code, k);
+
+		// add dst, src, tmp_reg
+		emit32(ARMV8A::ADD | dst | (src << 5) | (tmp_reg << 16), code, k);
+	}
+
+	codePos = k;
+}
+
+template<uint32_t tmp_reg>
+void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	uint32_t imm = instr.getImm32();
+
+	if (src != dst)
+	{
+		imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
+		emitAddImmediate(tmp_reg, src, imm, code, k);
+
+		constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
+		const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
+		const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
+
+		emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
+
+		// ldr tmp_reg, [x2, tmp_reg]
+		emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k);
+	}
+	else
+	{
+		imm = (imm & ScratchpadL3Mask) >> 3;
+		emitMovImmediate(tmp_reg, imm, code, k);
+
+		// ldr tmp_reg, [x2, tmp_reg, lsl 3]
+		emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
+	}
+
+	codePos = k;
+}
+
+template<uint32_t tmp_reg_fp>
+void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	uint32_t imm = instr.getImm32();
+	constexpr uint32_t tmp_reg = 18;
+
+	imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
+	emitAddImmediate(tmp_reg, src, imm, code, k);
+
+	constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
+	const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
+	const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
+
+	emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
+
+	// add tmp_reg, x2, tmp_reg
+	emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k);
+
+	// ldpsw tmp_reg, tmp_reg + 1, [tmp_reg]
+	emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k);
+
+	// ins tmp_reg_fp.d[0], tmp_reg
+	emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k);
+
+	// ins tmp_reg_fp.d[1], tmp_reg + 1
+	emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k);
+
+	// scvtf tmp_reg_fp.2d, tmp_reg_fp.2d
+	emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_IADD_RS(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+	const uint32_t shift = instr.getModShift();
+
+	// add dst, src << shift
+	emit32(ARMV8A::ADD | dst | (dst << 5) | (shift << 10) | (src << 16), code, k);
+
+	if (instr.dst == RegisterNeedsDisplacement)
+		emitAddImmediate(dst, dst, instr.getImm32(), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint32_t tmp_reg = 18;
+	emitMemLoad<tmp_reg>(dst, src, instr, code, k);
+
+	// add dst, dst, tmp_reg
+	emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_ISUB_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	if (src != dst)
+	{
+		// sub dst, dst, src
+		emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k);
+	}
+	else
+	{
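+		// src == dst: the immediate is used as the subtrahend, emitted as an add of the negated value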
+		emitAddImmediate(dst, dst, -instr.getImm32(), code, k);
+	}
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint32_t tmp_reg = 18;
+	emitMemLoad<tmp_reg>(dst, src, instr, code, k);
+
+	// sub dst, dst, tmp_reg
+	emit32(ARMV8A::SUB | dst | (dst << 5) | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	if (src == dst)
+	{
+		src = 18;
+		emitMovImmediate(src, instr.getImm32(), code, k);
+	}
+
+	// mul dst, dst, src
+	emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint32_t tmp_reg = 18;
+	emitMemLoad<tmp_reg>(dst, src, instr, code, k);
+
+	// mul dst, dst, tmp_reg
+	emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IMULH_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	// umulh dst, dst, src
+	emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint32_t tmp_reg = 18;
+	emitMemLoad<tmp_reg>(dst, src, instr, code, k);
+
+	// umulh dst, dst, tmp_reg
+	emit32(ARMV8A::UMULH | dst | (dst << 5) | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_ISMULH_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	// smulh dst, dst, src
+	emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint32_t tmp_reg = 18;
+	emitMemLoad<tmp_reg>(dst, src, instr, code, k);
+
+	// smulh dst, dst, tmp_reg
+	emit32(ARMV8A::SMULH | dst | (dst << 5) | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos)
+{
+	const uint64_t divisor = instr.getImm32();
+	if (isZeroOrPowerOf2(divisor))
+		return;
+
+	uint32_t k = codePos;
+
+	constexpr uint32_t tmp_reg = 18;
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint64_t N = 1ULL << 63;
+	const uint64_t q = N / divisor;
+	const uint64_t r = N % divisor;
+#ifdef __GNUC__
+	const uint64_t shift = 64 - __builtin_clzll(divisor);
+#else
+	uint64_t shift = 32;
+	for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1)
+		--shift;
+#endif
+
+	const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t);
+
+	literalPos -= sizeof(uint64_t);
+	*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
+
+	if (literal_id < 13)
+	{
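+		// The first 13 IMUL_RCP literals are kept in registers preloaded by the static prologue
+		// (x30 down to x20, then x11 and x0); the values are pre-shifted into the Rm field (bits 16-20).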
+		static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 };
+
+		// mul dst, dst, literal_reg
+		emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k);
+	}
+	else
+	{
+		// ldr tmp_reg, reciprocal
+		const uint32_t offset = (literalPos - k) / 4;
+		emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
+
+		// mul dst, dst, tmp_reg
+		emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
+	}
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_INEG_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	// sub dst, xzr, dst
+	emit32(ARMV8A::SUB | dst | (31 << 5) | (dst << 16), code, codePos);
+
+	reg_changed_offset[instr.dst] = codePos;
+}
+
+void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	if (src == dst)
+	{
+		src = 18;
+		emitMovImmediate(src, instr.getImm32(), code, k);
+	}
+
+	// eor dst, dst, src
+	emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	constexpr uint32_t tmp_reg = 18;
+	emitMemLoad<tmp_reg>(dst, src, instr, code, k);
+
+	// eor dst, dst, tmp_reg
+	emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	if (src != dst)
+	{
+		// ror dst, dst, src
+		emit32(ARMV8A::ROR | dst | (dst << 5) | (src << 16), code, codePos);
+	}
+	else
+	{
+		// ror dst, dst, imm
+		emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
+	}
+
+	reg_changed_offset[instr.dst] = codePos;
+}
+
+void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	if (src != dst)
+	{
+		constexpr uint32_t tmp_reg = 18;
+
+		// sub tmp_reg, xzr, src
+		emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
+
+		// ror dst, dst, tmp_reg
+		emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k);
+	}
+	else
+	{
+		// ror dst, dst, imm
+		emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
+	}
+
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	if (src == dst)
+		return;
+
+	uint32_t k = codePos;
+
+	constexpr uint32_t tmp_reg = 18;
+	emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
+	emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
+	emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
+
+	reg_changed_offset[instr.src] = k;
+	reg_changed_offset[instr.dst] = k;
+	codePos = k;
+}
+
+void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t dst = instr.dst + 16;
+
+	constexpr uint32_t tmp_reg_fp = 28;
+	constexpr uint32_t src_index1 = 1 << 14;
+	constexpr uint32_t dst_index1 = 1 << 20;
+
+	emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k);
+	emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k);
+	emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_FADD_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t src = (instr.src % 4) + 24;
+	const uint32_t dst = (instr.dst % 4) + 16;
+
+	emit32(ARMV8A::FADD | dst | (dst << 5) | (src << 16), code, codePos);
+}
+
+void JitCompilerA64::h_FADD_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = (instr.dst % 4) + 16;
+
+	constexpr uint32_t tmp_reg_fp = 28;
+	emitMemLoadFP<tmp_reg_fp>(src, instr, code, k); 
+
+	emit32(ARMV8A::FADD | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_FSUB_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t src = (instr.src % 4) + 24;
+	const uint32_t dst = (instr.dst % 4) + 16;
+
+	emit32(ARMV8A::FSUB | dst | (dst << 5) | (src << 16), code, codePos);
+}
+
+void JitCompilerA64::h_FSUB_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = (instr.dst % 4) + 16;
+
+	constexpr uint32_t tmp_reg_fp = 28;
+	emitMemLoadFP<tmp_reg_fp>(src, instr, code, k); 
+
+	emit32(ARMV8A::FSUB | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_FSCAL_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t dst = (instr.dst % 4) + 16;
+
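+	// eor dst.16b, dst.16b, v31.16b (v31 holds the scale mask loaded by the static prologue)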
+	emit32(ARMV8A::FEOR | dst | (dst << 5) | (31 << 16), code, codePos);
+}
+
+void JitCompilerA64::h_FMUL_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t src = (instr.src % 4) + 24;
+	const uint32_t dst = (instr.dst % 4) + 20;
+
+	emit32(ARMV8A::FMUL | dst | (dst << 5) | (src << 16), code, codePos);
+}
+
+void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = (instr.dst % 4) + 20;
+
+	constexpr uint32_t tmp_reg_fp = 28;
+	emitMemLoadFP<tmp_reg_fp>(src, instr, code, k); 
+
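+	// Force the loaded value into the valid group E range: v29 holds the E 'and' mask, v30 the E 'or' mask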
+	// and tmp_reg_fp, tmp_reg_fp, and_mask_reg
+	emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k);
+
+	// orr tmp_reg_fp, tmp_reg_fp, or_mask_reg
+	emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k);
+
+	emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_FSQRT_R(Instruction& instr, uint32_t& codePos)
+{
+	const uint32_t dst = (instr.dst % 4) + 20;
+
+	emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos);
+}
+
+void JitCompilerA64::h_CBRANCH(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t dst = IntRegMap[instr.dst];
+	const uint32_t modCond = instr.getModCond();
+	const uint32_t shift = modCond + RandomX_CurrentConfig.JumpOffset;
+	const uint32_t imm = (instr.getImm32() | (1U << shift)) & ~(1U << (shift - 1));
+
+	emitAddImmediate(dst, dst, imm, code, k);
+
+	// tst dst, mask
+	emit32((0xF2781C1F - (modCond << 16)) | (dst << 5), code, k);
+
+	int32_t offset = reg_changed_offset[instr.dst];
+	offset = ((offset - k) >> 2) & ((1 << 19) - 1);
+
+	// beq target
+	emit32(0x54000000 | (offset << 5), code, k);
+
+	for (uint32_t i = 0; i < RegistersCount; ++i)
+		reg_changed_offset[i] = k;
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+
+	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t fpcr_tmp_reg = 8;
+
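+	// x8 keeps a bit-reversed copy of fpcr (set up in the prologue), so inserting the 2-bit
+	// rounding mode at bits 41:40 corresponds to FPCR.RMode (bits 23:22) after the rbit below.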
+	// ror tmp_reg, src, imm
+	emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
+
+	// bfi fpcr_tmp_reg, tmp_reg, 40, 2
+	emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
+
+	// rbit tmp_reg, fpcr_tmp_reg
+	emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k);
+
+	// msr fpcr, tmp_reg
+	emit32(0xD51B4400 | tmp_reg, code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+	constexpr uint32_t tmp_reg = 18;
+
+	uint32_t imm = instr.getImm32();
+
+	if (instr.getModCond() < StoreL3Condition)
+		imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
+	else
+		imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1;
+
+	emitAddImmediate(tmp_reg, dst, imm, code, k);
+
+	constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
+	const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
+	const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
+	const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10);
+
+	emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k);
+
+	// str src, [x2, tmp_reg]
+	emit32(0xF8206840 | src | (tmp_reg << 16), code, k);
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_NOP(Instruction& instr, uint32_t& codePos)
+{
+}
+
+InstructionGeneratorA64 JitCompilerA64::engine[256] = {};
+
+}
diff --git a/src/crypto/randomx/jit_compiler_a64.hpp b/src/crypto/randomx/jit_compiler_a64.hpp
index 4b0bed665..e524feb87 100644
--- a/src/crypto/randomx/jit_compiler_a64.hpp
+++ b/src/crypto/randomx/jit_compiler_a64.hpp
@@ -1,5 +1,6 @@
 /*
 Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
 
 All rights reserved.
 
@@ -32,42 +33,91 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 #include <stdexcept>
 #include "crypto/randomx/common.hpp"
+#include "crypto/randomx/jit_compiler_a64_static.hpp"
 
 namespace randomx {
 
 	class Program;
 	class ProgramConfiguration;
 	class SuperscalarProgram;
+	class Instruction;
+
+	typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&);
 
 	class JitCompilerA64 {
 	public:
-		JitCompilerA64() {
-			throw std::runtime_error("ARM64 JIT compiler is not implemented yet.");
-		}
-		void generateProgram(Program&, ProgramConfiguration&) {
+		JitCompilerA64();
+		~JitCompilerA64();
+
+		void generateProgram(Program&, ProgramConfiguration&);
+		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
 
-		}
-		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {
-			
-		}
 		template<size_t N>
-		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) {
+		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &);
 
-		}
-		void generateDatasetInitCode() {
+		void generateDatasetInitCode() {}
 
+		ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); }
+		DatasetInitFunc* getDatasetInitFunc();
+		uint8_t* getCode() { return code; }
+		size_t getCodeSize();
+
+		static InstructionGeneratorA64 engine[256];
+		uint32_t reg_changed_offset[8];
+		uint8_t* code;
+		uint32_t literalPos;
+		uint32_t num32bitLiterals;
+
+		static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
+		{
+			*(uint32_t*)(code + codePos) = val;
+			codePos += sizeof(val);
 		}
-		ProgramFunc* getProgramFunc() {
-			return nullptr;
-		}
-		DatasetInitFunc* getDatasetInitFunc() {
-			return nullptr;
-		}
-		uint8_t* getCode() {
-			return nullptr;
-		}
-		size_t getCodeSize() {
-			return 0;
+
+		static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos)
+		{
+			*(uint64_t*)(code + codePos) = val;
+			codePos += sizeof(val);
 		}
+
+		void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos);
+		void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos);
+
+		template<uint32_t tmp_reg>
+		void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
+
+		template<uint32_t tmp_reg_fp>
+		void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
+
+		void h_IADD_RS(Instruction&, uint32_t&);
+		void h_IADD_M(Instruction&, uint32_t&);
+		void h_ISUB_R(Instruction&, uint32_t&);
+		void h_ISUB_M(Instruction&, uint32_t&);
+		void h_IMUL_R(Instruction&, uint32_t&);
+		void h_IMUL_M(Instruction&, uint32_t&);
+		void h_IMULH_R(Instruction&, uint32_t&);
+		void h_IMULH_M(Instruction&, uint32_t&);
+		void h_ISMULH_R(Instruction&, uint32_t&);
+		void h_ISMULH_M(Instruction&, uint32_t&);
+		void h_IMUL_RCP(Instruction&, uint32_t&);
+		void h_INEG_R(Instruction&, uint32_t&);
+		void h_IXOR_R(Instruction&, uint32_t&);
+		void h_IXOR_M(Instruction&, uint32_t&);
+		void h_IROR_R(Instruction&, uint32_t&);
+		void h_IROL_R(Instruction&, uint32_t&);
+		void h_ISWAP_R(Instruction&, uint32_t&);
+		void h_FSWAP_R(Instruction&, uint32_t&);
+		void h_FADD_R(Instruction&, uint32_t&);
+		void h_FADD_M(Instruction&, uint32_t&);
+		void h_FSUB_R(Instruction&, uint32_t&);
+		void h_FSUB_M(Instruction&, uint32_t&);
+		void h_FSCAL_R(Instruction&, uint32_t&);
+		void h_FMUL_R(Instruction&, uint32_t&);
+		void h_FDIV_M(Instruction&, uint32_t&);
+		void h_FSQRT_R(Instruction&, uint32_t&);
+		void h_CBRANCH(Instruction&, uint32_t&);
+		void h_CFROUND(Instruction&, uint32_t&);
+		void h_ISTORE(Instruction&, uint32_t&);
+		void h_NOP(Instruction&, uint32_t&);
 	};
 }
diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S
new file mode 100644
index 000000000..a813e8210
--- /dev/null
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -0,0 +1,576 @@
+# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+# Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 	* Redistributions of source code must retain the above copyright
+# 	  notice, this list of conditions and the following disclaimer.
+# 	* Redistributions in binary form must reproduce the above copyright
+# 	  notice, this list of conditions and the following disclaimer in the
+# 	  documentation and/or other materials provided with the distribution.
+# 	* Neither the name of the copyright holder nor the
+# 	  names of its contributors may be used to endorse or promote products
+# 	  derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+	.arch armv8-a
+	.text
+	.global	randomx_program_aarch64
+	.global	randomx_program_aarch64_main_loop
+	.global	randomx_program_aarch64_vm_instructions
+	.global randomx_program_aarch64_imul_rcp_literals_end
+	.global	randomx_program_aarch64_vm_instructions_end
+	.global randomx_program_aarch64_cacheline_align_mask1
+	.global randomx_program_aarch64_cacheline_align_mask2
+	.global randomx_program_aarch64_update_spMix1
+	.global randomx_program_aarch64_vm_instructions_end_light
+	.global randomx_program_aarch64_light_cacheline_align_mask
+	.global randomx_program_aarch64_light_dataset_offset
+	.global randomx_init_dataset_aarch64
+	.global randomx_init_dataset_aarch64_end
+	.global randomx_calc_dataset_item_aarch64
+	.global randomx_calc_dataset_item_aarch64_prefetch
+	.global randomx_calc_dataset_item_aarch64_mix
+	.global randomx_calc_dataset_item_aarch64_store_result
+	.global randomx_calc_dataset_item_aarch64_end
+
+# Register allocation
+
+# x0  -> pointer to reg buffer and then literal for IMUL_RCP
+# x1  -> pointer to mem buffer and then to dataset
+# x2  -> pointer to scratchpad
+# x3  -> loop counter
+# x4  -> "r0"
+# x5  -> "r1"
+# x6  -> "r2"
+# x7  -> "r3"
+# x8  -> fpcr (reversed bits)
+# x9  -> mx, ma
+# x10 -> spMix1
+# x11 -> literal for IMUL_RCP
+# x12 -> "r4"
+# x13 -> "r5"
+# x14 -> "r6"
+# x15 -> "r7"
+# x16 -> spAddr0
+# x17 -> spAddr1
+# x18 -> temporary
+# x19 -> temporary
+# x20 -> literal for IMUL_RCP
+# x21 -> literal for IMUL_RCP
+# x22 -> literal for IMUL_RCP
+# x23 -> literal for IMUL_RCP
+# x24 -> literal for IMUL_RCP
+# x25 -> literal for IMUL_RCP
+# x26 -> literal for IMUL_RCP
+# x27 -> literal for IMUL_RCP
+# x28 -> literal for IMUL_RCP
+# x29 -> literal for IMUL_RCP
+# x30 -> literal for IMUL_RCP
+
+# v0-v15 -> store 32-bit literals
+# v16 -> "f0"
+# v17 -> "f1"
+# v18 -> "f2"
+# v19 -> "f3"
+# v20 -> "e0"
+# v21 -> "e1"
+# v22 -> "e2"
+# v23 -> "e3"
+# v24 -> "a0"
+# v25 -> "a1"
+# v26 -> "a2"
+# v27 -> "a3"
+# v28 -> temporary
+# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
+# v30 -> E 'or' mask  = 0x3*00000000******3*00000000******
+# v31 -> scale mask   = 0x80f000000000000080f0000000000000
+
+randomx_program_aarch64:
+	# Save callee-saved registers
+	sub	sp, sp, 192
+	stp	x16, x17, [sp]
+	stp	x18, x19, [sp, 16]
+	stp	x20, x21, [sp, 32]
+	stp	x22, x23, [sp, 48]
+	stp	x24, x25, [sp, 64]
+	stp	x26, x27, [sp, 80]
+	stp	x28, x29, [sp, 96]
+	stp	x8, x30, [sp, 112]
+	stp	d8, d9, [sp, 128]
+	stp	d10, d11, [sp, 144]
+	stp	d12, d13, [sp, 160]
+	stp	d14, d15, [sp, 176]
+
+	# Zero integer registers
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x6, xzr
+	mov	x7, xzr
+	mov	x12, xzr
+	mov	x13, xzr
+	mov	x14, xzr
+	mov	x15, xzr
+
+	# Load ma, mx and dataset pointer
+	ldp	x9, x1, [x1]
+
+	# Load initial spMix value
+	mov	x10, x9
+
+	# Load group A registers
+	ldp	q24, q25, [x0, 192]
+	ldp	q26, q27, [x0, 224]
+
+	# Load E 'and' mask
+	mov	x16, 0x00FFFFFFFFFFFFFF
+	ins	v29.d[0], x16
+	ins	v29.d[1], x16
+
+	# Load E 'or' mask (stored in reg.f[0])
+	ldr	q30, [x0, 64]
+
+	# Load scale mask
+	mov	x16, 0x80f0000000000000
+	ins	v31.d[0], x16
+	ins	v31.d[1], x16
+
+	# Read fpcr
+	mrs	x8, fpcr
+	rbit	x8, x8
+
+	# Save x0
+	str	x0, [sp, -16]!
+
+	# Read literals
+	ldr	x0, literal_x0
+	ldr	x11, literal_x11
+	ldr	x20, literal_x20
+	ldr	x21, literal_x21
+	ldr	x22, literal_x22
+	ldr	x23, literal_x23
+	ldr	x24, literal_x24
+	ldr	x25, literal_x25
+	ldr	x26, literal_x26
+	ldr	x27, literal_x27
+	ldr	x28, literal_x28
+	ldr	x29, literal_x29
+	ldr	x30, literal_x30
+
+	ldr	q0, literal_v0
+	ldr	q1, literal_v1
+	ldr	q2, literal_v2
+	ldr	q3, literal_v3
+	ldr	q4, literal_v4
+	ldr	q5, literal_v5
+	ldr	q6, literal_v6
+	ldr	q7, literal_v7
+	ldr	q8, literal_v8
+	ldr	q9, literal_v9
+	ldr	q10, literal_v10
+	ldr	q11, literal_v11
+	ldr	q12, literal_v12
+	ldr	q13, literal_v13
+	ldr	q14, literal_v14
+	ldr	q15, literal_v15
+
+randomx_program_aarch64_main_loop:
+	# spAddr0 = spMix1 & ScratchpadL3Mask64;
+	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
+	lsr	x18, x10, 32
+
+	# Actual mask will be inserted by JIT compiler
+	and	w16, w10, 1
+	and	w17, w18, 1
+
+	# x16 = scratchpad + spAddr0
+	# x17 = scratchpad + spAddr1
+	add	x16, x16, x2
+	add	x17, x17, x2
+
+	# xor integer registers with scratchpad data (spAddr0)
+	ldp	x18, x19, [x16]
+	eor	x4, x4, x18
+	eor	x5, x5, x19
+	ldp	x18, x19, [x16, 16]
+	eor	x6, x6, x18
+	eor	x7, x7, x19
+	ldp	x18, x19, [x16, 32]
+	eor	x12, x12, x18
+	eor	x13, x13, x19
+	ldp	x18, x19, [x16, 48]
+	eor	x14, x14, x18
+	eor	x15, x15, x19
+
+	# Load group F registers (spAddr1)
+	ldpsw	x18, x19, [x17]
+	ins	v16.d[0], x18
+	ins	v16.d[1], x19
+	ldpsw	x18, x19, [x17, 8]
+	ins	v17.d[0], x18
+	ins	v17.d[1], x19
+	ldpsw	x18, x19, [x17, 16]
+	ins	v18.d[0], x18
+	ins	v18.d[1], x19
+	ldpsw	x18, x19, [x17, 24]
+	ins	v19.d[0], x18
+	ins	v19.d[1], x19
+	scvtf	v16.2d, v16.2d
+	scvtf	v17.2d, v17.2d
+	scvtf	v18.2d, v18.2d
+	scvtf	v19.2d, v19.2d
+
+	# Load group E registers (spAddr1)
+	ldpsw	x18, x19, [x17, 32]
+	ins	v20.d[0], x18
+	ins	v20.d[1], x19
+	ldpsw	x18, x19, [x17, 40]
+	ins	v21.d[0], x18
+	ins	v21.d[1], x19
+	ldpsw	x18, x19, [x17, 48]
+	ins	v22.d[0], x18
+	ins	v22.d[1], x19
+	ldpsw	x18, x19, [x17, 56]
+	ins	v23.d[0], x18
+	ins	v23.d[1], x19
+	scvtf	v20.2d, v20.2d
+	scvtf	v21.2d, v21.2d
+	scvtf	v22.2d, v22.2d
+	scvtf	v23.2d, v23.2d
+	and	v20.16b, v20.16b, v29.16b
+	and	v21.16b, v21.16b, v29.16b
+	and	v22.16b, v22.16b, v29.16b
+	and	v23.16b, v23.16b, v29.16b
+	orr	v20.16b, v20.16b, v30.16b
+	orr	v21.16b, v21.16b, v30.16b
+	orr	v22.16b, v22.16b, v30.16b
+	orr	v23.16b, v23.16b, v30.16b
+
+	# Execute VM instructions
+randomx_program_aarch64_vm_instructions:
+
+	# 16 KB buffer for generated instructions
+	.fill 4096,4,0
+
+literal_x0:  .fill 1,8,0
+literal_x11: .fill 1,8,0
+literal_x20: .fill 1,8,0
+literal_x21: .fill 1,8,0
+literal_x22: .fill 1,8,0
+literal_x23: .fill 1,8,0
+literal_x24: .fill 1,8,0
+literal_x25: .fill 1,8,0
+literal_x26: .fill 1,8,0
+literal_x27: .fill 1,8,0
+literal_x28: .fill 1,8,0
+literal_x29: .fill 1,8,0
+literal_x30: .fill 1,8,0
+randomx_program_aarch64_imul_rcp_literals_end:
+
+literal_v0:  .fill 2,8,0
+literal_v1:  .fill 2,8,0
+literal_v2:  .fill 2,8,0
+literal_v3:  .fill 2,8,0
+literal_v4:  .fill 2,8,0
+literal_v5:  .fill 2,8,0
+literal_v6:  .fill 2,8,0
+literal_v7:  .fill 2,8,0
+literal_v8:  .fill 2,8,0
+literal_v9:  .fill 2,8,0
+literal_v10: .fill 2,8,0
+literal_v11: .fill 2,8,0
+literal_v12: .fill 2,8,0
+literal_v13: .fill 2,8,0
+literal_v14: .fill 2,8,0
+literal_v15: .fill 2,8,0
+
+randomx_program_aarch64_vm_instructions_end:
+
+	# mx ^= r[readReg2] ^ r[readReg3];
+	eor	x9, x9, x18
+
+	# Calculate dataset pointer for dataset prefetch
+	mov	w18, w9
+randomx_program_aarch64_cacheline_align_mask1:
+	# Actual mask will be inserted by JIT compiler
+	and	x18, x18, 1
+	add	x18, x18, x1
+
+	# Prefetch dataset data
+	prfm	pldl2strm, [x18]
+
+	# mx <-> ma
+	ror	x9, x9, 32
+
+	# Calculate dataset pointer for dataset read
+	mov	w10, w9
+randomx_program_aarch64_cacheline_align_mask2:
+	# Actual mask will be inserted by JIT compiler
+	and	x10, x10, 1
+	add	x10, x10, x1
+
+randomx_program_aarch64_xor_with_dataset_line:
+	# xor integer registers with dataset data
+	ldp	x18, x19, [x10]
+	eor	x4, x4, x18
+	eor	x5, x5, x19
+	ldp	x18, x19, [x10, 16]
+	eor	x6, x6, x18
+	eor	x7, x7, x19
+	ldp	x18, x19, [x10, 32]
+	eor	x12, x12, x18
+	eor	x13, x13, x19
+	ldp	x18, x19, [x10, 48]
+	eor	x14, x14, x18
+	eor	x15, x15, x19
+
+randomx_program_aarch64_update_spMix1:
+	# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1"
+	eor	x10, x0, x0
+
+	# Store integer registers to scratchpad (spAddr1)
+	stp	x4, x5, [x17, 0]
+	stp	x6, x7, [x17, 16]
+	stp	x12, x13, [x17, 32]
+	stp	x14, x15, [x17, 48]
+
+	# xor group F and group E registers
+	eor	v16.16b, v16.16b, v20.16b
+	eor	v17.16b, v17.16b, v21.16b
+	eor	v18.16b, v18.16b, v22.16b
+	eor	v19.16b, v19.16b, v23.16b
+
+	# Store FP registers to scratchpad (spAddr0)
+	stp	q16, q17, [x16, 0]
+	stp	q18, q19, [x16, 32]
+
+	subs	x3, x3, 1
+	bne	randomx_program_aarch64_main_loop
+	
+	# Restore x0
+	ldr	x0, [sp], 16
+
+	# Store integer registers
+	stp	x4, x5, [x0, 0]
+	stp	x6, x7, [x0, 16]
+	stp	x12, x13, [x0, 32]
+	stp	x14, x15, [x0, 48]
+
+	# Store FP registers
+	stp	q16, q17, [x0, 64]
+	stp	q18, q19, [x0, 96]
+	stp	q20, q21, [x0, 128]
+	stp	q22, q23, [x0, 160]
+
+	# Restore callee-saved registers
+	ldp	x16, x17, [sp]
+	ldp	x18, x19, [sp, 16]
+	ldp	x20, x21, [sp, 32]
+	ldp	x22, x23, [sp, 48]
+	ldp	x24, x25, [sp, 64]
+	ldp	x26, x27, [sp, 80]
+	ldp	x28, x29, [sp, 96]
+	ldp	x8, x30, [sp, 112]
+	ldp	d8, d9, [sp, 128]
+	ldp	d10, d11, [sp, 144]
+	ldp	d12, d13, [sp, 160]
+	ldp	d14, d15, [sp, 176]
+	add	sp, sp, 192
+
+	ret
+
+randomx_program_aarch64_vm_instructions_end_light:
+	sub	sp, sp, 96
+	stp	x0, x1, [sp, 64]
+	stp	x2, x30, [sp, 80]
+
+	# mx ^= r[readReg2] ^ r[readReg3];
+	eor	x9, x9, x18
+
+	# mx <-> ma
+	ror	x9, x9, 32
+
+	# x0 -> pointer to cache memory
+	mov	x0, x1
+
+	# x1 -> pointer to output
+	mov	x1, sp
+
+randomx_program_aarch64_light_cacheline_align_mask:
+	# Actual mask will be inserted by JIT compiler
+	and	w2, w9, 1
+
+	# x2 -> item number
+	lsr	x2, x2, 6
+
+randomx_program_aarch64_light_dataset_offset:
+	# Apply dataset offset (filled in by JIT compiler)
+	add	x2, x2, 0
+	add	x2, x2, 0
+
+	bl	randomx_calc_dataset_item_aarch64
+
+	mov	x10, sp
+	ldp	x0, x1, [sp, 64]
+	ldp	x2, x30, [sp, 80]
+	add	sp, sp, 96
+
+	b	randomx_program_aarch64_xor_with_dataset_line
+
+
+
+# Input parameters
+#
+# x0 -> pointer to cache
+# x1 -> pointer to dataset memory at startItem
+# x2 -> start item
+# x3 -> end item
+
+randomx_init_dataset_aarch64:
+	# Save x30 (return address)
+	str	x30, [sp, -16]!
+
+	# Load pointer to cache memory
+	ldr	x0, [x0]
+
+randomx_init_dataset_aarch64_main_loop:
+	bl	randomx_calc_dataset_item_aarch64
+	add	x1, x1, 64
+	add	x2, x2, 1
+	cmp	x2, x3
+	bne	randomx_init_dataset_aarch64_main_loop
+
+	# Restore x30 (return address)
+	ldr	x30, [sp], 16
+
+	ret
+
+randomx_init_dataset_aarch64_end:
+
+# Input parameters
+#
+# x0 -> pointer to cache memory
+# x1 -> pointer to output
+# x2 -> item number
+#
+# Register allocation
+#
+# x0-x7 -> output value (calculated dataset item)
+# x8 -> pointer to cache memory
+# x9 -> pointer to output
+# x10 -> registerValue
+# x11 -> mixBlock
+# x12 -> temporary
+# x13 -> temporary
+
+randomx_calc_dataset_item_aarch64:
+	sub	sp, sp, 112
+	stp	x0, x1, [sp]
+	stp	x2, x3, [sp, 16]
+	stp	x4, x5, [sp, 32]
+	stp	x6, x7, [sp, 48]
+	stp	x8, x9, [sp, 64]
+	stp	x10, x11, [sp, 80]
+	stp	x12, x13, [sp, 96]
+
+	mov	x8, x0
+	mov	x9, x1
+	mov	x10, x2
+
+	# rl[0] = (itemNumber + 1) * superscalarMul0;
+	ldr	x12, superscalarMul0
+	madd	x0, x2, x12, x12
+
+	# rl[1] = rl[0] ^ superscalarAdd1;
+	ldr	x12, superscalarAdd1
+	eor	x1, x0, x12
+
+	# rl[2] = rl[0] ^ superscalarAdd2;
+	ldr	x12, superscalarAdd2
+	eor	x2, x0, x12
+
+	# rl[3] = rl[0] ^ superscalarAdd3;
+	ldr	x12, superscalarAdd3
+	eor	x3, x0, x12
+
+	# rl[4] = rl[0] ^ superscalarAdd4;
+	ldr	x12, superscalarAdd4
+	eor	x4, x0, x12
+
+	# rl[5] = rl[0] ^ superscalarAdd5;
+	ldr	x12, superscalarAdd5
+	eor	x5, x0, x12
+
+	# rl[6] = rl[0] ^ superscalarAdd6;
+	ldr	x12, superscalarAdd6
+	eor	x6, x0, x12
+
+	# rl[7] = rl[0] ^ superscalarAdd7;
+	ldr	x12, superscalarAdd7
+	eor	x7, x0, x12
+
+	b	randomx_calc_dataset_item_aarch64_prefetch
+
+superscalarMul0: .quad 6364136223846793005
+superscalarAdd1: .quad 9298411001130361340
+superscalarAdd2: .quad 12065312585734608966
+superscalarAdd3: .quad 9306329213124626780
+superscalarAdd4: .quad 5281919268842080866
+superscalarAdd5: .quad 10536153434571861004
+superscalarAdd6: .quad 3398623926847679864
+superscalarAdd7: .quad 9549104520008361294
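+# (superscalarMul0 == 0x5851F42D4C957F2D, the 64-bit LCG multiplier from the
+#  RandomX dataset item specification; the Add constants are the per-register
+#  seed constants from the same specification)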
+
+# Prefetch -> SuperScalar hash -> Mix will be repeated N times
+
+randomx_calc_dataset_item_aarch64_prefetch:
+	# Actual mask will be inserted by JIT compiler
+	and	x11, x10, 1
+	add	x11, x8, x11, lsl 6
+	prfm	pldl2strm, [x11]
+
+	# Generated SuperScalar hash program goes here
+
+randomx_calc_dataset_item_aarch64_mix:
+	ldp	x12, x13, [x11]
+	eor	x0, x0, x12
+	eor	x1, x1, x13
+	ldp	x12, x13, [x11, 16]
+	eor	x2, x2, x12
+	eor	x3, x3, x13
+	ldp	x12, x13, [x11, 32]
+	eor	x4, x4, x12
+	eor	x5, x5, x13
+	ldp	x12, x13, [x11, 48]
+	eor	x6, x6, x12
+	eor	x7, x7, x13
+
+randomx_calc_dataset_item_aarch64_store_result:
+	stp	x0, x1, [x9]
+	stp	x2, x3, [x9, 16]
+	stp	x4, x5, [x9, 32]
+	stp	x6, x7, [x9, 48]
+
+	ldp	x0, x1, [sp]
+	ldp	x2, x3, [sp, 16]
+	ldp	x4, x5, [sp, 32]
+	ldp	x6, x7, [sp, 48]
+	ldp	x8, x9, [sp, 64]
+	ldp	x10, x11, [sp, 80]
+	ldp	x12, x13, [sp, 96]
+	add	sp, sp, 112
+
+	ret
+
+randomx_calc_dataset_item_aarch64_end:
diff --git a/src/crypto/randomx/jit_compiler_a64_static.hpp b/src/crypto/randomx/jit_compiler_a64_static.hpp
new file mode 100644
index 000000000..a9b922e29
--- /dev/null
+++ b/src/crypto/randomx/jit_compiler_a64_static.hpp
@@ -0,0 +1,51 @@
+/*
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
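+// Symbols defined in jit_compiler_a64_static.S. Most of them are code markers
+// rather than real functions: the JIT compiler only takes their addresses to
+// locate, copy and patch the surrounding ranges of static code.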
+extern "C" {
+	void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
+	void randomx_program_aarch64_main_loop();
+	void randomx_program_aarch64_vm_instructions();
+	void randomx_program_aarch64_imul_rcp_literals_end();
+	void randomx_program_aarch64_vm_instructions_end();
+	void randomx_program_aarch64_cacheline_align_mask1();
+	void randomx_program_aarch64_cacheline_align_mask2();
+	void randomx_program_aarch64_update_spMix1();
+	void randomx_program_aarch64_vm_instructions_end_light();
+	void randomx_program_aarch64_light_cacheline_align_mask();
+	void randomx_program_aarch64_light_dataset_offset();
+	void randomx_init_dataset_aarch64();
+	void randomx_init_dataset_aarch64_end();
+	void randomx_calc_dataset_item_aarch64();
+	void randomx_calc_dataset_item_aarch64_prefetch();
+	void randomx_calc_dataset_item_aarch64_mix();
+	void randomx_calc_dataset_item_aarch64_store_result();
+	void randomx_calc_dataset_item_aarch64_end();
+}
diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp
index 516807042..cab1be9fc 100644
--- a/src/crypto/randomx/randomx.cpp
+++ b/src/crypto/randomx/randomx.cpp
@@ -26,6 +26,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "crypto/randomx/common.hpp"
 #include "crypto/randomx/randomx.h"
 #include "crypto/randomx/dataset.hpp"
 #include "crypto/randomx/vm_interpreted.hpp"
@@ -33,7 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "crypto/randomx/vm_compiled.hpp"
 #include "crypto/randomx/vm_compiled_light.hpp"
 #include "crypto/randomx/blake2/blake2.h"
+
+#if defined(_M_X64) || defined(__x86_64__)
 #include "crypto/randomx/jit_compiler_x86_static.hpp"
+#elif defined(XMRIG_ARM)
+#include "crypto/randomx/jit_compiler_a64_static.hpp"
+#endif
+
 #include <cassert>
 
 RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
@@ -156,19 +163,10 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
 #endif
 }
 
+// Integer log2 of a power-of-two size, e.g. Log2(16384) == 14 (used for the ARM-specific fields below).
+static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
+
 void RandomX_ConfigurationBase::Apply()
 {
-#if defined(_M_X64) || defined(__x86_64__)
-	*(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
-	const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE;
-	*(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask;
-	*(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask;
-	*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
-#endif
-
-	CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1);
-	DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE;
-
 	ScratchpadL1Mask_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) - 1) * 8;
 	ScratchpadL1Mask16_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) / 2 - 1) * 16;
 	ScratchpadL2Mask_Calculated = (ScratchpadL2_Size / sizeof(uint64_t) - 1) * 8;
@@ -176,22 +174,40 @@ void RandomX_ConfigurationBase::Apply()
 	ScratchpadL3Mask_Calculated = (((ScratchpadL3_Size / sizeof(uint64_t)) - 1) * 8);
 	ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;
 
-#if defined(_M_X64) || defined(__x86_64__)
-	*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
-	*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
-#endif
+	CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1);
+	DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE;
 
 	ConditionMask_Calculated = (1 << JumpBits) - 1;
 
-	constexpr int CEIL_NULL = 0;
-	int k = 0;
-
 #if defined(_M_X64) || defined(__x86_64__)
+	*(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
+	const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE;
+	*(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask;
+	*(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask;
+	*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
+
+	*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
+	*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
+
 #define JIT_HANDLE(x, prev) randomx::JitCompilerX86::engine[k] = &randomx::JitCompilerX86::h_##x
+
+#elif defined(XMRIG_ARM)
+
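+	// All of these sizes are powers of two; the A64 JIT consumes them as log2
+	// values when building its address masks.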
+	Log2_ScratchpadL1 = Log2(ScratchpadL1_Size);
+	Log2_ScratchpadL2 = Log2(ScratchpadL2_Size);
+	Log2_ScratchpadL3 = Log2(ScratchpadL3_Size);
+	Log2_DatasetBaseSize = Log2(DatasetBaseSize);
+	Log2_CacheSize = Log2((ArgonMemory * randomx::ArgonBlockSize) / randomx::CacheLineSize);
+
+#define JIT_HANDLE(x, prev) randomx::JitCompilerA64::engine[k] = &randomx::JitCompilerA64::h_##x
+
 #else
 #define JIT_HANDLE(x, prev)
 #endif
 
+	constexpr int CEIL_NULL = 0;
+	int k = 0;
+
 #define INST_HANDLE(x, prev) \
 	CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
 	for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); }
@@ -435,12 +451,12 @@ extern "C" {
 		assert(inputSize == 0 || input != nullptr);
 		assert(output != nullptr);
 		alignas(16) uint64_t tempHash[8];
-        rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
+		rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
 		machine->initScratchpad(&tempHash);
 		machine->resetRoundingMode();
 		for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
 			machine->run(&tempHash);
-            rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
+			rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
 		}
 		machine->run(&tempHash);
 		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h
index 05b7bdc8f..c69fb3131 100644
--- a/src/crypto/randomx/randomx.h
+++ b/src/crypto/randomx/randomx.h
@@ -133,6 +133,14 @@ struct RandomX_ConfigurationBase
 
 	uint32_t ConditionMask_Calculated;
 
+#ifdef XMRIG_ARM
+	uint32_t Log2_ScratchpadL1;
+	uint32_t Log2_ScratchpadL2;
+	uint32_t Log2_ScratchpadL3;
+	uint32_t Log2_DatasetBaseSize;
+	uint32_t Log2_CacheSize;
+#endif
+
 	int CEIL_IADD_RS;
 	int CEIL_IADD_M;
 	int CEIL_ISUB_R;
diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp
index f3b9758c4..d2ee59e8b 100644
--- a/src/crypto/randomx/vm_compiled.cpp
+++ b/src/crypto/randomx/vm_compiled.cpp
@@ -50,6 +50,9 @@ namespace randomx {
 
 	template<bool softAes>
 	void CompiledVm<softAes>::execute() {
+#ifdef XMRIG_ARM
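+		// The aarch64 program code reads the E-group "or" mask from reg.f, so it
+		// has to be staged there before every program run.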
+		memcpy(reg.f, config.eMask, sizeof(config.eMask));
+#endif
 		compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations);
 	}
 
diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp
index 275f9558b..6426443a4 100644
--- a/src/crypto/rx/RxVm.cpp
+++ b/src/crypto/rx/RxVm.cpp
@@ -33,11 +33,9 @@
 
 xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes)
 {
-#   ifndef XMRIG_ARM
     if (!softAes) {
        m_flags |= RANDOMX_FLAG_HARD_AES;
     }
-#   endif
 
     if (dataset->get()) {
         m_flags |= RANDOMX_FLAG_FULL_MEM;