mirror of
https://github.com/xmrig/xmrig.git
synced 2025-03-09 10:16:05 +00:00
Merge pull request #1185 from SChernykh/evo
Added JIT compiler for RandomX on ARMv8
This commit is contained in:
commit
dc686bd1bf
13 changed files with 1919 additions and 60 deletions
|
@ -143,7 +143,7 @@ else()
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||||
EXECUTE_PROCESS(COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
EXECUTE_PROCESS(COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||||
if (OPERATING_SYSTEM MATCHES "Android")
|
if (OPERATING_SYSTEM MATCHES "Android")
|
||||||
set(EXTRA_LIBS ${EXTRA_LIBS} log)
|
set(EXTRA_LIBS ${EXTRA_LIBS} log)
|
||||||
|
|
|
@ -51,6 +51,13 @@ if (WITH_RANDOMX)
|
||||||
)
|
)
|
||||||
# cheat because cmake and ccache hate each other
|
# cheat because cmake and ccache hate each other
|
||||||
set_property(SOURCE src/crypto/randomx/jit_compiler_x86_static.S PROPERTY LANGUAGE C)
|
set_property(SOURCE src/crypto/randomx/jit_compiler_x86_static.S PROPERTY LANGUAGE C)
|
||||||
|
elseif (XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||||
|
list(APPEND SOURCES_CRYPTO
|
||||||
|
src/crypto/randomx/jit_compiler_a64_static.S
|
||||||
|
src/crypto/randomx/jit_compiler_a64.cpp
|
||||||
|
)
|
||||||
|
# cheat because cmake and ccache hate each other
|
||||||
|
set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (CMAKE_CXX_COMPILER_ID MATCHES Clang)
|
if (CMAKE_CXX_COMPILER_ID MATCHES Clang)
|
||||||
|
|
|
@ -108,7 +108,7 @@ namespace randomx {
|
||||||
class JitCompilerX86;
|
class JitCompilerX86;
|
||||||
using JitCompiler = JitCompilerX86;
|
using JitCompiler = JitCompilerX86;
|
||||||
#elif defined(__aarch64__)
|
#elif defined(__aarch64__)
|
||||||
#define RANDOMX_HAVE_COMPILER 0
|
#define RANDOMX_HAVE_COMPILER 1
|
||||||
class JitCompilerA64;
|
class JitCompilerA64;
|
||||||
using JitCompiler = JitCompilerA64;
|
using JitCompiler = JitCompilerA64;
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -82,6 +82,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define HAVE_SETROUNDMODE_IMPL
|
#define HAVE_SETROUNDMODE_IMPL
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef HAVE_SETROUNDMODE_IMPL
|
||||||
|
static void setRoundMode_(uint32_t mode) {
|
||||||
|
fesetround(mode);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_ROTR64
|
#ifndef HAVE_ROTR64
|
||||||
uint64_t rotr64(uint64_t a, unsigned int b) {
|
uint64_t rotr64(uint64_t a, unsigned int b) {
|
||||||
return (a >> b) | (a << (-b & 63));
|
return (a >> b) | (a << (-b & 63));
|
||||||
|
@ -127,12 +133,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#ifdef RANDOMX_DEFAULT_FENV
|
#ifdef RANDOMX_DEFAULT_FENV
|
||||||
|
|
||||||
# ifndef HAVE_SETROUNDMODE_IMPL
|
|
||||||
static void setRoundMode_(uint32_t mode) {
|
|
||||||
fesetround(mode);
|
|
||||||
}
|
|
||||||
# endif
|
|
||||||
|
|
||||||
void rx_reset_float_state() {
|
void rx_reset_float_state() {
|
||||||
setRoundMode_(FE_TONEAREST);
|
setRoundMode_(FE_TONEAREST);
|
||||||
rx_set_double_precision(); //set precision to 53 bits if needed by the platform
|
rx_set_double_precision(); //set precision to 53 bits if needed by the platform
|
||||||
|
|
|
@ -376,11 +376,138 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
|
||||||
|
|
||||||
#define RANDOMX_DEFAULT_FENV
|
#define RANDOMX_DEFAULT_FENV
|
||||||
|
|
||||||
void rx_reset_float_state();
|
#elif defined(__aarch64__)
|
||||||
|
|
||||||
void rx_set_rounding_mode(uint32_t mode);
|
#include <stdlib.h>
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#include <arm_acle.h>
|
||||||
|
|
||||||
#else //end altivec
|
typedef uint8x16_t rx_vec_i128;
|
||||||
|
typedef float64x2_t rx_vec_f128;
|
||||||
|
|
||||||
|
inline void* rx_aligned_alloc(size_t size, size_t align) {
|
||||||
|
void* p;
|
||||||
|
if (posix_memalign(&p, align, size) == 0)
|
||||||
|
return p;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
#define rx_aligned_free(a) free(a)
|
||||||
|
|
||||||
|
inline void rx_prefetch_nta(void* ptr) {
|
||||||
|
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
|
||||||
|
return vld1q_f64((const float64_t*)pd);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 val) {
|
||||||
|
vst1q_f64((float64_t*)mem_addr, val);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
|
||||||
|
float64x2_t temp;
|
||||||
|
temp = vcopyq_laneq_f64(temp, 1, a, 1);
|
||||||
|
a = vcopyq_laneq_f64(a, 1, a, 0);
|
||||||
|
return vcopyq_laneq_f64(a, 0, temp, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
|
||||||
|
uint64x2_t temp0 = vdupq_n_u64(x0);
|
||||||
|
uint64x2_t temp1 = vdupq_n_u64(x1);
|
||||||
|
return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
|
||||||
|
return vreinterpretq_f64_u64(vdupq_n_u64(x));
|
||||||
|
}
|
||||||
|
|
||||||
|
#define rx_add_vec_f128 vaddq_f64
|
||||||
|
#define rx_sub_vec_f128 vsubq_f64
|
||||||
|
#define rx_mul_vec_f128 vmulq_f64
|
||||||
|
#define rx_div_vec_f128 vdivq_f64
|
||||||
|
#define rx_sqrt_vec_f128 vsqrtq_f64
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
|
||||||
|
return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
|
||||||
|
return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
|
||||||
|
return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __ARM_FEATURE_CRYPTO
|
||||||
|
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
|
||||||
|
const uint8x16_t zero = { 0 };
|
||||||
|
return vaesmcq_u8(vaeseq_u8(a, zero)) ^ key;
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
|
||||||
|
const uint8x16_t zero = { 0 };
|
||||||
|
return vaesimcq_u8(vaesdq_u8(a, zero)) ^ key;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAVE_AES
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define rx_xor_vec_i128 veorq_u8
|
||||||
|
|
||||||
|
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
|
||||||
|
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
|
||||||
|
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
|
||||||
|
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
|
||||||
|
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
|
||||||
|
int32_t data[4];
|
||||||
|
data[0] = _I0;
|
||||||
|
data[1] = _I1;
|
||||||
|
data[2] = _I2;
|
||||||
|
data[3] = _I3;
|
||||||
|
return vreinterpretq_u8_s32(vld1q_s32(data));
|
||||||
|
};
|
||||||
|
|
||||||
|
#define rx_xor_vec_i128 veorq_u8
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(const rx_vec_i128* mem_addr) {
|
||||||
|
return vld1q_u8((const uint8_t*)mem_addr);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* mem_addr, rx_vec_i128 val) {
|
||||||
|
vst1q_u8((uint8_t*)mem_addr, val);
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
|
||||||
|
double lo = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
|
||||||
|
double hi = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
|
||||||
|
rx_vec_f128 x;
|
||||||
|
x = vsetq_lane_f64(lo, x, 0);
|
||||||
|
x = vsetq_lane_f64(hi, x, 1);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define RANDOMX_DEFAULT_FENV
|
||||||
|
|
||||||
|
#else //portable fallback
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
@ -487,7 +614,6 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
|
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
|
||||||
rx_vec_f128 x;
|
rx_vec_f128 x;
|
||||||
x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
|
x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
|
||||||
|
@ -578,10 +704,6 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
|
||||||
|
|
||||||
#define RANDOMX_DEFAULT_FENV
|
#define RANDOMX_DEFAULT_FENV
|
||||||
|
|
||||||
void rx_reset_float_state();
|
|
||||||
|
|
||||||
void rx_set_rounding_mode(uint32_t mode);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_AES
|
#ifndef HAVE_AES
|
||||||
|
@ -598,6 +720,14 @@ FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef RANDOMX_DEFAULT_FENV
|
||||||
|
|
||||||
|
void rx_reset_float_state();
|
||||||
|
|
||||||
|
void rx_set_rounding_mode(uint32_t mode);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
double loadDoublePortable(const void* addr);
|
double loadDoublePortable(const void* addr);
|
||||||
uint64_t mulh(uint64_t, uint64_t);
|
uint64_t mulh(uint64_t, uint64_t);
|
||||||
int64_t smulh(int64_t, int64_t);
|
int64_t smulh(int64_t, int64_t);
|
||||||
|
|
1020
src/crypto/randomx/jit_compiler_a64.cpp
Normal file
1020
src/crypto/randomx/jit_compiler_a64.cpp
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,6 @@
|
||||||
/*
|
/*
|
||||||
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
||||||
|
Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
|
||||||
|
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
|
@ -32,42 +33,91 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include "crypto/randomx/common.hpp"
|
#include "crypto/randomx/common.hpp"
|
||||||
|
#include "crypto/randomx/jit_compiler_a64_static.hpp"
|
||||||
|
|
||||||
namespace randomx {
|
namespace randomx {
|
||||||
|
|
||||||
class Program;
|
class Program;
|
||||||
class ProgramConfiguration;
|
class ProgramConfiguration;
|
||||||
class SuperscalarProgram;
|
class SuperscalarProgram;
|
||||||
|
class Instruction;
|
||||||
|
|
||||||
|
typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&);
|
||||||
|
|
||||||
class JitCompilerA64 {
|
class JitCompilerA64 {
|
||||||
public:
|
public:
|
||||||
JitCompilerA64() {
|
JitCompilerA64();
|
||||||
throw std::runtime_error("ARM64 JIT compiler is not implemented yet.");
|
~JitCompilerA64();
|
||||||
}
|
|
||||||
void generateProgram(Program&, ProgramConfiguration&) {
|
void generateProgram(Program&, ProgramConfiguration&);
|
||||||
|
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
|
||||||
|
|
||||||
}
|
|
||||||
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {
|
|
||||||
|
|
||||||
}
|
|
||||||
template<size_t N>
|
template<size_t N>
|
||||||
void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) {
|
void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &);
|
||||||
|
|
||||||
}
|
void generateDatasetInitCode() {}
|
||||||
void generateDatasetInitCode() {
|
|
||||||
|
|
||||||
|
ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); }
|
||||||
|
DatasetInitFunc* getDatasetInitFunc();
|
||||||
|
uint8_t* getCode() { return code; }
|
||||||
|
size_t getCodeSize();
|
||||||
|
|
||||||
|
static InstructionGeneratorA64 engine[256];
|
||||||
|
uint32_t reg_changed_offset[8];
|
||||||
|
uint8_t* code;
|
||||||
|
uint32_t literalPos;
|
||||||
|
uint32_t num32bitLiterals;
|
||||||
|
|
||||||
|
static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
|
||||||
|
{
|
||||||
|
*(uint32_t*)(code + codePos) = val;
|
||||||
|
codePos += sizeof(val);
|
||||||
}
|
}
|
||||||
ProgramFunc* getProgramFunc() {
|
|
||||||
return nullptr;
|
static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos)
|
||||||
}
|
{
|
||||||
DatasetInitFunc* getDatasetInitFunc() {
|
*(uint64_t*)(code + codePos) = val;
|
||||||
return nullptr;
|
codePos += sizeof(val);
|
||||||
}
|
|
||||||
uint8_t* getCode() {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
size_t getCodeSize() {
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos);
|
||||||
|
void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos);
|
||||||
|
|
||||||
|
template<uint32_t tmp_reg>
|
||||||
|
void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
|
||||||
|
|
||||||
|
template<uint32_t tmp_reg_fp>
|
||||||
|
void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
|
||||||
|
|
||||||
|
void h_IADD_RS(Instruction&, uint32_t&);
|
||||||
|
void h_IADD_M(Instruction&, uint32_t&);
|
||||||
|
void h_ISUB_R(Instruction&, uint32_t&);
|
||||||
|
void h_ISUB_M(Instruction&, uint32_t&);
|
||||||
|
void h_IMUL_R(Instruction&, uint32_t&);
|
||||||
|
void h_IMUL_M(Instruction&, uint32_t&);
|
||||||
|
void h_IMULH_R(Instruction&, uint32_t&);
|
||||||
|
void h_IMULH_M(Instruction&, uint32_t&);
|
||||||
|
void h_ISMULH_R(Instruction&, uint32_t&);
|
||||||
|
void h_ISMULH_M(Instruction&, uint32_t&);
|
||||||
|
void h_IMUL_RCP(Instruction&, uint32_t&);
|
||||||
|
void h_INEG_R(Instruction&, uint32_t&);
|
||||||
|
void h_IXOR_R(Instruction&, uint32_t&);
|
||||||
|
void h_IXOR_M(Instruction&, uint32_t&);
|
||||||
|
void h_IROR_R(Instruction&, uint32_t&);
|
||||||
|
void h_IROL_R(Instruction&, uint32_t&);
|
||||||
|
void h_ISWAP_R(Instruction&, uint32_t&);
|
||||||
|
void h_FSWAP_R(Instruction&, uint32_t&);
|
||||||
|
void h_FADD_R(Instruction&, uint32_t&);
|
||||||
|
void h_FADD_M(Instruction&, uint32_t&);
|
||||||
|
void h_FSUB_R(Instruction&, uint32_t&);
|
||||||
|
void h_FSUB_M(Instruction&, uint32_t&);
|
||||||
|
void h_FSCAL_R(Instruction&, uint32_t&);
|
||||||
|
void h_FMUL_R(Instruction&, uint32_t&);
|
||||||
|
void h_FDIV_M(Instruction&, uint32_t&);
|
||||||
|
void h_FSQRT_R(Instruction&, uint32_t&);
|
||||||
|
void h_CBRANCH(Instruction&, uint32_t&);
|
||||||
|
void h_CFROUND(Instruction&, uint32_t&);
|
||||||
|
void h_ISTORE(Instruction&, uint32_t&);
|
||||||
|
void h_NOP(Instruction&, uint32_t&);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
576
src/crypto/randomx/jit_compiler_a64_static.S
Normal file
576
src/crypto/randomx/jit_compiler_a64_static.S
Normal file
|
@ -0,0 +1,576 @@
|
||||||
|
# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
||||||
|
# Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
|
||||||
|
#
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of the copyright holder nor the
|
||||||
|
# names of its contributors may be used to endorse or promote products
|
||||||
|
# derived from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||||
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
.arch armv8-a
|
||||||
|
.text
|
||||||
|
.global randomx_program_aarch64
|
||||||
|
.global randomx_program_aarch64_main_loop
|
||||||
|
.global randomx_program_aarch64_vm_instructions
|
||||||
|
.global randomx_program_aarch64_imul_rcp_literals_end
|
||||||
|
.global randomx_program_aarch64_vm_instructions_end
|
||||||
|
.global randomx_program_aarch64_cacheline_align_mask1
|
||||||
|
.global randomx_program_aarch64_cacheline_align_mask2
|
||||||
|
.global randomx_program_aarch64_update_spMix1
|
||||||
|
.global randomx_program_aarch64_vm_instructions_end_light
|
||||||
|
.global randomx_program_aarch64_light_cacheline_align_mask
|
||||||
|
.global randomx_program_aarch64_light_dataset_offset
|
||||||
|
.global randomx_init_dataset_aarch64
|
||||||
|
.global randomx_init_dataset_aarch64_end
|
||||||
|
.global randomx_calc_dataset_item_aarch64
|
||||||
|
.global randomx_calc_dataset_item_aarch64_prefetch
|
||||||
|
.global randomx_calc_dataset_item_aarch64_mix
|
||||||
|
.global randomx_calc_dataset_item_aarch64_store_result
|
||||||
|
.global randomx_calc_dataset_item_aarch64_end
|
||||||
|
|
||||||
|
# Register allocation
|
||||||
|
|
||||||
|
# x0 -> pointer to reg buffer and then literal for IMUL_RCP
|
||||||
|
# x1 -> pointer to mem buffer and then to dataset
|
||||||
|
# x2 -> pointer to scratchpad
|
||||||
|
# x3 -> loop counter
|
||||||
|
# x4 -> "r0"
|
||||||
|
# x5 -> "r1"
|
||||||
|
# x6 -> "r2"
|
||||||
|
# x7 -> "r3"
|
||||||
|
# x8 -> fpcr (reversed bits)
|
||||||
|
# x9 -> mx, ma
|
||||||
|
# x10 -> spMix1
|
||||||
|
# x11 -> literal for IMUL_RCP
|
||||||
|
# x12 -> "r4"
|
||||||
|
# x13 -> "r5"
|
||||||
|
# x14 -> "r6"
|
||||||
|
# x15 -> "r7"
|
||||||
|
# x16 -> spAddr0
|
||||||
|
# x17 -> spAddr1
|
||||||
|
# x18 -> temporary
|
||||||
|
# x19 -> temporary
|
||||||
|
# x20 -> literal for IMUL_RCP
|
||||||
|
# x21 -> literal for IMUL_RCP
|
||||||
|
# x22 -> literal for IMUL_RCP
|
||||||
|
# x23 -> literal for IMUL_RCP
|
||||||
|
# x24 -> literal for IMUL_RCP
|
||||||
|
# x25 -> literal for IMUL_RCP
|
||||||
|
# x26 -> literal for IMUL_RCP
|
||||||
|
# x27 -> literal for IMUL_RCP
|
||||||
|
# x28 -> literal for IMUL_RCP
|
||||||
|
# x29 -> literal for IMUL_RCP
|
||||||
|
# x30 -> literal for IMUL_RCP
|
||||||
|
|
||||||
|
# v0-v15 -> store 32-bit literals
|
||||||
|
# v16 -> "f0"
|
||||||
|
# v17 -> "f1"
|
||||||
|
# v18 -> "f2"
|
||||||
|
# v19 -> "f3"
|
||||||
|
# v20 -> "e0"
|
||||||
|
# v21 -> "e1"
|
||||||
|
# v22 -> "e2"
|
||||||
|
# v23 -> "e3"
|
||||||
|
# v24 -> "a0"
|
||||||
|
# v25 -> "a1"
|
||||||
|
# v26 -> "a2"
|
||||||
|
# v27 -> "a3"
|
||||||
|
# v28 -> temporary
|
||||||
|
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
|
||||||
|
# v30 -> E 'or' mask = 0x3*00000000******3*00000000******
|
||||||
|
# v31 -> scale mask = 0x81f000000000000081f0000000000000
|
||||||
|
|
||||||
|
randomx_program_aarch64:
|
||||||
|
# Save callee-saved registers
|
||||||
|
sub sp, sp, 192
|
||||||
|
stp x16, x17, [sp]
|
||||||
|
stp x18, x19, [sp, 16]
|
||||||
|
stp x20, x21, [sp, 32]
|
||||||
|
stp x22, x23, [sp, 48]
|
||||||
|
stp x24, x25, [sp, 64]
|
||||||
|
stp x26, x27, [sp, 80]
|
||||||
|
stp x28, x29, [sp, 96]
|
||||||
|
stp x8, x30, [sp, 112]
|
||||||
|
stp d8, d9, [sp, 128]
|
||||||
|
stp d10, d11, [sp, 144]
|
||||||
|
stp d12, d13, [sp, 160]
|
||||||
|
stp d14, d15, [sp, 176]
|
||||||
|
|
||||||
|
# Zero integer registers
|
||||||
|
mov x4, xzr
|
||||||
|
mov x5, xzr
|
||||||
|
mov x6, xzr
|
||||||
|
mov x7, xzr
|
||||||
|
mov x12, xzr
|
||||||
|
mov x13, xzr
|
||||||
|
mov x14, xzr
|
||||||
|
mov x15, xzr
|
||||||
|
|
||||||
|
# Load ma, mx and dataset pointer
|
||||||
|
ldp x9, x1, [x1]
|
||||||
|
|
||||||
|
# Load initial spMix value
|
||||||
|
mov x10, x9
|
||||||
|
|
||||||
|
# Load group A registers
|
||||||
|
ldp q24, q25, [x0, 192]
|
||||||
|
ldp q26, q27, [x0, 224]
|
||||||
|
|
||||||
|
# Load E 'and' mask
|
||||||
|
mov x16, 0x00FFFFFFFFFFFFFF
|
||||||
|
ins v29.d[0], x16
|
||||||
|
ins v29.d[1], x16
|
||||||
|
|
||||||
|
# Load E 'or' mask (stored in reg.f[0])
|
||||||
|
ldr q30, [x0, 64]
|
||||||
|
|
||||||
|
# Load scale mask
|
||||||
|
mov x16, 0x80f0000000000000
|
||||||
|
ins v31.d[0], x16
|
||||||
|
ins v31.d[1], x16
|
||||||
|
|
||||||
|
# Read fpcr
|
||||||
|
mrs x8, fpcr
|
||||||
|
rbit x8, x8
|
||||||
|
|
||||||
|
# Save x0
|
||||||
|
str x0, [sp, -16]!
|
||||||
|
|
||||||
|
# Read literals
|
||||||
|
ldr x0, literal_x0
|
||||||
|
ldr x11, literal_x11
|
||||||
|
ldr x20, literal_x20
|
||||||
|
ldr x21, literal_x21
|
||||||
|
ldr x22, literal_x22
|
||||||
|
ldr x23, literal_x23
|
||||||
|
ldr x24, literal_x24
|
||||||
|
ldr x25, literal_x25
|
||||||
|
ldr x26, literal_x26
|
||||||
|
ldr x27, literal_x27
|
||||||
|
ldr x28, literal_x28
|
||||||
|
ldr x29, literal_x29
|
||||||
|
ldr x30, literal_x30
|
||||||
|
|
||||||
|
ldr q0, literal_v0
|
||||||
|
ldr q1, literal_v1
|
||||||
|
ldr q2, literal_v2
|
||||||
|
ldr q3, literal_v3
|
||||||
|
ldr q4, literal_v4
|
||||||
|
ldr q5, literal_v5
|
||||||
|
ldr q6, literal_v6
|
||||||
|
ldr q7, literal_v7
|
||||||
|
ldr q8, literal_v8
|
||||||
|
ldr q9, literal_v9
|
||||||
|
ldr q10, literal_v10
|
||||||
|
ldr q11, literal_v11
|
||||||
|
ldr q12, literal_v12
|
||||||
|
ldr q13, literal_v13
|
||||||
|
ldr q14, literal_v14
|
||||||
|
ldr q15, literal_v15
|
||||||
|
|
||||||
|
randomx_program_aarch64_main_loop:
|
||||||
|
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
||||||
|
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
|
||||||
|
lsr x18, x10, 32
|
||||||
|
|
||||||
|
# Actual mask will be inserted by JIT compiler
|
||||||
|
and w16, w10, 1
|
||||||
|
and w17, w18, 1
|
||||||
|
|
||||||
|
# x16 = scratchpad + spAddr0
|
||||||
|
# x17 = scratchpad + spAddr1
|
||||||
|
add x16, x16, x2
|
||||||
|
add x17, x17, x2
|
||||||
|
|
||||||
|
# xor integer registers with scratchpad data (spAddr0)
|
||||||
|
ldp x18, x19, [x16]
|
||||||
|
eor x4, x4, x18
|
||||||
|
eor x5, x5, x19
|
||||||
|
ldp x18, x19, [x16, 16]
|
||||||
|
eor x6, x6, x18
|
||||||
|
eor x7, x7, x19
|
||||||
|
ldp x18, x19, [x16, 32]
|
||||||
|
eor x12, x12, x18
|
||||||
|
eor x13, x13, x19
|
||||||
|
ldp x18, x19, [x16, 48]
|
||||||
|
eor x14, x14, x18
|
||||||
|
eor x15, x15, x19
|
||||||
|
|
||||||
|
# Load group F registers (spAddr1)
|
||||||
|
ldpsw x18, x19, [x17]
|
||||||
|
ins v16.d[0], x18
|
||||||
|
ins v16.d[1], x19
|
||||||
|
ldpsw x18, x19, [x17, 8]
|
||||||
|
ins v17.d[0], x18
|
||||||
|
ins v17.d[1], x19
|
||||||
|
ldpsw x18, x19, [x17, 16]
|
||||||
|
ins v18.d[0], x18
|
||||||
|
ins v18.d[1], x19
|
||||||
|
ldpsw x18, x19, [x17, 24]
|
||||||
|
ins v19.d[0], x18
|
||||||
|
ins v19.d[1], x19
|
||||||
|
scvtf v16.2d, v16.2d
|
||||||
|
scvtf v17.2d, v17.2d
|
||||||
|
scvtf v18.2d, v18.2d
|
||||||
|
scvtf v19.2d, v19.2d
|
||||||
|
|
||||||
|
# Load group E registers (spAddr1)
|
||||||
|
ldpsw x18, x19, [x17, 32]
|
||||||
|
ins v20.d[0], x18
|
||||||
|
ins v20.d[1], x19
|
||||||
|
ldpsw x18, x19, [x17, 40]
|
||||||
|
ins v21.d[0], x18
|
||||||
|
ins v21.d[1], x19
|
||||||
|
ldpsw x18, x19, [x17, 48]
|
||||||
|
ins v22.d[0], x18
|
||||||
|
ins v22.d[1], x19
|
||||||
|
ldpsw x18, x19, [x17, 56]
|
||||||
|
ins v23.d[0], x18
|
||||||
|
ins v23.d[1], x19
|
||||||
|
scvtf v20.2d, v20.2d
|
||||||
|
scvtf v21.2d, v21.2d
|
||||||
|
scvtf v22.2d, v22.2d
|
||||||
|
scvtf v23.2d, v23.2d
|
||||||
|
and v20.16b, v20.16b, v29.16b
|
||||||
|
and v21.16b, v21.16b, v29.16b
|
||||||
|
and v22.16b, v22.16b, v29.16b
|
||||||
|
and v23.16b, v23.16b, v29.16b
|
||||||
|
orr v20.16b, v20.16b, v30.16b
|
||||||
|
orr v21.16b, v21.16b, v30.16b
|
||||||
|
orr v22.16b, v22.16b, v30.16b
|
||||||
|
orr v23.16b, v23.16b, v30.16b
|
||||||
|
|
||||||
|
# Execute VM instructions
|
||||||
|
randomx_program_aarch64_vm_instructions:
|
||||||
|
|
||||||
|
# 16 KB buffer for generated instructions
|
||||||
|
.fill 4096,4,0
|
||||||
|
|
||||||
|
literal_x0: .fill 1,8,0
|
||||||
|
literal_x11: .fill 1,8,0
|
||||||
|
literal_x20: .fill 1,8,0
|
||||||
|
literal_x21: .fill 1,8,0
|
||||||
|
literal_x22: .fill 1,8,0
|
||||||
|
literal_x23: .fill 1,8,0
|
||||||
|
literal_x24: .fill 1,8,0
|
||||||
|
literal_x25: .fill 1,8,0
|
||||||
|
literal_x26: .fill 1,8,0
|
||||||
|
literal_x27: .fill 1,8,0
|
||||||
|
literal_x28: .fill 1,8,0
|
||||||
|
literal_x29: .fill 1,8,0
|
||||||
|
literal_x30: .fill 1,8,0
|
||||||
|
randomx_program_aarch64_imul_rcp_literals_end:
|
||||||
|
|
||||||
|
literal_v0: .fill 2,8,0
|
||||||
|
literal_v1: .fill 2,8,0
|
||||||
|
literal_v2: .fill 2,8,0
|
||||||
|
literal_v3: .fill 2,8,0
|
||||||
|
literal_v4: .fill 2,8,0
|
||||||
|
literal_v5: .fill 2,8,0
|
||||||
|
literal_v6: .fill 2,8,0
|
||||||
|
literal_v7: .fill 2,8,0
|
||||||
|
literal_v8: .fill 2,8,0
|
||||||
|
literal_v9: .fill 2,8,0
|
||||||
|
literal_v10: .fill 2,8,0
|
||||||
|
literal_v11: .fill 2,8,0
|
||||||
|
literal_v12: .fill 2,8,0
|
||||||
|
literal_v13: .fill 2,8,0
|
||||||
|
literal_v14: .fill 2,8,0
|
||||||
|
literal_v15: .fill 2,8,0
|
||||||
|
|
||||||
|
randomx_program_aarch64_vm_instructions_end:
|
||||||
|
|
||||||
|
# mx ^= r[readReg2] ^ r[readReg3];
|
||||||
|
eor x9, x9, x18
|
||||||
|
|
||||||
|
# Calculate dataset pointer for dataset prefetch
|
||||||
|
mov w18, w9
|
||||||
|
randomx_program_aarch64_cacheline_align_mask1:
|
||||||
|
# Actual mask will be inserted by JIT compiler
|
||||||
|
and x18, x18, 1
|
||||||
|
add x18, x18, x1
|
||||||
|
|
||||||
|
# Prefetch dataset data
|
||||||
|
prfm pldl2strm, [x18]
|
||||||
|
|
||||||
|
# mx <-> ma
|
||||||
|
ror x9, x9, 32
|
||||||
|
|
||||||
|
# Calculate dataset pointer for dataset read
|
||||||
|
mov w10, w9
|
||||||
|
randomx_program_aarch64_cacheline_align_mask2:
|
||||||
|
# Actual mask will be inserted by JIT compiler
|
||||||
|
and x10, x10, 1
|
||||||
|
add x10, x10, x1
|
||||||
|
|
||||||
|
randomx_program_aarch64_xor_with_dataset_line:
|
||||||
|
# xor integer registers with dataset data
|
||||||
|
ldp x18, x19, [x10]
|
||||||
|
eor x4, x4, x18
|
||||||
|
eor x5, x5, x19
|
||||||
|
ldp x18, x19, [x10, 16]
|
||||||
|
eor x6, x6, x18
|
||||||
|
eor x7, x7, x19
|
||||||
|
ldp x18, x19, [x10, 32]
|
||||||
|
eor x12, x12, x18
|
||||||
|
eor x13, x13, x19
|
||||||
|
ldp x18, x19, [x10, 48]
|
||||||
|
eor x14, x14, x18
|
||||||
|
eor x15, x15, x19
|
||||||
|
|
||||||
|
randomx_program_aarch64_update_spMix1:
|
||||||
|
# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1"
|
||||||
|
eor x10, x0, x0
|
||||||
|
|
||||||
|
# Store integer registers to scratchpad (spAddr1)
|
||||||
|
stp x4, x5, [x17, 0]
|
||||||
|
stp x6, x7, [x17, 16]
|
||||||
|
stp x12, x13, [x17, 32]
|
||||||
|
stp x14, x15, [x17, 48]
|
||||||
|
|
||||||
|
# xor group F and group E registers
|
||||||
|
eor v16.16b, v16.16b, v20.16b
|
||||||
|
eor v17.16b, v17.16b, v21.16b
|
||||||
|
eor v18.16b, v18.16b, v22.16b
|
||||||
|
eor v19.16b, v19.16b, v23.16b
|
||||||
|
|
||||||
|
# Store FP registers to scratchpad (spAddr0)
|
||||||
|
stp q16, q17, [x16, 0]
|
||||||
|
stp q18, q19, [x16, 32]
|
||||||
|
|
||||||
|
subs x3, x3, 1
|
||||||
|
bne randomx_program_aarch64_main_loop
|
||||||
|
|
||||||
|
# Restore x0
|
||||||
|
ldr x0, [sp], 16
|
||||||
|
|
||||||
|
# Store integer registers
|
||||||
|
stp x4, x5, [x0, 0]
|
||||||
|
stp x6, x7, [x0, 16]
|
||||||
|
stp x12, x13, [x0, 32]
|
||||||
|
stp x14, x15, [x0, 48]
|
||||||
|
|
||||||
|
# Store FP registers
|
||||||
|
stp q16, q17, [x0, 64]
|
||||||
|
stp q18, q19, [x0, 96]
|
||||||
|
stp q20, q21, [x0, 128]
|
||||||
|
stp q22, q23, [x0, 160]
|
||||||
|
|
||||||
|
# Restore callee-saved registers
|
||||||
|
ldp x16, x17, [sp]
|
||||||
|
ldp x18, x19, [sp, 16]
|
||||||
|
ldp x20, x21, [sp, 32]
|
||||||
|
ldp x22, x23, [sp, 48]
|
||||||
|
ldp x24, x25, [sp, 64]
|
||||||
|
ldp x26, x27, [sp, 80]
|
||||||
|
ldp x28, x29, [sp, 96]
|
||||||
|
ldp x8, x30, [sp, 112]
|
||||||
|
ldp d8, d9, [sp, 128]
|
||||||
|
ldp d10, d11, [sp, 144]
|
||||||
|
ldp d12, d13, [sp, 160]
|
||||||
|
ldp d14, d15, [sp, 176]
|
||||||
|
add sp, sp, 192
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
randomx_program_aarch64_vm_instructions_end_light:
|
||||||
|
sub sp, sp, 96
|
||||||
|
stp x0, x1, [sp, 64]
|
||||||
|
stp x2, x30, [sp, 80]
|
||||||
|
|
||||||
|
# mx ^= r[readReg2] ^ r[readReg3];
|
||||||
|
eor x9, x9, x18
|
||||||
|
|
||||||
|
# mx <-> ma
|
||||||
|
ror x9, x9, 32
|
||||||
|
|
||||||
|
# x0 -> pointer to cache memory
|
||||||
|
mov x0, x1
|
||||||
|
|
||||||
|
# x1 -> pointer to output
|
||||||
|
mov x1, sp
|
||||||
|
|
||||||
|
randomx_program_aarch64_light_cacheline_align_mask:
|
||||||
|
# Actual mask will be inserted by JIT compiler
|
||||||
|
and w2, w9, 1
|
||||||
|
|
||||||
|
# x2 -> item number
|
||||||
|
lsr x2, x2, 6
|
||||||
|
|
||||||
|
randomx_program_aarch64_light_dataset_offset:
|
||||||
|
# Apply dataset offset (filled in by JIT compiler)
|
||||||
|
add x2, x2, 0
|
||||||
|
add x2, x2, 0
|
||||||
|
|
||||||
|
bl randomx_calc_dataset_item_aarch64
|
||||||
|
|
||||||
|
mov x10, sp
|
||||||
|
ldp x0, x1, [sp, 64]
|
||||||
|
ldp x2, x30, [sp, 80]
|
||||||
|
add sp, sp, 96
|
||||||
|
|
||||||
|
b randomx_program_aarch64_xor_with_dataset_line
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Input parameters
|
||||||
|
#
|
||||||
|
# x0 -> pointer to cache
|
||||||
|
# x1 -> pointer to dataset memory at startItem
|
||||||
|
# x2 -> start item
|
||||||
|
# x3 -> end item
|
||||||
|
|
||||||
|
randomx_init_dataset_aarch64:
|
||||||
|
# Save x30 (return address)
|
||||||
|
str x30, [sp, -16]!
|
||||||
|
|
||||||
|
# Load pointer to cache memory
|
||||||
|
ldr x0, [x0]
|
||||||
|
|
||||||
|
randomx_init_dataset_aarch64_main_loop:
|
||||||
|
bl randomx_calc_dataset_item_aarch64
|
||||||
|
add x1, x1, 64
|
||||||
|
add x2, x2, 1
|
||||||
|
cmp x2, x3
|
||||||
|
bne randomx_init_dataset_aarch64_main_loop
|
||||||
|
|
||||||
|
# Restore x30 (return address)
|
||||||
|
ldr x30, [sp], 16
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
randomx_init_dataset_aarch64_end:
|
||||||
|
|
||||||
|
# Input parameters
|
||||||
|
#
|
||||||
|
# x0 -> pointer to cache memory
|
||||||
|
# x1 -> pointer to output
|
||||||
|
# x2 -> item number
|
||||||
|
#
|
||||||
|
# Register allocation
|
||||||
|
#
|
||||||
|
# x0-x7 -> output value (calculated dataset item)
|
||||||
|
# x8 -> pointer to cache memory
|
||||||
|
# x9 -> pointer to output
|
||||||
|
# x10 -> registerValue
|
||||||
|
# x11 -> mixBlock
|
||||||
|
# x12 -> temporary
|
||||||
|
# x13 -> temporary
|
||||||
|
|
||||||
|
randomx_calc_dataset_item_aarch64:
|
||||||
|
sub sp, sp, 112
|
||||||
|
stp x0, x1, [sp]
|
||||||
|
stp x2, x3, [sp, 16]
|
||||||
|
stp x4, x5, [sp, 32]
|
||||||
|
stp x6, x7, [sp, 48]
|
||||||
|
stp x8, x9, [sp, 64]
|
||||||
|
stp x10, x11, [sp, 80]
|
||||||
|
stp x12, x13, [sp, 96]
|
||||||
|
|
||||||
|
mov x8, x0
|
||||||
|
mov x9, x1
|
||||||
|
mov x10, x2
|
||||||
|
|
||||||
|
# rl[0] = (itemNumber + 1) * superscalarMul0;
|
||||||
|
ldr x12, superscalarMul0
|
||||||
|
madd x0, x2, x12, x12
|
||||||
|
|
||||||
|
# rl[1] = rl[0] ^ superscalarAdd1;
|
||||||
|
ldr x12, superscalarAdd1
|
||||||
|
eor x1, x0, x12
|
||||||
|
|
||||||
|
# rl[2] = rl[0] ^ superscalarAdd2;
|
||||||
|
ldr x12, superscalarAdd2
|
||||||
|
eor x2, x0, x12
|
||||||
|
|
||||||
|
# rl[3] = rl[0] ^ superscalarAdd3;
|
||||||
|
ldr x12, superscalarAdd3
|
||||||
|
eor x3, x0, x12
|
||||||
|
|
||||||
|
# rl[4] = rl[0] ^ superscalarAdd4;
|
||||||
|
ldr x12, superscalarAdd4
|
||||||
|
eor x4, x0, x12
|
||||||
|
|
||||||
|
# rl[5] = rl[0] ^ superscalarAdd5;
|
||||||
|
ldr x12, superscalarAdd5
|
||||||
|
eor x5, x0, x12
|
||||||
|
|
||||||
|
# rl[6] = rl[0] ^ superscalarAdd6;
|
||||||
|
ldr x12, superscalarAdd6
|
||||||
|
eor x6, x0, x12
|
||||||
|
|
||||||
|
# rl[7] = rl[0] ^ superscalarAdd7;
|
||||||
|
ldr x12, superscalarAdd7
|
||||||
|
eor x7, x0, x12
|
||||||
|
|
||||||
|
b randomx_calc_dataset_item_aarch64_prefetch
|
||||||
|
|
||||||
|
superscalarMul0: .quad 6364136223846793005
|
||||||
|
superscalarAdd1: .quad 9298411001130361340
|
||||||
|
superscalarAdd2: .quad 12065312585734608966
|
||||||
|
superscalarAdd3: .quad 9306329213124626780
|
||||||
|
superscalarAdd4: .quad 5281919268842080866
|
||||||
|
superscalarAdd5: .quad 10536153434571861004
|
||||||
|
superscalarAdd6: .quad 3398623926847679864
|
||||||
|
superscalarAdd7: .quad 9549104520008361294
|
||||||
|
|
||||||
|
# Prefetch -> SuperScalar hash -> Mix will be repeated N times
|
||||||
|
|
||||||
|
randomx_calc_dataset_item_aarch64_prefetch:
|
||||||
|
# Actual mask will be inserted by JIT compiler
|
||||||
|
and x11, x10, 1
|
||||||
|
add x11, x8, x11, lsl 6
|
||||||
|
prfm pldl2strm, [x11]
|
||||||
|
|
||||||
|
# Generated SuperScalar hash program goes here
|
||||||
|
|
||||||
|
randomx_calc_dataset_item_aarch64_mix:
|
||||||
|
ldp x12, x13, [x11]
|
||||||
|
eor x0, x0, x12
|
||||||
|
eor x1, x1, x13
|
||||||
|
ldp x12, x13, [x11, 16]
|
||||||
|
eor x2, x2, x12
|
||||||
|
eor x3, x3, x13
|
||||||
|
ldp x12, x13, [x11, 32]
|
||||||
|
eor x4, x4, x12
|
||||||
|
eor x5, x5, x13
|
||||||
|
ldp x12, x13, [x11, 48]
|
||||||
|
eor x6, x6, x12
|
||||||
|
eor x7, x7, x13
|
||||||
|
|
||||||
|
randomx_calc_dataset_item_aarch64_store_result:
|
||||||
|
stp x0, x1, [x9]
|
||||||
|
stp x2, x3, [x9, 16]
|
||||||
|
stp x4, x5, [x9, 32]
|
||||||
|
stp x6, x7, [x9, 48]
|
||||||
|
|
||||||
|
ldp x0, x1, [sp]
|
||||||
|
ldp x2, x3, [sp, 16]
|
||||||
|
ldp x4, x5, [sp, 32]
|
||||||
|
ldp x6, x7, [sp, 48]
|
||||||
|
ldp x8, x9, [sp, 64]
|
||||||
|
ldp x10, x11, [sp, 80]
|
||||||
|
ldp x12, x13, [sp, 96]
|
||||||
|
add sp, sp, 112
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
randomx_calc_dataset_item_aarch64_end:
|
51
src/crypto/randomx/jit_compiler_a64_static.hpp
Normal file
51
src/crypto/randomx/jit_compiler_a64_static.hpp
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
/*
|
||||||
|
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
||||||
|
Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||||
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
|
||||||
|
void randomx_program_aarch64_main_loop();
|
||||||
|
void randomx_program_aarch64_vm_instructions();
|
||||||
|
void randomx_program_aarch64_imul_rcp_literals_end();
|
||||||
|
void randomx_program_aarch64_vm_instructions_end();
|
||||||
|
void randomx_program_aarch64_cacheline_align_mask1();
|
||||||
|
void randomx_program_aarch64_cacheline_align_mask2();
|
||||||
|
void randomx_program_aarch64_update_spMix1();
|
||||||
|
void randomx_program_aarch64_vm_instructions_end_light();
|
||||||
|
void randomx_program_aarch64_light_cacheline_align_mask();
|
||||||
|
void randomx_program_aarch64_light_dataset_offset();
|
||||||
|
void randomx_init_dataset_aarch64();
|
||||||
|
void randomx_init_dataset_aarch64_end();
|
||||||
|
void randomx_calc_dataset_item_aarch64();
|
||||||
|
void randomx_calc_dataset_item_aarch64_prefetch();
|
||||||
|
void randomx_calc_dataset_item_aarch64_mix();
|
||||||
|
void randomx_calc_dataset_item_aarch64_store_result();
|
||||||
|
void randomx_calc_dataset_item_aarch64_end();
|
||||||
|
}
|
|
@ -26,6 +26,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "crypto/randomx/common.hpp"
|
||||||
#include "crypto/randomx/randomx.h"
|
#include "crypto/randomx/randomx.h"
|
||||||
#include "crypto/randomx/dataset.hpp"
|
#include "crypto/randomx/dataset.hpp"
|
||||||
#include "crypto/randomx/vm_interpreted.hpp"
|
#include "crypto/randomx/vm_interpreted.hpp"
|
||||||
|
@ -33,7 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "crypto/randomx/vm_compiled.hpp"
|
#include "crypto/randomx/vm_compiled.hpp"
|
||||||
#include "crypto/randomx/vm_compiled_light.hpp"
|
#include "crypto/randomx/vm_compiled_light.hpp"
|
||||||
#include "crypto/randomx/blake2/blake2.h"
|
#include "crypto/randomx/blake2/blake2.h"
|
||||||
|
|
||||||
|
#if defined(_M_X64) || defined(__x86_64__)
|
||||||
#include "crypto/randomx/jit_compiler_x86_static.hpp"
|
#include "crypto/randomx/jit_compiler_x86_static.hpp"
|
||||||
|
#elif defined(XMRIG_ARM)
|
||||||
|
#include "crypto/randomx/jit_compiler_a64_static.hpp"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
|
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
|
||||||
|
@ -156,19 +163,10 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
|
||||||
|
|
||||||
void RandomX_ConfigurationBase::Apply()
|
void RandomX_ConfigurationBase::Apply()
|
||||||
{
|
{
|
||||||
#if defined(_M_X64) || defined(__x86_64__)
|
|
||||||
*(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
|
|
||||||
const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE;
|
|
||||||
*(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask;
|
|
||||||
*(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask;
|
|
||||||
*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1);
|
|
||||||
DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE;
|
|
||||||
|
|
||||||
ScratchpadL1Mask_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) - 1) * 8;
|
ScratchpadL1Mask_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) - 1) * 8;
|
||||||
ScratchpadL1Mask16_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) / 2 - 1) * 16;
|
ScratchpadL1Mask16_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) / 2 - 1) * 16;
|
||||||
ScratchpadL2Mask_Calculated = (ScratchpadL2_Size / sizeof(uint64_t) - 1) * 8;
|
ScratchpadL2Mask_Calculated = (ScratchpadL2_Size / sizeof(uint64_t) - 1) * 8;
|
||||||
|
@ -176,22 +174,40 @@ void RandomX_ConfigurationBase::Apply()
|
||||||
ScratchpadL3Mask_Calculated = (((ScratchpadL3_Size / sizeof(uint64_t)) - 1) * 8);
|
ScratchpadL3Mask_Calculated = (((ScratchpadL3_Size / sizeof(uint64_t)) - 1) * 8);
|
||||||
ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;
|
ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;
|
||||||
|
|
||||||
#if defined(_M_X64) || defined(__x86_64__)
|
CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1);
|
||||||
*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
|
DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE;
|
||||||
*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ConditionMask_Calculated = (1 << JumpBits) - 1;
|
ConditionMask_Calculated = (1 << JumpBits) - 1;
|
||||||
|
|
||||||
constexpr int CEIL_NULL = 0;
|
|
||||||
int k = 0;
|
|
||||||
|
|
||||||
#if defined(_M_X64) || defined(__x86_64__)
|
#if defined(_M_X64) || defined(__x86_64__)
|
||||||
|
*(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
|
||||||
|
const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE;
|
||||||
|
*(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask;
|
||||||
|
*(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask;
|
||||||
|
*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
|
||||||
|
|
||||||
|
*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
|
||||||
|
*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
|
||||||
|
|
||||||
#define JIT_HANDLE(x, prev) randomx::JitCompilerX86::engine[k] = &randomx::JitCompilerX86::h_##x
|
#define JIT_HANDLE(x, prev) randomx::JitCompilerX86::engine[k] = &randomx::JitCompilerX86::h_##x
|
||||||
|
|
||||||
|
#elif defined(XMRIG_ARM)
|
||||||
|
|
||||||
|
Log2_ScratchpadL1 = Log2(ScratchpadL1_Size);
|
||||||
|
Log2_ScratchpadL2 = Log2(ScratchpadL2_Size);
|
||||||
|
Log2_ScratchpadL3 = Log2(ScratchpadL3_Size);
|
||||||
|
Log2_DatasetBaseSize = Log2(DatasetBaseSize);
|
||||||
|
Log2_CacheSize = Log2((ArgonMemory * randomx::ArgonBlockSize) / randomx::CacheLineSize);
|
||||||
|
|
||||||
|
#define JIT_HANDLE(x, prev) randomx::JitCompilerA64::engine[k] = &randomx::JitCompilerA64::h_##x
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#define JIT_HANDLE(x, prev)
|
#define JIT_HANDLE(x, prev)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
constexpr int CEIL_NULL = 0;
|
||||||
|
int k = 0;
|
||||||
|
|
||||||
#define INST_HANDLE(x, prev) \
|
#define INST_HANDLE(x, prev) \
|
||||||
CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
|
CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
|
||||||
for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); }
|
for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); }
|
||||||
|
@ -435,12 +451,12 @@ extern "C" {
|
||||||
assert(inputSize == 0 || input != nullptr);
|
assert(inputSize == 0 || input != nullptr);
|
||||||
assert(output != nullptr);
|
assert(output != nullptr);
|
||||||
alignas(16) uint64_t tempHash[8];
|
alignas(16) uint64_t tempHash[8];
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
|
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
|
||||||
machine->initScratchpad(&tempHash);
|
machine->initScratchpad(&tempHash);
|
||||||
machine->resetRoundingMode();
|
machine->resetRoundingMode();
|
||||||
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
||||||
machine->run(&tempHash);
|
machine->run(&tempHash);
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
|
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
|
||||||
}
|
}
|
||||||
machine->run(&tempHash);
|
machine->run(&tempHash);
|
||||||
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
|
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
|
||||||
|
|
|
@ -133,6 +133,14 @@ struct RandomX_ConfigurationBase
|
||||||
|
|
||||||
uint32_t ConditionMask_Calculated;
|
uint32_t ConditionMask_Calculated;
|
||||||
|
|
||||||
|
#ifdef XMRIG_ARM
|
||||||
|
uint32_t Log2_ScratchpadL1;
|
||||||
|
uint32_t Log2_ScratchpadL2;
|
||||||
|
uint32_t Log2_ScratchpadL3;
|
||||||
|
uint32_t Log2_DatasetBaseSize;
|
||||||
|
uint32_t Log2_CacheSize;
|
||||||
|
#endif
|
||||||
|
|
||||||
int CEIL_IADD_RS;
|
int CEIL_IADD_RS;
|
||||||
int CEIL_IADD_M;
|
int CEIL_IADD_M;
|
||||||
int CEIL_ISUB_R;
|
int CEIL_ISUB_R;
|
||||||
|
|
|
@ -50,6 +50,9 @@ namespace randomx {
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void CompiledVm<softAes>::execute() {
|
void CompiledVm<softAes>::execute() {
|
||||||
|
#ifdef XMRIG_ARM
|
||||||
|
memcpy(reg.f, config.eMask, sizeof(config.eMask));
|
||||||
|
#endif
|
||||||
compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations);
|
compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,11 +33,9 @@
|
||||||
|
|
||||||
xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes)
|
xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes)
|
||||||
{
|
{
|
||||||
# ifndef XMRIG_ARM
|
|
||||||
if (!softAes) {
|
if (!softAes) {
|
||||||
m_flags |= RANDOMX_FLAG_HARD_AES;
|
m_flags |= RANDOMX_FLAG_HARD_AES;
|
||||||
}
|
}
|
||||||
# endif
|
|
||||||
|
|
||||||
if (dataset->get()) {
|
if (dataset->get()) {
|
||||||
m_flags |= RANDOMX_FLAG_FULL_MEM;
|
m_flags |= RANDOMX_FLAG_FULL_MEM;
|
||||||
|
|
Loading…
Reference in a new issue