CryptonightR support for Wownero

This commit is contained in:
SChernykh 2019-02-10 15:21:17 +01:00
parent 73852f44c6
commit e3f2c38fff
32 changed files with 11251 additions and 131 deletions

View file

@ -86,6 +86,7 @@ set(HEADERS_CRYPTO
src/crypto/hash.h
src/crypto/skein_port.h
src/crypto/soft_aes.h
src/crypto/asm/CryptonightR_template.h
)
if (XMRIG_ARM)
@ -135,6 +136,7 @@ set(SOURCES_CRYPTO
src/crypto/c_blake256.c
src/crypto/c_jh.c
src/crypto/c_skein.c
src/crypto/CryptonightR_gen.cpp
)
if (WIN32)

View file

@ -5,9 +5,15 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
enable_language(ASM_MASM)
if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.asm")
set(XMRIG_ASM_FILE
"src/crypto/asm/cn_main_loop.asm"
"src/crypto/asm/CryptonightR_template.asm"
)
else()
set(XMRIG_ASM_FILE "src/crypto/asm/win64/cn_main_loop.asm")
set(XMRIG_ASM_FILE
"src/crypto/asm/win64/cn_main_loop.asm"
"src/crypto/asm/win64/CryptonightR_template.asm"
)
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
@ -15,9 +21,15 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
enable_language(ASM)
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
set(XMRIG_ASM_FILE "src/crypto/asm/win64/cn_main_loop.S")
set(XMRIG_ASM_FILE
"src/crypto/asm/win64/cn_main_loop.S"
"src/crypto/asm/win64/CryptonightR_template.S"
)
else()
set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
set(XMRIG_ASM_FILE
"src/crypto/asm/cn_main_loop.S"
"src/crypto/asm/CryptonightR_template.S"
)
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)

View file

@ -51,6 +51,12 @@ MemInfo Mem::create(cryptonight_ctx **ctx, xmrig::Algo algorithm, size_t count)
cryptonight_ctx *c = static_cast<cryptonight_ctx *>(_mm_malloc(sizeof(cryptonight_ctx), 4096));
c->memory = info.memory + (i * cn_select_memory(algorithm));
uint8_t* p = reinterpret_cast<uint8_t*>(allocateExecutableMemory(0x4000));
c->generated_code = reinterpret_cast<cn_mainloop_fun_ms_abi>(p);
c->generated_code_double = reinterpret_cast<cn_mainloop_double_fun_ms_abi>(p + 0x2000);
c->generated_code_height = (uint64_t)(-1);
c->generated_code_double_height = (uint64_t)(-1);
ctx[i] = c;
}

View file

@ -64,6 +64,7 @@ static AlgoData const algorithms[] = {
{ "cryptonight/2", "cn/2", xmrig::CRYPTONIGHT, xmrig::VARIANT_2 },
{ "cryptonight/half", "cn/half", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF },
{ "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF },
{ "cryptonight/wow", "cn/wow", xmrig::CRYPTONIGHT, xmrig::VARIANT_WOW },
# ifndef XMRIG_NO_AEON
{ "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO },
@ -127,7 +128,8 @@ static const char *variants[] = {
"2",
"half",
"trtl",
"gpu"
"gpu",
"wow",
};

View file

@ -354,6 +354,14 @@ bool Client::parseJob(const rapidjson::Value &params, int *code)
}
}
if (params.HasMember("height")) {
const rapidjson::Value &variant = params["height"];
if (variant.IsInt()) {
job.setHeight(variant.GetInt());
}
}
if (!verifyAlgorithm(job.algorithm())) {
*code = 6;

View file

@ -31,7 +31,7 @@
#include "common/net/Job.h"
static inline unsigned char hf_hex2bin(char c, bool &err)
unsigned char hf_hex2bin(char c, bool &err)
{
if (c >= '0' && c <= '9') {
return c - '0';
@ -48,7 +48,7 @@ static inline unsigned char hf_hex2bin(char c, bool &err)
}
static inline char hf_bin2hex(unsigned char c)
char hf_bin2hex(unsigned char c)
{
if (c <= 0x9) {
return '0' + c;
@ -66,7 +66,8 @@ Job::Job() :
m_size(0),
m_diff(0),
m_target(0),
m_blob()
m_blob(),
m_height(0)
{
}
@ -80,6 +81,7 @@ Job::Job(int poolId, bool nicehash, const xmrig::Algorithm &algorithm, const xmr
m_diff(0),
m_target(0),
m_blob(),
m_height(0),
m_algorithm(algorithm),
m_clientId(clientId)
{
@ -195,6 +197,12 @@ void Job::setAlgorithm(const char *algo)
}
void Job::setHeight(uint64_t height)
{
m_height = height;
}
bool Job::fromHex(const char* in, unsigned int len, unsigned char* out)
{
bool error = false;

View file

@ -50,6 +50,7 @@ public:
bool setBlob(const char *blob);
bool setTarget(const char *target);
void setAlgorithm(const char *algo);
void setHeight(uint64_t height);
inline bool isNicehash() const { return m_nicehash; }
inline bool isValid() const { return m_size > 0 && m_diff > 0; }
@ -65,6 +66,7 @@ public:
inline uint32_t *nonce() { return reinterpret_cast<uint32_t*>(m_blob + 39); }
inline uint32_t diff() const { return static_cast<uint32_t>(m_diff); }
inline uint64_t target() const { return m_target; }
inline uint64_t height() const { return m_height; }
inline void reset() { m_size = 0; m_diff = 0; }
inline void setClientId(const xmrig::Id &id) { m_clientId = id; }
inline void setPoolId(int poolId) { m_poolId = poolId; }
@ -100,6 +102,7 @@ private:
uint64_t m_diff;
uint64_t m_target;
uint8_t m_blob[kMaxBlobSize];
uint64_t m_height;
xmrig::Algorithm m_algorithm;
xmrig::Id m_clientId;
xmrig::Id m_id;

View file

@ -412,6 +412,7 @@ void Pool::rebuild()
m_algorithms.push_back(m_algorithm);
# ifndef XMRIG_PROXY_PROJECT
addVariant(xmrig::VARIANT_WOW);
addVariant(xmrig::VARIANT_2);
addVariant(xmrig::VARIANT_1);
addVariant(xmrig::VARIANT_0);

View file

@ -74,6 +74,7 @@ enum Variant {
VARIANT_HALF = 9, // CryptoNight variant 2 with half iterations (Masari/Stellite)
VARIANT_TRTL = 10, // CryptoNight Turtle (TRTL)
VARIANT_GPU = 11, // CryptoNight-GPU (Ryo)
VARIANT_WOW = 12, // CryptoNightR (Wownero)
VARIANT_MAX
};

View file

@ -29,10 +29,23 @@
#include <stddef.h>
#include <stdint.h>
#ifdef _MSC_VER
#define ABI_ATTRIBUTE
#else
#define ABI_ATTRIBUTE __attribute__((ms_abi))
#endif
struct cryptonight_ctx;
typedef void(*cn_mainloop_fun_ms_abi)(cryptonight_ctx*) ABI_ATTRIBUTE;
typedef void(*cn_mainloop_double_fun_ms_abi)(cryptonight_ctx*, cryptonight_ctx*) ABI_ATTRIBUTE;
struct cryptonight_ctx {
alignas(16) uint8_t state[224];
alignas(16) uint8_t *memory;
cn_mainloop_fun_ms_abi generated_code;
cn_mainloop_double_fun_ms_abi generated_code_double;
uint64_t generated_code_height;
uint64_t generated_code_double_height;
};

View file

@ -455,7 +455,7 @@ static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m1
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -476,6 +476,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
VARIANT1_INIT(0);
VARIANT2_INIT(0);
VARIANT4_RANDOM_MATH_INIT(0);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
@ -515,11 +516,15 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
if (BASE == xmrig::VARIANT_2) {
VARIANT2_INTEGER_MATH(0, cl, cx);
lo = __umul128(idx0, cl, &hi);
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
}
else {
lo = __umul128(idx0, cl, &hi);
else if ((VARIANT == xmrig::VARIANT_4) || (VARIANT == xmrig::VARIANT_4_64)) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
}
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
}
al0 += hi;
@ -575,7 +580,7 @@ void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad);
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::CRYPTONIGHT_GPU_MASK;
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -599,7 +604,7 @@ inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -623,6 +628,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
VARIANT1_INIT(1);
VARIANT2_INIT(0);
VARIANT2_INIT(1);
VARIANT4_RANDOM_MATH_INIT(0);
VARIANT4_RANDOM_MATH_INIT(1);
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@ -679,10 +686,15 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
if (BASE == xmrig::VARIANT_2) {
VARIANT2_INTEGER_MATH(0, cl, cx0);
lo = __umul128(idx0, cl, &hi);
}
else if ((VARIANT == xmrig::VARIANT_4) || (VARIANT == xmrig::VARIANT_4_64)) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
}
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo);
} else {
lo = __umul128(idx0, cl, &hi);
}
al0 += hi;
@ -702,7 +714,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ah0 ^= ch;
idx0 = al0;
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l0[idx0 & MASK]));
const int64_t n = vgetq_lane_s64(x, 0);
const int32_t d = vgetq_lane_s32(x, 2);
@ -723,10 +735,15 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
if (BASE == xmrig::VARIANT_2) {
VARIANT2_INTEGER_MATH(1, cl, cx1);
lo = __umul128(idx1, cl, &hi);
}
else if ((VARIANT == xmrig::VARIANT_4) || (VARIANT == xmrig::VARIANT_4_64)) {
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
}
lo = __umul128(idx1, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo);
} else {
lo = __umul128(idx1, cl, &hi);
}
al1 += hi;
@ -761,7 +778,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
idx1 = d ^ q;
}
}
if (VARIANT == xmrig::VARIANT_2) {
if (BASE == xmrig::VARIANT_2) {
bx01 = bx00;
bx11 = bx10;
}
@ -781,19 +798,19 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
}
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
}
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
}

View file

@ -126,6 +126,7 @@ template<Algo ALGO, Variant variant> inline constexpr uint32_t cn_select_iter()
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_0>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_1>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_2>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_WOW>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_XTL>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_HALF>() { return CRYPTONIGHT_HALF_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_MSR>() { return CRYPTONIGHT_HALF_ITER; }
@ -192,6 +193,7 @@ template<> inline constexpr Variant cn_base_variant<VARIANT_2>() { return VA
template<> inline constexpr Variant cn_base_variant<VARIANT_HALF>() { return VARIANT_2; }
template<> inline constexpr Variant cn_base_variant<VARIANT_TRTL>() { return VARIANT_2; }
template<> inline constexpr Variant cn_base_variant<VARIANT_GPU>() { return VARIANT_GPU; }
template<> inline constexpr Variant cn_base_variant<VARIANT_WOW>() { return VARIANT_2; }
} /* namespace xmrig */

View file

@ -147,4 +147,33 @@
vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
} while (0)
#endif
#define SWAP32LE(x) x
#define SWAP64LE(x) x
#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
#include "variant4_random_math.h"
#define VARIANT4_RANDOM_MATH_INIT(part) \
uint32_t r##part[8]; \
uint64_t r64_##part[8]; \
struct V4_Instruction code##part[256]; \
if (VARIANT == xmrig::VARIANT_WOW) { \
r##part[0] = (uint32_t)(h##part[12]); \
r##part[1] = (uint32_t)(h##part[12] >> 32); \
r##part[2] = (uint32_t)(h##part[13]); \
r##part[3] = (uint32_t)(h##part[13] >> 32); \
} \
v4_random_math_init(code##part, height);
#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
if (VARIANT == xmrig::VARIANT_WOW) { \
cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
r##part[4] = static_cast<uint32_t>(al); \
r##part[5] = static_cast<uint32_t>(ah); \
r##part[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \
r##part[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \
v4_random_math(code##part, r##part); \
}
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */

View file

@ -58,6 +58,18 @@ const static uint8_t test_input[380] = {
0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04
};
const static char* test_input_WOW = R"===(
9d47bf4c41b7e8e727e681715acb47fa1677cdba9ca7bcb05ad8cc8abd5daa66 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260
0d4a495cb844a3ca8ba4edb8e6bcf829ef1c06d9cdea2b62ca46c2a21b8b0a79 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261
a1d6d848b5c5915fccd2f64cf216c6b1a02cf7c77bc80d8d4e51b419e88ff0dd 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262
af3a8544a0221a148c2ac90484b19861e3afca33fe17021efb8ad6496b567915 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263
313399e0963ae8a99dab8af66d343e097dae0c0feb08dbc43ccdafef5515f413 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264
6021c6ef90bff9ae94a7506d623d3a7a86c1756d655f50dd558f716d64622a34 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265
2b13000535f3db5f9b9b84a65c4351f386cd2cdedebb8c3ad2eab086e6a3fee5 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266
fc0e1dad8e895749dc90eb690bc1ba059a1cd772afaaf65a106bf9e5e6b80503 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267
b60b0afe144deff7d903ed2d5545e77ebe66a3c51fee7016eeb8fee9eb630c0f 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268
64774b27e7d5fec862fc4c0c13ac6bf09123b6f05bb0e4b75c97f379a2b3a679 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269
)===";
// "cn/0"
const static uint8_t test_output_v0[160] = {
@ -79,7 +91,7 @@ const static uint8_t test_output_v1[160] = {
0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22,
0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22,
0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98,
0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C,
0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE,
@ -274,6 +286,8 @@ const static uint8_t test_output_pico_trtl[160] = {
};
#endif
unsigned char hf_hex2bin(char c, bool &err);
char hf_bin2hex(unsigned char c);
#ifndef XMRIG_NO_CN_GPU
// "cn/gpu"

View file

@ -480,7 +480,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l,
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -504,6 +504,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
VARIANT1_INIT(0);
VARIANT2_INIT(0);
VARIANT2_SET_ROUNDING_MODE();
VARIANT4_RANDOM_MATH_INIT(0);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
@ -525,7 +526,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
else if (SOFT_AES) {
cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
}
else {
else {
cx = _mm_aesenc_si128(cx, ax0);
}
@ -542,12 +543,17 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
VARIANT2_INTEGER_MATH(0, cl, cx);
lo = __umul128(idx0, cl, &hi);
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
if (VARIANT == xmrig::VARIANT_WOW) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
} else {
VARIANT2_INTEGER_MATH(0, cl, cx);
}
}
else {
lo = __umul128(idx0, cl, &hi);
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
}
al0 += hi;
@ -605,7 +611,7 @@ void cn_gpu_inner_ssse3(const uint8_t *spad, uint8_t *lpad);
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::CRYPTONIGHT_GPU_MASK;
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -640,7 +646,7 @@ inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_
extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx);
extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx);
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx);
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx *ctx0, cryptonight_ctx *ctx1);
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm;
extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm;
@ -652,12 +658,23 @@ extern xmrig::CpuThread::cn_mainloop_fun cn_trtl_mainloop_ryzen_asm;
extern xmrig::CpuThread::cn_mainloop_fun cn_trtl_mainloop_bulldozer_asm;
extern xmrig::CpuThread::cn_mainloop_double_fun cn_trtl_double_mainloop_sandybridge_asm;
void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_64_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_64_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
template<xmrig::Algo ALGO, xmrig::Variant VARIANT, xmrig::Assembly ASM>
inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
if ((VARIANT == xmrig::VARIANT_WOW) && (height != ctx[0]->generated_code_height)) {
V4_Instruction code[256];
const int code_size = v4_random_math_init(code, height);
v4_compile_code(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), ASM);
ctx[0]->generated_code_height = height;
}
xmrig::keccak(input, size, ctx[0]->state);
cn_explode_scratchpad<ALGO, MEM, false>(reinterpret_cast<__m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));
@ -694,6 +711,9 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
cn_trtl_mainloop_bulldozer_asm(ctx[0]);
}
}
else if (VARIANT == xmrig::VARIANT_WOW) {
ctx[0]->generated_code(ctx[0]);
}
cn_implode_scratchpad<ALGO, MEM, false>(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
xmrig::keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
@ -702,10 +722,17 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
template<xmrig::Algo ALGO, xmrig::Variant VARIANT, xmrig::Assembly ASM>
inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
if ((VARIANT == xmrig::VARIANT_WOW) && (height != ctx[0]->generated_code_double_height)) {
V4_Instruction code[256];
const int code_size = v4_random_math_init(code, height);
v4_compile_code_double(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code_double), ASM);
ctx[0]->generated_code_double_height = height;
}
xmrig::keccak(input, size, ctx[0]->state);
xmrig::keccak(input + size, size, ctx[1]->state);
@ -721,6 +748,9 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
else if (VARIANT == xmrig::VARIANT_TRTL) {
cn_trtl_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
}
else if (VARIANT == xmrig::VARIANT_WOW) {
ctx[0]->generated_code_double(ctx[0], ctx[1]);
}
cn_implode_scratchpad<ALGO, MEM, false>(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
cn_implode_scratchpad<ALGO, MEM, false>(reinterpret_cast<__m128i*>(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state));
@ -735,7 +765,7 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -760,6 +790,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
VARIANT2_INIT(0);
VARIANT2_INIT(1);
VARIANT2_SET_ROUNDING_MODE();
VARIANT4_RANDOM_MATH_INIT(0);
VARIANT4_RANDOM_MATH_INIT(1);
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@ -815,11 +847,17 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
VARIANT2_INTEGER_MATH(0, cl, cx0);
lo = __umul128(idx0, cl, &hi);
if (VARIANT == xmrig::VARIANT_WOW) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
} else {
VARIANT2_INTEGER_MATH(0, cl, cx0);
}
}
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo);
} else {
lo = __umul128(idx0, cl, &hi);
}
al0 += hi;
@ -857,11 +895,17 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l1[idx1 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
VARIANT2_INTEGER_MATH(1, cl, cx1);
lo = __umul128(idx1, cl, &hi);
if (VARIANT == xmrig::VARIANT_WOW) {
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
} else {
VARIANT2_INTEGER_MATH(1, cl, cx1);
}
}
lo = __umul128(idx1, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo);
} else {
lo = __umul128(idx1, cl, &hi);
}
al1 += hi;
@ -946,11 +990,17 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
#define CN_STEP4(part, a, b0, b1, c, l, mc, ptr, idx) \
if (BASE == xmrig::VARIANT_2) { \
VARIANT2_INTEGER_MATH(part, cl##part, c); \
lo = __umul128(idx, cl##part, &hi); \
if (VARIANT == xmrig::VARIANT_WOW) { \
const uint64_t al = _mm_cvtsi128_si64(a); \
const uint64_t ah = _mm_cvtsi128_si64(_mm_srli_si128(a, 8)); \
VARIANT4_RANDOM_MATH(part, al, ah, cl##part, b0, b1); \
} else { \
VARIANT2_INTEGER_MATH(part, cl##part, c); \
} \
} \
lo = __umul128(idx, cl##part, &hi); \
if (BASE == xmrig::VARIANT_2) { \
VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \
} else { \
lo = __umul128(idx, cl##part, &hi); \
} \
a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
\
@ -1000,11 +1050,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
__m128i ax##n = _mm_set_epi64x(h##n[1] ^ h##n[5], h##n[0] ^ h##n[4]); \
__m128i bx##n##0 = _mm_set_epi64x(h##n[3] ^ h##n[7], h##n[2] ^ h##n[6]); \
__m128i bx##n##1 = _mm_set_epi64x(h##n[9] ^ h##n[11], h##n[8] ^ h##n[10]); \
__m128i cx##n = _mm_setzero_si128();
__m128i cx##n = _mm_setzero_si128(); \
VARIANT4_RANDOM_MATH_INIT(n);
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -1068,7 +1119,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
@ -1141,7 +1192,7 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();

View file

@ -0,0 +1,107 @@
#include <cstring>
#include "crypto/CryptoNight_monero.h"
typedef void(*void_func)();
#include "crypto/asm/CryptonightR_template.h"
#include "Mem.h"
#ifndef XMRIG_ARM
static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)())
{
const ptrdiff_t size = reinterpret_cast<const uint8_t*>(p2) - reinterpret_cast<const uint8_t*>(p1);
if (size > 0) {
memcpy(p, reinterpret_cast<void*>(p1), size);
p += size;
}
}
static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, xmrig::Assembly ASM)
{
uint32_t prev_rot_src = (uint32_t)(-1);
for (int i = 0;; ++i) {
const V4_Instruction inst = code[i];
if (inst.opcode == RET) {
break;
}
uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
uint8_t dst_index = inst.dst_index;
uint8_t src_index = inst.src_index;
const uint32_t a = inst.dst_index;
const uint32_t b = inst.src_index;
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (src_index << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
switch (inst.opcode) {
case ROR:
case ROL:
if (b != prev_rot_src) {
prev_rot_src = b;
add_code(p, instructions_mov[c], instructions_mov[c + 1]);
}
break;
}
if (a == prev_rot_src) {
prev_rot_src = (uint32_t)(-1);
}
void_func begin = instructions[c];
if ((ASM = xmrig::ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
if (*prefix == 0x49) {
*(p++) = 0x41;
}
begin = reinterpret_cast<void_func>(prefix + 1);
}
add_code(p, begin, instructions[c + 1]);
if (inst.opcode == ADD) {
*(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
if (is_64_bit) {
prev_rot_src = (uint32_t)(-1);
}
}
}
}
void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
uint8_t* p = p0;
add_code(p, CryptonightR_template_part1, CryptonightR_template_part2);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightR_template_part2, CryptonightR_template_part3);
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
add_code(p, CryptonightR_template_part3, CryptonightR_template_end);
Mem::flushInstructionCache(machine_code, p - p0);
}
void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
uint8_t* p = p0;
add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
add_code(p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
Mem::flushInstructionCache(machine_code, p - p0);
}
#endif

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,478 @@
PUBLIC FN_PREFIX(CryptonightR_template_part1)
PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_part2)
PUBLIC FN_PREFIX(CryptonightR_template_part3)
PUBLIC FN_PREFIX(CryptonightR_template_end)
PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
FN_PREFIX(CryptonightR_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightR_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
FN_PREFIX(CryptonightR_template_part2):
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightR_template_mainloop)
FN_PREFIX(CryptonightR_template_part3):
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightR_template_end):
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_part1):
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+112], rcx
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
FN_PREFIX(CryptonightR_template_double_part2):
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
FN_PREFIX(CryptonightR_template_double_part3):
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightR_template_double_mainloop)
FN_PREFIX(CryptonightR_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightR_template_double_end):

View file

@ -0,0 +1,478 @@
PUBLIC CryptonightR_template_part1
PUBLIC CryptonightR_template_mainloop
PUBLIC CryptonightR_template_part2
PUBLIC CryptonightR_template_part3
PUBLIC CryptonightR_template_end
PUBLIC CryptonightR_template_double_part1
PUBLIC CryptonightR_template_double_mainloop
PUBLIC CryptonightR_template_double_part2
PUBLIC CryptonightR_template_double_part3
PUBLIC CryptonightR_template_double_part4
PUBLIC CryptonightR_template_double_end
CryptonightR_template_part1:
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightR_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
CryptonightR_template_part2:
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightR_template_mainloop
CryptonightR_template_part3:
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightR_template_end:
ALIGN(64)
CryptonightR_template_double_part1:
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightR_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+112], rcx
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
CryptonightR_template_double_part2:
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
CryptonightR_template_double_part3:
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightR_template_double_mainloop
CryptonightR_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightR_template_double_end:

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,478 @@
PUBLIC FN_PREFIX(CryptonightR_template_part1)
PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_part2)
PUBLIC FN_PREFIX(CryptonightR_template_part3)
PUBLIC FN_PREFIX(CryptonightR_template_end)
PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
FN_PREFIX(CryptonightR_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movd xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movd xmm0, r12
movd xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movd xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightR_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movd xmm0, r15
movd xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
FN_PREFIX(CryptonightR_template_part2):
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightR_template_mainloop)
FN_PREFIX(CryptonightR_template_part3):
movd rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightR_template_end):
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_part1):
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movd xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movd xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movd xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movd xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movd xmm0, rcx
mov r11d, 524288
movd xmm10, rax
punpcklqdq xmm10, xmm0
movd xmm14, QWORD PTR [rsp+128]
movd xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movd xmm0, r12
mov ecx, ebx
movd xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movd xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movd rdi, xmm5
movd rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movd xmm0, rsp
movd xmm1, rsi
movd xmm2, rdi
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+112], rcx
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
FN_PREFIX(CryptonightR_template_double_part2):
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movd rsi, xmm1
movd rdi, xmm2
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movd rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movd xmm0, rsp
movd xmm1, rbx
movd xmm2, rsi
movd xmm11, rdi
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
FN_PREFIX(CryptonightR_template_double_part3):
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movd rbx, xmm1
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movd rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightR_template_double_mainloop)
FN_PREFIX(CryptonightR_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightR_template_double_end):

View file

@ -0,0 +1,478 @@
PUBLIC CryptonightR_template_part1
PUBLIC CryptonightR_template_mainloop
PUBLIC CryptonightR_template_part2
PUBLIC CryptonightR_template_part3
PUBLIC CryptonightR_template_end
PUBLIC CryptonightR_template_double_part1
PUBLIC CryptonightR_template_double_mainloop
PUBLIC CryptonightR_template_double_part2
PUBLIC CryptonightR_template_double_part3
PUBLIC CryptonightR_template_double_part4
PUBLIC CryptonightR_template_double_end
CryptonightR_template_part1:
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movd xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movd xmm0, r12
movd xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movd xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightR_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movd xmm0, r15
movd xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
CryptonightR_template_part2:
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightR_template_mainloop
CryptonightR_template_part3:
movd rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightR_template_end:
ALIGN(64)
CryptonightR_template_double_part1:
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movd xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movd xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movd xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movd xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movd xmm0, rcx
mov r11d, 524288
movd xmm10, rax
punpcklqdq xmm10, xmm0
movd xmm14, QWORD PTR [rsp+128]
movd xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightR_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movd xmm0, r12
mov ecx, ebx
movd xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movd xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movd rdi, xmm5
movd rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movd xmm0, rsp
movd xmm1, rsi
movd xmm2, rdi
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+112], rcx
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
CryptonightR_template_double_part2:
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movd rsi, xmm1
movd rdi, xmm2
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movd rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movd xmm0, rsp
movd xmm1, rbx
movd xmm2, rsi
movd xmm11, rdi
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
CryptonightR_template_double_part3:
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movd rbx, xmm1
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movd rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightR_template_double_mainloop
CryptonightR_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightR_template_double_end:

View file

@ -0,0 +1,428 @@
#ifndef VARIANT4_RANDOM_MATH_H
#define VARIANT4_RANDOM_MATH_H
extern "C"
{
#include "c_blake256.h"
}
enum V4_Settings
{
// Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications
TOTAL_LATENCY = 15 * 3,
// Always generate at least 60 instructions
NUM_INSTRUCTIONS = 60,
// Available ALUs for MUL
// Modern CPUs typically have only 1 ALU which can do multiplications
ALU_COUNT_MUL = 1,
// Total available ALUs
// Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code
ALU_COUNT = 3,
};
enum V4_InstructionList
{
MUL, // a*b
ADD, // a+b + C, C is an unsigned 32-bit constant
SUB, // a-b
ROR, // rotate right "a" by "b & 31" bits
ROL, // rotate left "a" by "b & 31" bits
XOR, // a^b
RET, // finish execution
V4_INSTRUCTION_COUNT = RET,
};
// V4_InstructionDefinition is used to generate code from random data
// Every random sequence of bytes is a valid code
//
// There are 8 registers in total:
// - 4 variable registers
// - 4 constant registers initialized from loop variables
//
// This is why dst_index is 2 bits
enum V4_InstructionDefinition
{
V4_OPCODE_BITS = 3,
V4_DST_INDEX_BITS = 2,
V4_SRC_INDEX_BITS = 3,
};
struct V4_Instruction
{
uint8_t opcode;
uint8_t dst_index;
uint8_t src_index;
uint32_t C;
};
#ifndef FORCEINLINE
#ifdef __GNUC__
#define FORCEINLINE __attribute__((always_inline)) inline
#elif _MSC_VER
#define FORCEINLINE __forceinline
#else
#define FORCEINLINE inline
#endif
#endif
#ifndef UNREACHABLE_CODE
#ifdef __GNUC__
#define UNREACHABLE_CODE __builtin_unreachable()
#elif _MSC_VER
#define UNREACHABLE_CODE __assume(false)
#else
#define UNREACHABLE_CODE
#endif
#endif
// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
// every switch-case will point to the same destination on every iteration of Cryptonight main loop
//
// This is about as fast as it can get without using low-level machine code generation
template<typename v4_reg>
static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
{
enum
{
REG_BITS = sizeof(v4_reg) * 8,
};
#define V4_EXEC(i) \
{ \
const struct V4_Instruction* op = code + i; \
const v4_reg src = r[op->src_index]; \
v4_reg* dst = r + op->dst_index; \
switch (op->opcode) \
{ \
case MUL: \
*dst *= src; \
break; \
case ADD: \
*dst += src + op->C; \
break; \
case SUB: \
*dst -= src; \
break; \
case ROR: \
{ \
const uint32_t shift = src % REG_BITS; \
*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
} \
break; \
case ROL: \
{ \
const uint32_t shift = src % REG_BITS; \
*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
} \
break; \
case XOR: \
*dst ^= src; \
break; \
case RET: \
return; \
default: \
UNREACHABLE_CODE; \
break; \
} \
}
#define V4_EXEC_10(j) \
V4_EXEC(j + 0) \
V4_EXEC(j + 1) \
V4_EXEC(j + 2) \
V4_EXEC(j + 3) \
V4_EXEC(j + 4) \
V4_EXEC(j + 5) \
V4_EXEC(j + 6) \
V4_EXEC(j + 7) \
V4_EXEC(j + 8) \
V4_EXEC(j + 9)
// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
//
// 60 28495
// 61 106077
// 62 2455855
// 63 5114930
// 64 1020868
// 65 1109026
// 66 151756
// 67 8429
// 68 4477
// 69 87
// Unroll 70 instructions here
V4_EXEC_10(0); // instructions 0-9
V4_EXEC_10(10); // instructions 10-19
V4_EXEC_10(20); // instructions 20-29
V4_EXEC_10(30); // instructions 30-39
V4_EXEC_10(40); // instructions 40-49
V4_EXEC_10(50); // instructions 50-59
V4_EXEC_10(60); // instructions 60-69
#undef V4_EXEC_10
#undef V4_EXEC
}
// If we don't have enough data available, generate more
static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
{
if (*data_index + bytes_needed > data_size)
{
hash_extra_blake(data, data_size, (char*) data);
*data_index = 0;
}
}
// Generates as many random math operations as possible with given latency and ALU restrictions
static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
{
// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
// These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
//
// AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
// Source: https://www.agner.org/optimize/instruction_tables.pdf
const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
// Instruction latencies for theoretical ASIC implementation
const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
// Available ALUs for each instruction
const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
int8_t data[32];
memset(data, 0, sizeof(data));
uint64_t tmp = SWAP64LE(height);
memcpy(data, &tmp, sizeof(uint64_t));
// Set data_index past the last byte in data
// to trigger full data update with blake hash
// before we start using it
size_t data_index = sizeof(data);
int code_size;
do {
int latency[8];
int asic_latency[8];
// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
// byte 0: current value of the destination register
// byte 1: instruction opcode
// byte 2: current value of the source register
//
// Registers R4-R7 are constant and are treated as having the same value because when we do
// the same operation twice with two constant source registers, it can be optimized into a single operation
uint32_t inst_data[8] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
bool is_rotation[V4_INSTRUCTION_COUNT];
bool rotated[4];
int rotate_count = 0;
memset(latency, 0, sizeof(latency));
memset(asic_latency, 0, sizeof(asic_latency));
memset(alu_busy, 0, sizeof(alu_busy));
memset(is_rotation, 0, sizeof(is_rotation));
memset(rotated, 0, sizeof(rotated));
is_rotation[ROR] = true;
is_rotation[ROL] = true;
int num_retries = 0;
code_size = 0;
int total_iterations = 0;
// Generate random code to achieve minimal required latency for our abstract CPU
// Try to get this latency for all 4 registers
while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
{
// Fail-safe to guarantee loop termination
++total_iterations;
if (total_iterations > 256)
break;
check_data(&data_index, 1, data, sizeof(data));
const uint8_t c = ((uint8_t*)data)[data_index++];
// MUL = opcodes 0-2
// ADD = opcode 3
// SUB = opcode 4
// ROR/ROL = opcode 5, shift direction is selected randomly
// XOR = opcodes 6-7
uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
if (opcode == 5)
{
check_data(&data_index, 1, data, sizeof(data));
opcode = (data[data_index++] >= 0) ? ROR : ROL;
}
else if (opcode >= 6)
{
opcode = XOR;
}
else
{
opcode = (opcode <= 2) ? MUL : (opcode - 2);
}
uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
const int a = dst_index;
int b = src_index;
// Don't do ADD/SUB/XOR with the same register
if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
{
// a is always < 4, so we don't need to check bounds here
b = a + 4;
src_index = b;
}
// Don't do rotation with the same destination twice because it's equal to a single rotation
if (is_rotation[opcode] && rotated[a])
{
continue;
}
// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
// 2xXOR(a, b) = NOP
if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
{
continue;
}
// Find which ALU is available (and when) for this instruction
int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
int alu_index = -1;
while (next_latency < TOTAL_LATENCY)
{
for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
{
if (!alu_busy[next_latency][i])
{
// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
if ((opcode == ADD) && alu_busy[next_latency + 1][i])
{
continue;
}
// Rotation can only start when previous rotation is finished, so do an additional availability check
if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
{
continue;
}
alu_index = i;
break;
}
}
if (alu_index >= 0)
{
break;
}
++next_latency;
}
// Don't generate instructions that leave some register unchanged for more than 7 cycles
if (next_latency > latency[a] + 7)
{
continue;
}
next_latency += op_latency[opcode];
if (next_latency <= TOTAL_LATENCY)
{
if (is_rotation[opcode])
{
++rotate_count;
}
// Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
latency[a] = next_latency;
// ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
rotated[a] = is_rotation[opcode];
inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
code[code_size].opcode = opcode;
code[code_size].dst_index = dst_index;
code[code_size].src_index = src_index;
code[code_size].C = 0;
if (opcode == ADD)
{
// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
// ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
uint32_t t;
memcpy(&t, data + data_index, sizeof(uint32_t));
code[code_size].C = SWAP32LE(t);
data_index += sizeof(uint32_t);
}
++code_size;
if (code_size >= NUM_INSTRUCTIONS)
{
break;
}
}
else
{
++num_retries;
}
}
// ASIC has more execution resources and can extract as much parallelism from the code as possible
// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
// Get this latency for at least 1 of the 4 registers
const int prev_code_size = code_size;
while ((asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
{
int min_idx = 0;
int max_idx = 0;
for (int i = 1; i < 4; ++i)
{
if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
}
const uint8_t pattern[3] = { ROR, MUL, MUL };
const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
latency[min_idx] = latency[max_idx] + op_latency[opcode];
asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
code[code_size].opcode = opcode;
code[code_size].dst_index = min_idx;
code[code_size].src_index = max_idx;
code[code_size].C = 0;
++code_size;
}
// There is ~99.8% chance that code_size >= NUM_INSTRUCTIONS here, so second iteration is required rarely
} while (code_size < NUM_INSTRUCTIONS);
// Add final instruction to stop the interpreter
code[code_size].opcode = RET;
code[code_size].dst_index = 0;
code[code_size].src_index = 0;
code[code_size].C = 0;
return code_size;
}
#endif

View file

@ -175,9 +175,9 @@ bool Network::isColors() const
void Network::setJob(Client *client, const Job &job, bool donate)
{
LOG_INFO(isColors() ? MAGENTA_BOLD("new job") " from " WHITE_BOLD("%s:%d") " diff " WHITE_BOLD("%d") " algo " WHITE_BOLD("%s")
: "new job from %s:%d diff %d algo %s",
client->host(), client->port(), job.diff(), job.algorithm().shortName());
LOG_INFO(isColors() ? MAGENTA_BOLD("new job") " from " WHITE_BOLD("%s:%d") " diff " WHITE_BOLD("%d") " algo " WHITE_BOLD("%s") " height " WHITE_BOLD("%llu")
: "new job from %s:%d diff %d algo %s height %llu",
client->host(), client->port(), job.diff(), job.algorithm().shortName(), job.height());
if (!donate && m_donate) {
m_donate->setAlgo(job.algorithm());

View file

@ -145,16 +145,50 @@ bool xmrig::CpuThread::isSoftAES(AlgoVariant av)
}
#ifndef XMRIG_NO_ASM
template<xmrig::Algo algo, xmrig::Variant variant>
static inline void add_asm_func(xmrig::CpuThread::cn_hash_fun(&asm_func_map)[xmrig::ALGO_MAX][xmrig::AV_MAX][xmrig::VARIANT_MAX][xmrig::ASM_MAX])
{
asm_func_map[algo][xmrig::AV_SINGLE][variant][xmrig::ASM_INTEL] = cryptonight_single_hash_asm<algo, variant, xmrig::ASM_INTEL>;
asm_func_map[algo][xmrig::AV_SINGLE][variant][xmrig::ASM_RYZEN] = cryptonight_single_hash_asm<algo, variant, xmrig::ASM_RYZEN>;
asm_func_map[algo][xmrig::AV_SINGLE][variant][xmrig::ASM_BULLDOZER] = cryptonight_single_hash_asm<algo, variant, xmrig::ASM_BULLDOZER>;
asm_func_map[algo][xmrig::AV_DOUBLE][variant][xmrig::ASM_INTEL] = cryptonight_double_hash_asm<algo, variant, xmrig::ASM_INTEL>;
asm_func_map[algo][xmrig::AV_DOUBLE][variant][xmrig::ASM_RYZEN] = cryptonight_double_hash_asm<algo, variant, xmrig::ASM_RYZEN>;
asm_func_map[algo][xmrig::AV_DOUBLE][variant][xmrig::ASM_BULLDOZER] = cryptonight_double_hash_asm<algo, variant, xmrig::ASM_BULLDOZER>;
}
#endif
xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly)
{
assert(variant >= VARIANT_0 && variant < VARIANT_MAX);
# ifndef XMRIG_NO_ASM
constexpr const size_t count = VARIANT_MAX * 10 * ALGO_MAX + 12;
# else
constexpr const size_t count = VARIANT_MAX * 10 * ALGO_MAX;
if (assembly == ASM_AUTO) {
assembly = Cpu::info()->assembly();
}
static cn_hash_fun asm_func_map[ALGO_MAX][AV_MAX][VARIANT_MAX][ASM_MAX] = {};
static bool asm_func_map_initialized = false;
if (!asm_func_map_initialized) {
add_asm_func<CRYPTONIGHT, VARIANT_2>(asm_func_map);
add_asm_func<CRYPTONIGHT, VARIANT_HALF>(asm_func_map);
add_asm_func<CRYPTONIGHT, VARIANT_WOW>(asm_func_map);
add_asm_func<CRYPTONIGHT_PICO, VARIANT_HALF>(asm_func_map);
asm_func_map_initialized = true;
}
cn_hash_fun fun = asm_func_map[algorithm][av][variant][assembly];
if (fun) {
return fun;
}
# endif
constexpr const size_t count = VARIANT_MAX * 10 * ALGO_MAX;
static const cn_hash_fun func_table[] = {
cryptonight_single_hash<CRYPTONIGHT, false, VARIANT_0>,
cryptonight_double_hash<CRYPTONIGHT, false, VARIANT_0>,
@ -265,6 +299,17 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
# endif
cryptonight_single_hash<CRYPTONIGHT, false, VARIANT_WOW>,
cryptonight_double_hash<CRYPTONIGHT, false, VARIANT_WOW>,
cryptonight_single_hash<CRYPTONIGHT, true, VARIANT_WOW>,
cryptonight_double_hash<CRYPTONIGHT, true, VARIANT_WOW>,
cryptonight_triple_hash<CRYPTONIGHT, false, VARIANT_WOW>,
cryptonight_quad_hash<CRYPTONIGHT, false, VARIANT_WOW>,
cryptonight_penta_hash<CRYPTONIGHT, false, VARIANT_WOW>,
cryptonight_triple_hash<CRYPTONIGHT, true, VARIANT_WOW>,
cryptonight_quad_hash<CRYPTONIGHT, true, VARIANT_WOW>,
cryptonight_penta_hash<CRYPTONIGHT, true, VARIANT_WOW>,
# ifndef XMRIG_NO_AEON
cryptonight_single_hash<CRYPTONIGHT_LITE, false, VARIANT_0>,
cryptonight_double_hash<CRYPTONIGHT_LITE, false, VARIANT_0>,
@ -298,6 +343,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_HALF
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
# else
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -311,6 +357,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_HALF
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
# endif
# ifndef XMRIG_NO_SUMO
@ -358,6 +405,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_HALF
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
# else
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -371,7 +419,9 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_HALF
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
# endif
# ifndef XMRIG_NO_CN_PICO
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -396,6 +446,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
cryptonight_penta_hash<CRYPTONIGHT_PICO, true, VARIANT_TRTL>,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
# else
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -409,30 +460,15 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_HALF
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
# endif
# ifndef XMRIG_NO_ASM
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_INTEL>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_RYZEN>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_BULLDOZER>,
cryptonight_double_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_INTEL>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_HALF, ASM_INTEL>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_HALF, ASM_RYZEN>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_HALF, ASM_BULLDOZER>,
cryptonight_double_hash_asm<CRYPTONIGHT, VARIANT_HALF, ASM_INTEL>,
cryptonight_single_hash_asm<CRYPTONIGHT_PICO, VARIANT_TRTL, ASM_INTEL>,
cryptonight_single_hash_asm<CRYPTONIGHT_PICO, VARIANT_TRTL, ASM_RYZEN>,
cryptonight_single_hash_asm<CRYPTONIGHT_PICO, VARIANT_TRTL, ASM_BULLDOZER>,
cryptonight_double_hash_asm<CRYPTONIGHT_PICO, VARIANT_TRTL, ASM_INTEL>
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
# endif
};
static_assert(count == sizeof(func_table) / sizeof(func_table[0]), "func_table size mismatch");
const size_t index = VARIANT_MAX * 10 * algorithm + 10 * variant + av - 1;
# ifndef NDEBUG
const size_t index = fnIndex(algorithm, av, variant, assembly);
cn_hash_fun func = func_table[index];
assert(index < sizeof(func_table) / sizeof(func_table[0]));
@ -440,7 +476,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
return func;
# else
return func_table[fnIndex(algorithm, av, variant, assembly)];
return func_table[index];
# endif
}
@ -602,49 +638,3 @@ rapidjson::Value xmrig::CpuThread::toConfig(rapidjson::Document &doc) const
return obj;
}
size_t xmrig::CpuThread::fnIndex(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly)
{
const size_t index = VARIANT_MAX * 10 * algorithm + 10 * variant + av - 1;
# ifndef XMRIG_NO_ASM
if (assembly == ASM_AUTO) {
assembly = Cpu::info()->assembly();
}
if (assembly == ASM_NONE) {
return index;
}
constexpr const size_t offset = VARIANT_MAX * 10 * ALGO_MAX;
size_t extra_offset = 0;
if (algorithm == CRYPTONIGHT && (variant == VARIANT_2 || variant == VARIANT_HALF)) {
if (variant == VARIANT_HALF) {
extra_offset += 4;
}
if (av == AV_SINGLE) {
return offset + extra_offset + assembly - 2;
}
if (av == AV_DOUBLE) {
return offset + 3 + extra_offset;
}
}
else if (algorithm == CRYPTONIGHT_PICO && variant == VARIANT_TRTL) {
extra_offset = 8;
if (av == AV_SINGLE) {
return offset + extra_offset + assembly - 2;
}
if (av == AV_DOUBLE) {
return offset + 3 + extra_offset;
}
}
# endif
return index;
}

View file

@ -60,7 +60,7 @@ public:
CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiway multiway, int64_t affinity, int priority, bool softAES, bool prefetch, Assembly assembly);
typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, cryptonight_ctx **ctx);
typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, cryptonight_ctx **ctx, uint64_t height);
typedef void (*cn_mainloop_fun)(cryptonight_ctx *ctx);
typedef void (*cn_mainloop_double_fun)(cryptonight_ctx *ctx1, cryptonight_ctx *ctx2);
@ -98,8 +98,6 @@ protected:
rapidjson::Value toConfig(rapidjson::Document &doc) const override;
private:
static size_t fnIndex(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly);
const Algo m_algorithm;
const AlgoVariant m_av;
const Assembly m_assembly;

View file

@ -25,9 +25,11 @@
#include <thread>
#include <sstream>
#include "crypto/CryptoNight_test.h"
#include "common/log/Log.h"
#include "workers/CpuThread.h"
#include "workers/MultiWorker.h"
#include "workers/Workers.h"
@ -54,6 +56,11 @@ bool MultiWorker<N>::selfTest()
using namespace xmrig;
if (m_thread->algorithm() == CRYPTONIGHT) {
if (!verify2(VARIANT_WOW, test_input_WOW)) {
LOG_WARN("CryptonightR (Wownero) self-test failed");
return false;
}
const bool rc = verify(VARIANT_0, test_output_v0) &&
verify(VARIANT_1, test_output_v1) &&
verify(VARIANT_2, test_output_v2) &&
@ -121,7 +128,7 @@ void MultiWorker<N>::start()
storeStats();
}
m_thread->fn(m_state.job.algorithm().variant())(m_state.blob, m_state.job.size(), m_hash, m_ctx);
m_thread->fn(m_state.job.algorithm().variant())(m_state.blob, m_state.job.size(), m_hash, m_ctx, m_state.job.height());
for (size_t i = 0; i < N; ++i) {
if (*reinterpret_cast<uint64_t*>(m_hash + (i * 32) + 24) < m_state.job.target()) {
@ -162,11 +169,71 @@ bool MultiWorker<N>::verify(xmrig::Variant variant, const uint8_t *referenceValu
return false;
}
func(test_input, 76, m_hash, m_ctx);
func(test_input, 76, m_hash, m_ctx, 0);
return memcmp(m_hash, referenceValue, sizeof m_hash) == 0;
}
template<size_t N>
bool MultiWorker<N>::verify2(xmrig::Variant variant, const char *test_data)
{
xmrig::CpuThread::cn_hash_fun func = m_thread->fn(variant);
if (!func) {
return false;
}
std::stringstream s(test_data);
std::string expected_hex;
std::string input_hex;
uint64_t height;
while (!s.eof())
{
uint8_t referenceValue[N * 32];
uint8_t input[N * 256];
s >> expected_hex;
s >> input_hex;
s >> height;
if ((expected_hex.length() != 64) || (input_hex.length() > 512))
{
return false;
}
bool err = false;
for (int i = 0; i < 32; ++i)
{
referenceValue[i] = (hf_hex2bin(expected_hex[i * 2], err) << 4) + hf_hex2bin(expected_hex[i * 2 + 1], err);
}
const size_t input_len = input_hex.length() / 2;
for (size_t i = 0; i < input_len; ++i)
{
input[i] = (hf_hex2bin(input_hex[i * 2], err) << 4) + hf_hex2bin(input_hex[i * 2 + 1], err);
}
if (err)
{
return false;
}
for (size_t i = 1; i < N; ++i)
{
memcpy(input + i * input_len, input, input_len);
memcpy(referenceValue + i * 32, referenceValue, 32);
}
func(input, input_len, m_hash, m_ctx, height);
if (memcmp(m_hash, referenceValue, sizeof m_hash) != 0)
{
return false;
}
}
return true;
}
template<size_t N>
void MultiWorker<N>::consumeJob()
{

View file

@ -50,6 +50,7 @@ protected:
private:
bool resume(const Job &job);
bool verify(xmrig::Variant variant, const uint8_t *referenceValue);
bool verify2(xmrig::Variant variant, const char *test_data);
void consumeJob();
void save(const Job &job);