From 27980f24f8d662138c1b3aec323d9acc05e3f6b6 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 3 Mar 2019 20:19:17 +0700 Subject: [PATCH] Plain C "cn/r" implementation. --- CMakeLists.txt | 1 + algo/cryptonight/cryptonight.h | 1 + algo/cryptonight/cryptonight_monero.h | 52 ++- algo/cryptonight/cryptonight_r_av1.c | 10 +- algo/cryptonight/cryptonight_r_av2.c | 20 +- algo/cryptonight/cryptonight_r_av3.c | 10 +- algo/cryptonight/cryptonight_r_av4.c | 20 +- algo/cryptonight/variant4_random_math.h | 448 ++++++++++++++++++++++++ stratum.c | 7 +- stratum.h | 6 +- xmrig.c | 10 +- 11 files changed, 559 insertions(+), 26 deletions(-) create mode 100644 algo/cryptonight/variant4_random_math.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6607478dc..79ccbb08e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ set(HEADERS algo/cryptonight/cryptonight_monero.h algo/cryptonight/cryptonight_softaes.h algo/cryptonight/cryptonight_test.h + algo/cryptonight/variant4_random_math.h compat.h cpu.h donate.h diff --git a/algo/cryptonight/cryptonight.h b/algo/cryptonight/cryptonight.h index da6a8ecfc..551a8710a 100644 --- a/algo/cryptonight/cryptonight.h +++ b/algo/cryptonight/cryptonight.h @@ -68,6 +68,7 @@ struct cryptonight_ctx { cn_mainloop_double_fun_ms_abi generated_code_double; struct cryptonight_r_data generated_code_data; struct cryptonight_r_data generated_code_double_data; + uint64_t height; }; diff --git a/algo/cryptonight/cryptonight_monero.h b/algo/cryptonight/cryptonight_monero.h index 2f64ad0a8..a1681a045 100644 --- a/algo/cryptonight/cryptonight_monero.h +++ b/algo/cryptonight/cryptonight_monero.h @@ -6,8 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -29,6 +29,8 @@ #include #include +#include +#include static inline __m128i int_sqrt_v2(const uint64_t n0) @@ -87,6 +89,17 @@ static inline __m128i int_sqrt_v2(const uint64_t n0) _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ } +# define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \ + { \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ + _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \ + } + # define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ { \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ @@ -99,4 +112,39 @@ static inline __m128i int_sqrt_v2(const uint64_t n0) _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ } + +#ifndef NOINLINE +#ifdef __GNUC__ +#define NOINLINE __attribute__ ((noinline)) +#elif _MSC_VER +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif +#endif + +#include "variant4_random_math.h" + +#define VARIANT4_RANDOM_MATH_INIT(part) \ + uint32_t r##part[9]; \ + struct V4_Instruction code##part[256]; \ + { \ + r##part[0] = (uint32_t)(h##part[12]); \ + r##part[1] = (uint32_t)(h##part[12] >> 32); \ + r##part[2] = (uint32_t)(h##part[13]); \ + r##part[3] = (uint32_t)(h##part[13] >> 32); \ + } \ + v4_random_math_init(code##part, ctx[part]->height); + +#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \ + { \ + cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \ + r##part[4] = (uint32_t)(al); \ + r##part[5] = (uint32_t)(ah); \ + r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \ + r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \ + r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(code##part, r##part); \ + } + #endif /* XMRIG_CRYPTONIGHT_MONERO_H */ diff --git a/algo/cryptonight/cryptonight_r_av1.c b/algo/cryptonight/cryptonight_r_av1.c index 145022d8a..af9b7f2e2 100644 --- a/algo/cryptonight/cryptonight_r_av1.c +++ b/algo/cryptonight/cryptonight_r_av1.c @@ -44,6 +44,7 @@ void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *rest VARIANT2_INIT(0); VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); uint64_t al0 = h0[0] ^ h0[4]; uint64_t ah0 = h0[1] ^ h0[5]; @@ -58,7 +59,7 @@ void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *rest cx = _mm_aesenc_si128(cx, ax0); - VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); idx0 = _mm_cvtsi128_si64(cx); @@ -67,9 +68,12 @@ void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *rest cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; - VARIANT2_INTEGER_MATH(0, cl, cx); + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + lo = _umul128(idx0, cl, &hi); - VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); al0 += hi; ah0 += lo; diff --git a/algo/cryptonight/cryptonight_r_av2.c b/algo/cryptonight/cryptonight_r_av2.c index 507ce4210..965a91810 100644 --- a/algo/cryptonight/cryptonight_r_av2.c +++ b/algo/cryptonight/cryptonight_r_av2.c @@ -46,6 +46,8 @@ void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *rest VARIANT2_INIT(0); VARIANT2_INIT(1); VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); + VARIANT4_RANDOM_MATH_INIT(1); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -73,10 +75,10 @@ void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *rest cx0 = _mm_aesenc_si128(cx0, ax0); cx1 = _mm_aesenc_si128(cx1, ax1); - VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); - VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); idx0 = _mm_cvtsi128_si64(cx0); @@ -86,9 +88,12 @@ void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *rest cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; - VARIANT2_INTEGER_MATH(0, cl, cx0); + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + lo = _umul128(idx0, cl, &hi); - VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); al0 += hi; ah0 += lo; @@ -103,9 +108,12 @@ void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *rest cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; - VARIANT2_INTEGER_MATH(1, cl, cx1); + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + lo = _umul128(idx1, cl, &hi); - VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); al1 += hi; ah1 += lo; diff --git a/algo/cryptonight/cryptonight_r_av3.c b/algo/cryptonight/cryptonight_r_av3.c index 46b0de386..b9c341e6f 100644 --- a/algo/cryptonight/cryptonight_r_av3.c +++ b/algo/cryptonight/cryptonight_r_av3.c @@ -44,6 +44,7 @@ void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *rest VARIANT2_INIT(0); VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); uint64_t al0 = h0[0] ^ h0[4]; uint64_t ah0 = h0[1] ^ h0[5]; @@ -58,7 +59,7 @@ void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *rest cx = soft_aesenc(cx, ax0); - VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); idx0 = _mm_cvtsi128_si64(cx); @@ -67,9 +68,12 @@ void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *rest cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; - VARIANT2_INTEGER_MATH(0, cl, cx); + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + lo = _umul128(idx0, cl, &hi); - VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); al0 += hi; ah0 += lo; diff --git a/algo/cryptonight/cryptonight_r_av4.c b/algo/cryptonight/cryptonight_r_av4.c index bfaa678f6..522b229fc 100644 --- a/algo/cryptonight/cryptonight_r_av4.c +++ b/algo/cryptonight/cryptonight_r_av4.c @@ -46,6 +46,8 @@ void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *rest VARIANT2_INIT(0); VARIANT2_INIT(1); VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); + VARIANT4_RANDOM_MATH_INIT(1); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -73,10 +75,10 @@ void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *rest cx0 = soft_aesenc(cx0, ax0); cx1 = soft_aesenc(cx1, ax1); - VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); - VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); idx0 = _mm_cvtsi128_si64(cx0); @@ -86,9 +88,12 @@ void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *rest cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; - VARIANT2_INTEGER_MATH(0, cl, cx0); + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + lo = _umul128(idx0, cl, &hi); - VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); al0 += hi; ah0 += lo; @@ -103,9 +108,12 @@ void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *rest cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; - VARIANT2_INTEGER_MATH(1, cl, cx1); + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + lo = _umul128(idx1, cl, &hi); - VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); al1 += hi; ah1 += lo; diff --git a/algo/cryptonight/variant4_random_math.h b/algo/cryptonight/variant4_random_math.h new file mode 100644 index 000000000..d2be1a2af --- /dev/null +++ b/algo/cryptonight/variant4_random_math.h @@ -0,0 +1,448 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + + +#include +#include + + +#include "crypto/c_blake256.h" + + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#ifdef __GNUC__ +#define FORCEINLINE __attribute__((always_inline)) inline +#elif _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#ifdef __GNUC__ +#define UNREACHABLE_CODE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +#define SWAP32LE(x) x +#define SWAP64LE(x) x +#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length)) + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +//template +static void v4_random_math(const struct V4_Instruction* code, uint32_t r[9]) +{ +#define REG_BITS 32 +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const uint32_t src = r[op->src_index]; \ + uint32_t *dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +#undef REG_BITS +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, data_size, (char*) data); + *data_index = 0; + } +} + +// Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + data[20] = -38; + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = false; + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // a is always < 4, so we don't need to check bounds here + b = 8; + src_index = b; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif diff --git a/stratum.c b/stratum.c index e10904762..672b6b583 100644 --- a/stratum.c +++ b/stratum.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -709,6 +710,8 @@ static bool job_decode(const json_t *job) { memset(work.job_id, 0, sizeof(work.job_id)); memcpy(work.job_id, job_id, strlen(job_id)); + work.height = (uint64_t) json_integer_value(json_object_get(job, "height")); + return true; } diff --git a/stratum.h b/stratum.h index 00fd426fd..13f8f513f 100644 --- a/stratum.h +++ b/stratum.h @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,6 +43,7 @@ struct work { uint32_t target __attribute__((aligned(16))); uint32_t hash[8] __attribute__((aligned(16))); char job_id[64] __attribute__((aligned(16))); + uint64_t height; }; diff --git a/xmrig.c b/xmrig.c index 7ae8b3202..c1c3865f6 100644 --- a/xmrig.c +++ b/xmrig.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -306,6 +307,8 @@ static void *miner_thread(void *userdata) { struct timeval tv_start; gettimeofday(&tv_start, NULL); + persistentctx[0]->height = work.height; + /* scan nonces for a proof-of-work hash */ const int rc = scanhash_cryptonight(thr_id, hash, (uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); stats_add_hashes(thr_id, &tv_start, hashes_done); @@ -391,6 +394,9 @@ static void *miner_thread_double(void *userdata) { struct timeval tv_start; gettimeofday(&tv_start, NULL); + persistentctx[0]->height = work.height; + persistentctx[1]->height = work.height; + /* scan nonces for a proof-of-work hash */ const int rc = scanhash_cryptonight_double(thr_id, (uint32_t *) double_hash, double_blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); stats_add_hashes(thr_id, &tv_start, hashes_done);