diff --git a/CMakeLists.txt b/CMakeLists.txt index 51b8b167d..7b9928253 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ set(HEADERS algo/cryptonight/cryptonight_monero.h algo/cryptonight/cryptonight_softaes.h algo/cryptonight/cryptonight_test.h + algo/cryptonight/variant4_random_math.h compat.h cpu.h donate.h @@ -29,6 +30,7 @@ set(HEADERS_CRYPTO crypto/c_blake256.h crypto/c_jh.h crypto/c_skein.h + crypto/soft_aes.h ) set(HEADERS_COMPAT @@ -48,6 +50,10 @@ set(SOURCES algo/cryptonight/cryptonight_av2.c algo/cryptonight/cryptonight_av3.c algo/cryptonight/cryptonight_av4.c + algo/cryptonight/cryptonight_r_av1.c + algo/cryptonight/cryptonight_r_av2.c + algo/cryptonight/cryptonight_r_av3.c + algo/cryptonight/cryptonight_r_av4.c util.c options.c stratum.c @@ -61,7 +67,6 @@ set(SOURCES_CRYPTO crypto/c_blake256.c crypto/c_jh.c crypto/c_skein.c - crypto/soft_aes.c ) set(SOURCES_UTILS diff --git a/algo/cryptonight-lite/cryptonight_lite_softaes.h b/algo/cryptonight-lite/cryptonight_lite_softaes.h index 1e06a0f27..be0b73ef9 100644 --- a/algo/cryptonight-lite/cryptonight_lite_softaes.h +++ b/algo/cryptonight-lite/cryptonight_lite_softaes.h @@ -4,9 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2017 fireice-uk - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,16 +22,15 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_LITE_SOFTAES_H__ -#define __CRYPTONIGHT_LITE_SOFTAES_H__ +#ifndef XMRIG_CRYPTONIGHT_LITE_SOFTAES_H +#define XMRIG_CRYPTONIGHT_LITE_SOFTAES_H #include #include -extern __m128i soft_aesenc(__m128i in, __m128i key); -extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); +#include "crypto/soft_aes.h" // This will shift and xor tmp1 into itself as 4 32-bit vals such as @@ -253,4 +252,4 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } -#endif /* __CRYPTONIGHT_LITE_SOFTAES_H__ */ +#endif /* XMRIG_CRYPTONIGHT_LITE_SOFTAES_H */ diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index 62dbdc507..7912295a9 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -6,7 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,9 +39,13 @@ #include "crypto/c_groestl.h" #include "crypto/c_jh.h" #include "crypto/c_skein.h" -#include "cryptonight.h" #include "cryptonight_test.h" +#include "cryptonight.h" #include "options.h" +#include "persistent_memory.h" + + +static cn_hash_fun asm_func_map[AV_MAX][VARIANT_MAX][ASM_MAX] = {}; void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); @@ -56,6 +61,11 @@ void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, stru void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av3(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av4(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); + #ifndef XMRIG_NO_AEON void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); @@ -72,7 +82,13 @@ void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, #ifndef XMRIG_NO_ASM void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_single_hash_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); + +void cryptonight_r_av1_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av1_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av2_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_r_av2_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); #endif @@ -89,6 +105,46 @@ static inline bool verify(enum Variant variant, uint8_t *output, struct cryptoni } +static inline bool verify2(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue) +{ + cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant); + if (func == NULL) { + return false; + } + + if (opt_double_hash) { + uint8_t input[128]; + + for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) { + const size_t size = cn_r_test_input[i].size; + memcpy(input, cn_r_test_input[i].data, size); + memcpy(input + size, cn_r_test_input[i].data, size); + + ctx[0]->height = ctx[1]->height = cn_r_test_input[i].height; + + func(input, size, output, ctx); + + if (memcmp(output, referenceValue + i * 32, 32) != 0 || memcmp(output + 32, referenceValue + i * 32, 32) != 0) { + return false; + } + } + } + else { + for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) { + ctx[0]->height = cn_r_test_input[i].height; + + func(cn_r_test_input[i].data, cn_r_test_input[i].size, output, ctx); + + if (memcmp(output, referenceValue + i * 32, 32) != 0) { + return false; + } + } + } + + return true; +} + + static bool self_test() { struct cryptonight_ctx *ctx[2]; uint8_t output[64]; @@ -97,15 +153,18 @@ static bool self_test() { const size_t size = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE; bool result = false; - for (int i = 0; i < count; ++i) { + for (size_t i = 0; i < count; ++i) { ctx[i] = _mm_malloc(sizeof(struct cryptonight_ctx), 16); ctx[i]->memory = _mm_malloc(size, 16); + + init_cn_r(ctx[i]); } if (opt_algo == ALGO_CRYPTONIGHT) { - result = verify(VARIANT_0, output, ctx, test_output_v0) && - verify(VARIANT_1, output, ctx, test_output_v1) && - verify(VARIANT_2, output, ctx, test_output_v2); + result = verify(VARIANT_0, output, ctx, test_output_v0) && + verify(VARIANT_1, output, ctx, test_output_v1) && + verify(VARIANT_2, output, ctx, test_output_v2) && + verify2(VARIANT_4, output, ctx, test_output_r); } # ifndef XMRIG_NO_AEON else { @@ -115,7 +174,7 @@ static bool self_test() { # endif - for (int i = 0; i < count; ++i) { + for (size_t i = 0; i < count; ++i) { _mm_free(ctx[i]->memory); _mm_free(ctx[i]); } @@ -124,34 +183,20 @@ static bool self_test() { } -size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly) +#ifndef XMRIG_NO_ASM +cn_hash_fun cryptonight_hash_asm_fn(enum AlgoVariant av, enum Variant variant, enum Assembly assembly) { - const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; - -# ifndef XMRIG_NO_ASM if (assembly == ASM_AUTO) { - assembly = cpu_info.assembly; + assembly = (enum Assembly) cpu_info.assembly; } if (assembly == ASM_NONE) { - return index; + return NULL; } - const size_t offset = VARIANT_MAX * 4 * 2; - - if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) { - if (av == AV_SINGLE) { - return offset + assembly - 2; - } - - if (av == AV_DOUBLE) { - return offset + 2; - } - } -# endif - - return index; + return asm_func_map[av][variant][assembly]; } +#endif cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant) @@ -160,10 +205,15 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V assert(variant > VARIANT_AUTO && variant < VARIANT_MAX); # ifndef XMRIG_NO_ASM - static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = { -# else - static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = { + if (algorithm == ALGO_CRYPTONIGHT) { + cn_hash_fun fun = cryptonight_hash_asm_fn(av, variant, opt_assembly); + if (fun) { + return fun; + } + } # endif + + static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = { cryptonight_av1_v0, cryptonight_av2_v0, cryptonight_av3_v0, @@ -177,6 +227,11 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V cryptonight_av3_v2, cryptonight_av4_v2, + cryptonight_r_av1, + cryptonight_r_av2, + cryptonight_r_av3, + cryptonight_r_av4, + # ifndef XMRIG_NO_AEON cryptonight_lite_av1_v0, cryptonight_lite_av2_v0, @@ -190,6 +245,10 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, # else NULL, NULL, @@ -203,16 +262,15 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V NULL, NULL, NULL, -# endif -# ifndef XMRIG_NO_ASM - cryptonight_single_hash_asm_intel, - cryptonight_single_hash_asm_ryzen, - cryptonight_double_hash_asm + NULL, + NULL, + NULL, + NULL, # endif }; # ifndef NDEBUG - const size_t index = fn_index(algorithm, av, variant, opt_assembly); + const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; cn_hash_fun func = func_table[index]; @@ -221,7 +279,7 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V return func; # else - return func_table[fn_index(algorithm, av, variant, opt_assembly)]; + return func_table[VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1]; # endif } @@ -230,6 +288,24 @@ bool cryptonight_init(int av) { opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT; +# ifndef XMRIG_NO_ASM + asm_func_map[AV_SINGLE][VARIANT_2][ASM_INTEL] = cryptonight_single_hash_asm_intel; + asm_func_map[AV_SINGLE][VARIANT_2][ASM_RYZEN] = cryptonight_single_hash_asm_intel; + asm_func_map[AV_SINGLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_single_hash_asm_bulldozer; + + asm_func_map[AV_DOUBLE][VARIANT_2][ASM_INTEL] = cryptonight_double_hash_asm; + asm_func_map[AV_DOUBLE][VARIANT_2][ASM_RYZEN] = cryptonight_double_hash_asm; + asm_func_map[AV_DOUBLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_double_hash_asm; + + asm_func_map[AV_SINGLE][VARIANT_4][ASM_INTEL] = cryptonight_r_av1_asm_intel; + asm_func_map[AV_SINGLE][VARIANT_4][ASM_RYZEN] = cryptonight_r_av1_asm_intel; + asm_func_map[AV_SINGLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av1_asm_bulldozer; + + asm_func_map[AV_DOUBLE][VARIANT_4][ASM_INTEL] = cryptonight_r_av2_asm_intel; + asm_func_map[AV_DOUBLE][VARIANT_4][ASM_RYZEN] = cryptonight_r_av2_asm_intel; + asm_func_map[AV_DOUBLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av2_asm_bulldozer; +# endif + return self_test(); } @@ -267,6 +343,10 @@ static inline enum Variant cryptonight_variant(uint8_t version) return VARIANT_1; } + if (version >= 10) { + return VARIANT_4; + } + if (version >= 8) { return VARIANT_2; } @@ -276,7 +356,7 @@ static inline enum Variant cryptonight_variant(uint8_t version) #ifndef BUILD_TEST -int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) { +int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) { uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39); enum Variant variant = cryptonight_variant(blob[0]); @@ -296,7 +376,7 @@ int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blo } -int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) { +int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) { int rc = 0; uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39); uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size); diff --git a/algo/cryptonight/cryptonight.h b/algo/cryptonight/cryptonight.h index 74646ef5b..4b39ce9e3 100644 --- a/algo/cryptonight/cryptonight.h +++ b/algo/cryptonight/cryptonight.h @@ -6,7 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,9 +39,30 @@ #define MEMORY_LITE 1048576 /* 1 MiB */ +#if defined _MSC_VER || defined XMRIG_ARM +#define ABI_ATTRIBUTE +#else +#define ABI_ATTRIBUTE __attribute__((ms_abi)) +#endif + + +struct cryptonight_ctx; +typedef void(*cn_mainloop_fun_ms_abi)(struct cryptonight_ctx*) ABI_ATTRIBUTE; +typedef void(*cn_mainloop_double_fun_ms_abi)(struct cryptonight_ctx*, struct cryptonight_ctx*) ABI_ATTRIBUTE; + + struct cryptonight_ctx { uint8_t state[224] __attribute__((aligned(16))); - uint8_t* memory __attribute__((aligned(16))); + uint8_t *memory __attribute__((aligned(16))); + + uint8_t unused[40]; + const uint32_t *saes_table; + + cn_mainloop_fun_ms_abi generated_code; + cn_mainloop_double_fun_ms_abi generated_code_double; + uint64_t generated_code_height; + uint64_t generated_code_double_height; + uint64_t height; }; @@ -52,7 +74,8 @@ extern void (* const extra_hashes[4])(const void *, size_t, char *); cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant); bool cryptonight_init(int av); -int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx); -int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx); +int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx); +int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx); + #endif /* XMRIG_CRYPTONIGHT_H */ diff --git a/algo/cryptonight/cryptonight_av1.c b/algo/cryptonight/cryptonight_av1.c index c71635ea5..e4a0662da 100644 --- a/algo/cryptonight/cryptonight_av1.c +++ b/algo/cryptonight/cryptonight_av1.c @@ -196,6 +196,7 @@ void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *res #ifndef XMRIG_NO_ASM extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx); extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx); +extern void cnv2_mainloop_bulldozer_asm(struct cryptonight_ctx *ctx); extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1); @@ -225,6 +226,19 @@ void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t siz } +void cryptonight_single_hash_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + cnv2_mainloop_bulldozer_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { keccak(input, size, ctx[0]->state, 200); diff --git a/algo/cryptonight/cryptonight_monero.h b/algo/cryptonight/cryptonight_monero.h index 2f64ad0a8..a1681a045 100644 --- a/algo/cryptonight/cryptonight_monero.h +++ b/algo/cryptonight/cryptonight_monero.h @@ -6,8 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -29,6 +29,8 @@ #include #include +#include +#include static inline __m128i int_sqrt_v2(const uint64_t n0) @@ -87,6 +89,17 @@ static inline __m128i int_sqrt_v2(const uint64_t n0) _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ } +# define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \ + { \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ + _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \ + } + # define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ { \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ @@ -99,4 +112,39 @@ static inline __m128i int_sqrt_v2(const uint64_t n0) _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ } + +#ifndef NOINLINE +#ifdef __GNUC__ +#define NOINLINE __attribute__ ((noinline)) +#elif _MSC_VER +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif +#endif + +#include "variant4_random_math.h" + +#define VARIANT4_RANDOM_MATH_INIT(part) \ + uint32_t r##part[9]; \ + struct V4_Instruction code##part[256]; \ + { \ + r##part[0] = (uint32_t)(h##part[12]); \ + r##part[1] = (uint32_t)(h##part[12] >> 32); \ + r##part[2] = (uint32_t)(h##part[13]); \ + r##part[3] = (uint32_t)(h##part[13] >> 32); \ + } \ + v4_random_math_init(code##part, ctx[part]->height); + +#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \ + { \ + cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \ + r##part[4] = (uint32_t)(al); \ + r##part[5] = (uint32_t)(ah); \ + r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \ + r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \ + r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(code##part, r##part); \ + } + #endif /* XMRIG_CRYPTONIGHT_MONERO_H */ diff --git a/algo/cryptonight/cryptonight_r_av1.c b/algo/cryptonight/cryptonight_r_av1.c new file mode 100644 index 000000000..4fdebe018 --- /dev/null +++ b/algo/cryptonight/cryptonight_r_av1.c @@ -0,0 +1,143 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017 fireice-uk + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "crypto/c_keccak.h" +#include "cryptonight.h" +#include "cryptonight_aesni.h" +#include "cryptonight_monero.h" + + +void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + VARIANT2_INIT(0); + VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + uint64_t idx0 = al0; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + + cx = _mm_aesenc_si128(cx, ax0); + + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = _mm_cvtsi128_si64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + + lo = _umul128(idx0, cl, &hi); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + bx1 = bx0; + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +#ifndef XMRIG_NO_ASM +void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM); + + +void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (ctx[0]->generated_code_height != ctx[0]->height) { + struct V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, ctx[0]->height); + + v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_INTEL); + ctx[0]->generated_code_height = ctx[0]->height; + } + + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + ctx[0]->generated_code(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (ctx[0]->generated_code_height != ctx[0]->height) { + struct V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, ctx[0]->height); + + v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_BULLDOZER); + ctx[0]->generated_code_height = ctx[0]->height; + } + + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + ctx[0]->generated_code(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} +#endif diff --git a/algo/cryptonight/cryptonight_r_av2.c b/algo/cryptonight/cryptonight_r_av2.c new file mode 100644 index 000000000..db64145a9 --- /dev/null +++ b/algo/cryptonight/cryptonight_r_av2.c @@ -0,0 +1,202 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017 fireice-uk + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "crypto/c_keccak.h" +#include "cryptonight.h" +#include "cryptonight_aesni.h" +#include "cryptonight_monero.h" + + +void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + VARIANT2_INIT(0); + VARIANT2_INIT(1); + VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); + VARIANT4_RANDOM_MATH_INIT(1); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = al0; + uint64_t idx1 = al1; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); + + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); + _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); + + idx0 = _mm_cvtsi128_si64(cx0); + idx1 = _mm_cvtsi128_si64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + + lo = _umul128(idx0, cl, &hi); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + + lo = _umul128(idx1, cl, &hi); + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; + + al1 ^= cl; + ah1 ^= ch; + idx1 = al1; + + bx01 = bx00; + bx11 = bx10; + + bx00 = cx0; + bx10 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +#ifndef XMRIG_NO_ASM +void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM); + + +void cryptonight_r_av2_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (ctx[0]->generated_code_height != ctx[0]->height) { + struct V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, ctx[0]->height); + v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_INTEL); + ctx[0]->generated_code_height = ctx[0]->height; + } + + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); + + ctx[0]->generated_code_double(ctx[0], ctx[1]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); + + keccakf((uint64_t *) ctx[0]->state, 24); + keccakf((uint64_t *) ctx[1]->state, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +void cryptonight_r_av2_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (ctx[0]->generated_code_height != ctx[0]->height) { + struct V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, ctx[0]->height); + v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_BULLDOZER); + ctx[0]->generated_code_height = ctx[0]->height; + } + + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); + + ctx[0]->generated_code_double(ctx[0], ctx[1]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); + + keccakf((uint64_t *) ctx[0]->state, 24); + keccakf((uint64_t *) ctx[1]->state, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} +#endif diff --git a/algo/cryptonight/cryptonight_r_av3.c b/algo/cryptonight/cryptonight_r_av3.c new file mode 100644 index 000000000..a322ebcd8 --- /dev/null +++ b/algo/cryptonight/cryptonight_r_av3.c @@ -0,0 +1,112 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017 fireice-uk + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "crypto/c_keccak.h" +#include "cryptonight.h" +#include "cryptonight_monero.h" +#include "cryptonight_softaes.h" + + +#ifndef XMRIG_NO_ASM +void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM); +#endif + + +void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + +# ifndef XMRIG_NO_ASM + if (ctx[0]->generated_code_height != ctx[0]->height) { + struct V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, ctx[0]->height); + + v4_soft_aes_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_NONE); + ctx[0]->generated_code_height = ctx[0]->height; + } + + ctx[0]->saes_table = (const uint32_t*)saes_table; + ctx[0]->generated_code(ctx[0]); +# else + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + VARIANT2_INIT(0); + VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + uint64_t idx0 = al0; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + + cx = soft_aesenc(cx, ax0); + + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = _mm_cvtsi128_si64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + + lo = _umul128(idx0, cl, &hi); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + bx1 = bx0; + bx0 = cx; + } +# endif + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t *) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} diff --git a/algo/cryptonight/cryptonight_r_av4.c b/algo/cryptonight/cryptonight_r_av4.c new file mode 100644 index 000000000..522b229fc --- /dev/null +++ b/algo/cryptonight/cryptonight_r_av4.c @@ -0,0 +1,143 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017 fireice-uk + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "crypto/c_keccak.h" +#include "cryptonight.h" +#include "cryptonight_monero.h" +#include "cryptonight_softaes.h" + + +void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + VARIANT2_INIT(0); + VARIANT2_INIT(1); + VARIANT2_SET_ROUNDING_MODE(); + VARIANT4_RANDOM_MATH_INIT(0); + VARIANT4_RANDOM_MATH_INIT(1); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = al0; + uint64_t idx1 = al1; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + cx0 = soft_aesenc(cx0, ax0); + cx1 = soft_aesenc(cx1, ax1); + + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); + + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); + _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); + + idx0 = _mm_cvtsi128_si64(cx0); + idx1 = _mm_cvtsi128_si64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + + lo = _umul128(idx0, cl, &hi); + VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + + lo = _umul128(idx1, cl, &hi); + VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; + + al1 ^= cl; + ah1 ^= ch; + idx1 = al1; + + bx01 = bx00; + bx11 = bx10; + + bx00 = cx0; + bx10 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} diff --git a/algo/cryptonight/cryptonight_softaes.h b/algo/cryptonight/cryptonight_softaes.h index 4e25b768e..9a229788e 100644 --- a/algo/cryptonight/cryptonight_softaes.h +++ b/algo/cryptonight/cryptonight_softaes.h @@ -4,9 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2017 fireice-uk - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -30,8 +30,7 @@ #include -extern __m128i soft_aesenc(__m128i in, __m128i key); -extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); +#include "crypto/soft_aes.h" // This will shift and xor tmp1 into itself as 4 32-bit vals such as diff --git a/algo/cryptonight/cryptonight_test.h b/algo/cryptonight/cryptonight_test.h index 04efe911a..67303f32e 100644 --- a/algo/cryptonight/cryptonight_test.h +++ b/algo/cryptonight/cryptonight_test.h @@ -6,8 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,6 +27,9 @@ #define XMRIG_CRYPTONIGHT_TEST_H +#include + + const static uint8_t test_input[152] = { 0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00, 0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B, @@ -67,6 +70,42 @@ const static uint8_t test_output_v2[64] = { }; +struct cn_r_test_input_data +{ + uint64_t height; + size_t size; + uint8_t data[64]; +}; + + +const static struct cn_r_test_input_data cn_r_test_input[] = { + { 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } }, + { 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } }, + { 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } }, + { 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } }, + { 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } }, + { 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } }, + { 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } }, + { 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } }, + { 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } }, + { 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } }, +}; + + +// "cn/r" +const static uint8_t test_output_r[] = { + 0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc, + 0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66, + 0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab, + 0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb, + 0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02, + 0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef, + 0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd, + 0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4, + 0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3, + 0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e, +}; + #ifndef XMRIG_NO_AEON const static uint8_t test_output_v0_lite[64] = { diff --git a/algo/cryptonight/variant4_random_math.h b/algo/cryptonight/variant4_random_math.h new file mode 100644 index 000000000..62a54cb29 --- /dev/null +++ b/algo/cryptonight/variant4_random_math.h @@ -0,0 +1,449 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + + +#include +#include +#include + + +#include "crypto/c_blake256.h" + + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#ifdef __GNUC__ +#define FORCEINLINE __attribute__((always_inline)) inline +#elif _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#ifdef __GNUC__ +#define UNREACHABLE_CODE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +#define SWAP32LE(x) x +#define SWAP64LE(x) x +#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length)) + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +//template +static void v4_random_math(const struct V4_Instruction* code, uint32_t r[9]) +{ +#define REG_BITS 32 +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const uint32_t src = r[op->src_index]; \ + uint32_t *dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +#undef REG_BITS +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, data_size, (char*) data); + *data_index = 0; + } +} + +// Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + data[20] = -38; + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = false; + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // a is always < 4, so we don't need to check bounds here + b = 8; + src_index = b; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif diff --git a/cmake/asm.cmake b/cmake/asm.cmake index 4420342c9..0e605ddb5 100644 --- a/cmake/asm.cmake +++ b/cmake/asm.cmake @@ -1,30 +1,24 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) set(XMRIG_ASM_LIBRARY "xmrig-asm") - if (CMAKE_C_COMPILER_ID MATCHES MSVC) - enable_language(ASM_MASM) + enable_language(ASM) - if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141) - set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm") - else() - set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm") - endif() - - set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM) + if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) + set(XMRIG_ASM_FILES + "crypto/asm/win64/cn_main_loop.S" + "crypto/asm/CryptonightR_template.S" + ) else() - enable_language(ASM) - - if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) - set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S") - else() - set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S") - endif() - - set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C) + set(XMRIG_ASM_FILES + "crypto/asm/cn_main_loop.S" + "crypto/asm/CryptonightR_template.S" + ) endif() - add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE}) - set(XMRIG_ASM_SOURCES "") + set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C) + + add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES}) + set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c") set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C) else() set(XMRIG_ASM_SOURCES "") diff --git a/cpu.c b/cpu.c index 0d28559a0..2509a27e1 100644 --- a/cpu.c +++ b/cpu.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -64,20 +65,20 @@ void cpu_init_common() { if (data.flags[CPU_FEATURE_AES]) { cpu_info.flags |= CPU_FLAG_AES; - -# ifndef XMRIG_NO_ASM - if (data.vendor == VENDOR_AMD) { - cpu_info.assembly = ASM_RYZEN; - } - else if (data.vendor == VENDOR_INTEL) { - cpu_info.assembly = ASM_INTEL; - } -# endif } if (data.flags[CPU_FEATURE_BMI2]) { cpu_info.flags |= CPU_FLAG_BMI2; } + +# ifndef XMRIG_NO_ASM + if (data.vendor == VENDOR_AMD) { + cpu_info.assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER; + } + else if (data.vendor == VENDOR_INTEL) { + cpu_info.assembly = ASM_INTEL; + } +# endif } #endif diff --git a/cpu.h b/cpu.h index e9314bbe7..d979c86f2 100644 --- a/cpu.h +++ b/cpu.h @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/crypto/CryptonightR_gen.c b/crypto/CryptonightR_gen.c new file mode 100644 index 000000000..2f2ace21d --- /dev/null +++ b/crypto/CryptonightR_gen.c @@ -0,0 +1,146 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include "algo/cryptonight/cryptonight_monero.h" +#include "crypto/asm/CryptonightR_template.h" +#include "persistent_memory.h" + + +static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)()) +{ + const ptrdiff_t size = (const uint8_t*)(p2) - (const uint8_t*)(p1); + if (size > 0) { + memcpy(*p, (const void *) p1, size); + *p += size; + } +} + + +static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM) +{ + uint32_t prev_rot_src = (uint32_t)(-1); + + for (int i = 0;; ++i) { + const struct V4_Instruction inst = code[i]; + if (inst.opcode == RET) { + break; + } + + uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); + uint8_t dst_index = inst.dst_index; + uint8_t src_index = inst.src_index; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); + + switch (inst.opcode) { + case ROR: + case ROL: + if (b != prev_rot_src) { + prev_rot_src = b; + add_code(p, instructions_mov[c], instructions_mov[c + 1]); + } + break; + } + + if (a == prev_rot_src) { + prev_rot_src = (uint32_t)(-1); + } + + void_func begin = instructions[c]; + + if ((ASM = ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) { + // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL + // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41 + uint8_t* prefix = (uint8_t*) begin; + + if (*prefix == 0x49) { + **p = 0x41; + *p += 1; + } + + begin = (void_func)(prefix + 1); + } + + add_code(p, begin, instructions[c + 1]); + + if (inst.opcode == ADD) { + *(uint32_t*)(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C; + if (is_64_bit) { + prev_rot_src = (uint32_t)(-1); + } + } + } +} + + +void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM) +{ + uint8_t* p0 = machine_code; + uint8_t* p = p0; + + add_code(&p, CryptonightR_template_part1, CryptonightR_template_part2); + add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(&p, CryptonightR_template_part2, CryptonightR_template_part3); + *(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0)); + add_code(&p, CryptonightR_template_part3, CryptonightR_template_end); + + flush_instruction_cache(machine_code, p - p0); +} + + +void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM) +{ + uint8_t* p0 = (uint8_t*) machine_code; + uint8_t* p = p0; + + add_code(&p, CryptonightR_template_double_part1, CryptonightR_template_double_part2); + add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(&p, CryptonightR_template_double_part2, CryptonightR_template_double_part3); + add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(&p, CryptonightR_template_double_part3, CryptonightR_template_double_part4); + *(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0)); + add_code(&p, CryptonightR_template_double_part4, CryptonightR_template_double_end); + + flush_instruction_cache(machine_code, p - p0); +} + + +void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM) +{ + uint8_t* p0 = machine_code; + uint8_t* p = p0; + + add_code(&p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2); + add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(&p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3); + *(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0)); + add_code(&p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end); + + flush_instruction_cache(machine_code, p - p0); +} diff --git a/crypto/asm/CryptonightR_soft_aes_template.inc b/crypto/asm/CryptonightR_soft_aes_template.inc new file mode 100644 index 000000000..40c7874d2 --- /dev/null +++ b/crypto/asm/CryptonightR_soft_aes_template.inc @@ -0,0 +1,279 @@ +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end) + +ALIGN(64) +FN_PREFIX(CryptonightR_soft_aes_template_part1): + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movq xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 524288 + + ALIGN(64) +FN_PREFIX(CryptonightR_soft_aes_template_mainloop): + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + pxor xmm6, xmm1 + pxor xmm6, xmm0 + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + movaps xmm0, xmm5 + psrldq xmm0, 8 + movd r9d, xmm0 + +FN_PREFIX(CryptonightR_soft_aes_template_part2): + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov edi, edi + shl rbp, 32 + or rbp, rdi + xor r8, rbp + + mov ebx, ebx + shl rsi, 32 + or rsi, rbx + xor QWORD PTR [rsp+320], rsi + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm1 + paddq xmm1, xmm7 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm6, xmm0 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov QWORD PTR [rsp+328], rax + sub r12d, 1 + jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop) + +FN_PREFIX(CryptonightR_soft_aes_template_part3): + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +FN_PREFIX(CryptonightR_soft_aes_template_end): diff --git a/crypto/asm/CryptonightR_template.S b/crypto/asm/CryptonightR_template.S new file mode 100644 index 000000000..d45bcfcc0 --- /dev/null +++ b/crypto/asm/CryptonightR_template.S @@ -0,0 +1,1593 @@ +#ifdef __APPLE__ +# define ALIGN(x) .align 6 +#else +# define ALIGN(x) .align 64 +#endif +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif + +#define PUBLIC .global + +PUBLIC FN_PREFIX(CryptonightR_instruction0) +PUBLIC FN_PREFIX(CryptonightR_instruction1) +PUBLIC FN_PREFIX(CryptonightR_instruction2) +PUBLIC FN_PREFIX(CryptonightR_instruction3) +PUBLIC FN_PREFIX(CryptonightR_instruction4) +PUBLIC FN_PREFIX(CryptonightR_instruction5) +PUBLIC FN_PREFIX(CryptonightR_instruction6) +PUBLIC FN_PREFIX(CryptonightR_instruction7) +PUBLIC FN_PREFIX(CryptonightR_instruction8) +PUBLIC FN_PREFIX(CryptonightR_instruction9) +PUBLIC FN_PREFIX(CryptonightR_instruction10) +PUBLIC FN_PREFIX(CryptonightR_instruction11) +PUBLIC FN_PREFIX(CryptonightR_instruction12) +PUBLIC FN_PREFIX(CryptonightR_instruction13) +PUBLIC FN_PREFIX(CryptonightR_instruction14) +PUBLIC FN_PREFIX(CryptonightR_instruction15) +PUBLIC FN_PREFIX(CryptonightR_instruction16) +PUBLIC FN_PREFIX(CryptonightR_instruction17) +PUBLIC FN_PREFIX(CryptonightR_instruction18) +PUBLIC FN_PREFIX(CryptonightR_instruction19) +PUBLIC FN_PREFIX(CryptonightR_instruction20) +PUBLIC FN_PREFIX(CryptonightR_instruction21) +PUBLIC FN_PREFIX(CryptonightR_instruction22) +PUBLIC FN_PREFIX(CryptonightR_instruction23) +PUBLIC FN_PREFIX(CryptonightR_instruction24) +PUBLIC FN_PREFIX(CryptonightR_instruction25) +PUBLIC FN_PREFIX(CryptonightR_instruction26) +PUBLIC FN_PREFIX(CryptonightR_instruction27) +PUBLIC FN_PREFIX(CryptonightR_instruction28) +PUBLIC FN_PREFIX(CryptonightR_instruction29) +PUBLIC FN_PREFIX(CryptonightR_instruction30) +PUBLIC FN_PREFIX(CryptonightR_instruction31) +PUBLIC FN_PREFIX(CryptonightR_instruction32) +PUBLIC FN_PREFIX(CryptonightR_instruction33) +PUBLIC FN_PREFIX(CryptonightR_instruction34) +PUBLIC FN_PREFIX(CryptonightR_instruction35) +PUBLIC FN_PREFIX(CryptonightR_instruction36) +PUBLIC FN_PREFIX(CryptonightR_instruction37) +PUBLIC FN_PREFIX(CryptonightR_instruction38) +PUBLIC FN_PREFIX(CryptonightR_instruction39) +PUBLIC FN_PREFIX(CryptonightR_instruction40) +PUBLIC FN_PREFIX(CryptonightR_instruction41) +PUBLIC FN_PREFIX(CryptonightR_instruction42) +PUBLIC FN_PREFIX(CryptonightR_instruction43) +PUBLIC FN_PREFIX(CryptonightR_instruction44) +PUBLIC FN_PREFIX(CryptonightR_instruction45) +PUBLIC FN_PREFIX(CryptonightR_instruction46) +PUBLIC FN_PREFIX(CryptonightR_instruction47) +PUBLIC FN_PREFIX(CryptonightR_instruction48) +PUBLIC FN_PREFIX(CryptonightR_instruction49) +PUBLIC FN_PREFIX(CryptonightR_instruction50) +PUBLIC FN_PREFIX(CryptonightR_instruction51) +PUBLIC FN_PREFIX(CryptonightR_instruction52) +PUBLIC FN_PREFIX(CryptonightR_instruction53) +PUBLIC FN_PREFIX(CryptonightR_instruction54) +PUBLIC FN_PREFIX(CryptonightR_instruction55) +PUBLIC FN_PREFIX(CryptonightR_instruction56) +PUBLIC FN_PREFIX(CryptonightR_instruction57) +PUBLIC FN_PREFIX(CryptonightR_instruction58) +PUBLIC FN_PREFIX(CryptonightR_instruction59) +PUBLIC FN_PREFIX(CryptonightR_instruction60) +PUBLIC FN_PREFIX(CryptonightR_instruction61) +PUBLIC FN_PREFIX(CryptonightR_instruction62) +PUBLIC FN_PREFIX(CryptonightR_instruction63) +PUBLIC FN_PREFIX(CryptonightR_instruction64) +PUBLIC FN_PREFIX(CryptonightR_instruction65) +PUBLIC FN_PREFIX(CryptonightR_instruction66) +PUBLIC FN_PREFIX(CryptonightR_instruction67) +PUBLIC FN_PREFIX(CryptonightR_instruction68) +PUBLIC FN_PREFIX(CryptonightR_instruction69) +PUBLIC FN_PREFIX(CryptonightR_instruction70) +PUBLIC FN_PREFIX(CryptonightR_instruction71) +PUBLIC FN_PREFIX(CryptonightR_instruction72) +PUBLIC FN_PREFIX(CryptonightR_instruction73) +PUBLIC FN_PREFIX(CryptonightR_instruction74) +PUBLIC FN_PREFIX(CryptonightR_instruction75) +PUBLIC FN_PREFIX(CryptonightR_instruction76) +PUBLIC FN_PREFIX(CryptonightR_instruction77) +PUBLIC FN_PREFIX(CryptonightR_instruction78) +PUBLIC FN_PREFIX(CryptonightR_instruction79) +PUBLIC FN_PREFIX(CryptonightR_instruction80) +PUBLIC FN_PREFIX(CryptonightR_instruction81) +PUBLIC FN_PREFIX(CryptonightR_instruction82) +PUBLIC FN_PREFIX(CryptonightR_instruction83) +PUBLIC FN_PREFIX(CryptonightR_instruction84) +PUBLIC FN_PREFIX(CryptonightR_instruction85) +PUBLIC FN_PREFIX(CryptonightR_instruction86) +PUBLIC FN_PREFIX(CryptonightR_instruction87) +PUBLIC FN_PREFIX(CryptonightR_instruction88) +PUBLIC FN_PREFIX(CryptonightR_instruction89) +PUBLIC FN_PREFIX(CryptonightR_instruction90) +PUBLIC FN_PREFIX(CryptonightR_instruction91) +PUBLIC FN_PREFIX(CryptonightR_instruction92) +PUBLIC FN_PREFIX(CryptonightR_instruction93) +PUBLIC FN_PREFIX(CryptonightR_instruction94) +PUBLIC FN_PREFIX(CryptonightR_instruction95) +PUBLIC FN_PREFIX(CryptonightR_instruction96) +PUBLIC FN_PREFIX(CryptonightR_instruction97) +PUBLIC FN_PREFIX(CryptonightR_instruction98) +PUBLIC FN_PREFIX(CryptonightR_instruction99) +PUBLIC FN_PREFIX(CryptonightR_instruction100) +PUBLIC FN_PREFIX(CryptonightR_instruction101) +PUBLIC FN_PREFIX(CryptonightR_instruction102) +PUBLIC FN_PREFIX(CryptonightR_instruction103) +PUBLIC FN_PREFIX(CryptonightR_instruction104) +PUBLIC FN_PREFIX(CryptonightR_instruction105) +PUBLIC FN_PREFIX(CryptonightR_instruction106) +PUBLIC FN_PREFIX(CryptonightR_instruction107) +PUBLIC FN_PREFIX(CryptonightR_instruction108) +PUBLIC FN_PREFIX(CryptonightR_instruction109) +PUBLIC FN_PREFIX(CryptonightR_instruction110) +PUBLIC FN_PREFIX(CryptonightR_instruction111) +PUBLIC FN_PREFIX(CryptonightR_instruction112) +PUBLIC FN_PREFIX(CryptonightR_instruction113) +PUBLIC FN_PREFIX(CryptonightR_instruction114) +PUBLIC FN_PREFIX(CryptonightR_instruction115) +PUBLIC FN_PREFIX(CryptonightR_instruction116) +PUBLIC FN_PREFIX(CryptonightR_instruction117) +PUBLIC FN_PREFIX(CryptonightR_instruction118) +PUBLIC FN_PREFIX(CryptonightR_instruction119) +PUBLIC FN_PREFIX(CryptonightR_instruction120) +PUBLIC FN_PREFIX(CryptonightR_instruction121) +PUBLIC FN_PREFIX(CryptonightR_instruction122) +PUBLIC FN_PREFIX(CryptonightR_instruction123) +PUBLIC FN_PREFIX(CryptonightR_instruction124) +PUBLIC FN_PREFIX(CryptonightR_instruction125) +PUBLIC FN_PREFIX(CryptonightR_instruction126) +PUBLIC FN_PREFIX(CryptonightR_instruction127) +PUBLIC FN_PREFIX(CryptonightR_instruction128) +PUBLIC FN_PREFIX(CryptonightR_instruction129) +PUBLIC FN_PREFIX(CryptonightR_instruction130) +PUBLIC FN_PREFIX(CryptonightR_instruction131) +PUBLIC FN_PREFIX(CryptonightR_instruction132) +PUBLIC FN_PREFIX(CryptonightR_instruction133) +PUBLIC FN_PREFIX(CryptonightR_instruction134) +PUBLIC FN_PREFIX(CryptonightR_instruction135) +PUBLIC FN_PREFIX(CryptonightR_instruction136) +PUBLIC FN_PREFIX(CryptonightR_instruction137) +PUBLIC FN_PREFIX(CryptonightR_instruction138) +PUBLIC FN_PREFIX(CryptonightR_instruction139) +PUBLIC FN_PREFIX(CryptonightR_instruction140) +PUBLIC FN_PREFIX(CryptonightR_instruction141) +PUBLIC FN_PREFIX(CryptonightR_instruction142) +PUBLIC FN_PREFIX(CryptonightR_instruction143) +PUBLIC FN_PREFIX(CryptonightR_instruction144) +PUBLIC FN_PREFIX(CryptonightR_instruction145) +PUBLIC FN_PREFIX(CryptonightR_instruction146) +PUBLIC FN_PREFIX(CryptonightR_instruction147) +PUBLIC FN_PREFIX(CryptonightR_instruction148) +PUBLIC FN_PREFIX(CryptonightR_instruction149) +PUBLIC FN_PREFIX(CryptonightR_instruction150) +PUBLIC FN_PREFIX(CryptonightR_instruction151) +PUBLIC FN_PREFIX(CryptonightR_instruction152) +PUBLIC FN_PREFIX(CryptonightR_instruction153) +PUBLIC FN_PREFIX(CryptonightR_instruction154) +PUBLIC FN_PREFIX(CryptonightR_instruction155) +PUBLIC FN_PREFIX(CryptonightR_instruction156) +PUBLIC FN_PREFIX(CryptonightR_instruction157) +PUBLIC FN_PREFIX(CryptonightR_instruction158) +PUBLIC FN_PREFIX(CryptonightR_instruction159) +PUBLIC FN_PREFIX(CryptonightR_instruction160) +PUBLIC FN_PREFIX(CryptonightR_instruction161) +PUBLIC FN_PREFIX(CryptonightR_instruction162) +PUBLIC FN_PREFIX(CryptonightR_instruction163) +PUBLIC FN_PREFIX(CryptonightR_instruction164) +PUBLIC FN_PREFIX(CryptonightR_instruction165) +PUBLIC FN_PREFIX(CryptonightR_instruction166) +PUBLIC FN_PREFIX(CryptonightR_instruction167) +PUBLIC FN_PREFIX(CryptonightR_instruction168) +PUBLIC FN_PREFIX(CryptonightR_instruction169) +PUBLIC FN_PREFIX(CryptonightR_instruction170) +PUBLIC FN_PREFIX(CryptonightR_instruction171) +PUBLIC FN_PREFIX(CryptonightR_instruction172) +PUBLIC FN_PREFIX(CryptonightR_instruction173) +PUBLIC FN_PREFIX(CryptonightR_instruction174) +PUBLIC FN_PREFIX(CryptonightR_instruction175) +PUBLIC FN_PREFIX(CryptonightR_instruction176) +PUBLIC FN_PREFIX(CryptonightR_instruction177) +PUBLIC FN_PREFIX(CryptonightR_instruction178) +PUBLIC FN_PREFIX(CryptonightR_instruction179) +PUBLIC FN_PREFIX(CryptonightR_instruction180) +PUBLIC FN_PREFIX(CryptonightR_instruction181) +PUBLIC FN_PREFIX(CryptonightR_instruction182) +PUBLIC FN_PREFIX(CryptonightR_instruction183) +PUBLIC FN_PREFIX(CryptonightR_instruction184) +PUBLIC FN_PREFIX(CryptonightR_instruction185) +PUBLIC FN_PREFIX(CryptonightR_instruction186) +PUBLIC FN_PREFIX(CryptonightR_instruction187) +PUBLIC FN_PREFIX(CryptonightR_instruction188) +PUBLIC FN_PREFIX(CryptonightR_instruction189) +PUBLIC FN_PREFIX(CryptonightR_instruction190) +PUBLIC FN_PREFIX(CryptonightR_instruction191) +PUBLIC FN_PREFIX(CryptonightR_instruction192) +PUBLIC FN_PREFIX(CryptonightR_instruction193) +PUBLIC FN_PREFIX(CryptonightR_instruction194) +PUBLIC FN_PREFIX(CryptonightR_instruction195) +PUBLIC FN_PREFIX(CryptonightR_instruction196) +PUBLIC FN_PREFIX(CryptonightR_instruction197) +PUBLIC FN_PREFIX(CryptonightR_instruction198) +PUBLIC FN_PREFIX(CryptonightR_instruction199) +PUBLIC FN_PREFIX(CryptonightR_instruction200) +PUBLIC FN_PREFIX(CryptonightR_instruction201) +PUBLIC FN_PREFIX(CryptonightR_instruction202) +PUBLIC FN_PREFIX(CryptonightR_instruction203) +PUBLIC FN_PREFIX(CryptonightR_instruction204) +PUBLIC FN_PREFIX(CryptonightR_instruction205) +PUBLIC FN_PREFIX(CryptonightR_instruction206) +PUBLIC FN_PREFIX(CryptonightR_instruction207) +PUBLIC FN_PREFIX(CryptonightR_instruction208) +PUBLIC FN_PREFIX(CryptonightR_instruction209) +PUBLIC FN_PREFIX(CryptonightR_instruction210) +PUBLIC FN_PREFIX(CryptonightR_instruction211) +PUBLIC FN_PREFIX(CryptonightR_instruction212) +PUBLIC FN_PREFIX(CryptonightR_instruction213) +PUBLIC FN_PREFIX(CryptonightR_instruction214) +PUBLIC FN_PREFIX(CryptonightR_instruction215) +PUBLIC FN_PREFIX(CryptonightR_instruction216) +PUBLIC FN_PREFIX(CryptonightR_instruction217) +PUBLIC FN_PREFIX(CryptonightR_instruction218) +PUBLIC FN_PREFIX(CryptonightR_instruction219) +PUBLIC FN_PREFIX(CryptonightR_instruction220) +PUBLIC FN_PREFIX(CryptonightR_instruction221) +PUBLIC FN_PREFIX(CryptonightR_instruction222) +PUBLIC FN_PREFIX(CryptonightR_instruction223) +PUBLIC FN_PREFIX(CryptonightR_instruction224) +PUBLIC FN_PREFIX(CryptonightR_instruction225) +PUBLIC FN_PREFIX(CryptonightR_instruction226) +PUBLIC FN_PREFIX(CryptonightR_instruction227) +PUBLIC FN_PREFIX(CryptonightR_instruction228) +PUBLIC FN_PREFIX(CryptonightR_instruction229) +PUBLIC FN_PREFIX(CryptonightR_instruction230) +PUBLIC FN_PREFIX(CryptonightR_instruction231) +PUBLIC FN_PREFIX(CryptonightR_instruction232) +PUBLIC FN_PREFIX(CryptonightR_instruction233) +PUBLIC FN_PREFIX(CryptonightR_instruction234) +PUBLIC FN_PREFIX(CryptonightR_instruction235) +PUBLIC FN_PREFIX(CryptonightR_instruction236) +PUBLIC FN_PREFIX(CryptonightR_instruction237) +PUBLIC FN_PREFIX(CryptonightR_instruction238) +PUBLIC FN_PREFIX(CryptonightR_instruction239) +PUBLIC FN_PREFIX(CryptonightR_instruction240) +PUBLIC FN_PREFIX(CryptonightR_instruction241) +PUBLIC FN_PREFIX(CryptonightR_instruction242) +PUBLIC FN_PREFIX(CryptonightR_instruction243) +PUBLIC FN_PREFIX(CryptonightR_instruction244) +PUBLIC FN_PREFIX(CryptonightR_instruction245) +PUBLIC FN_PREFIX(CryptonightR_instruction246) +PUBLIC FN_PREFIX(CryptonightR_instruction247) +PUBLIC FN_PREFIX(CryptonightR_instruction248) +PUBLIC FN_PREFIX(CryptonightR_instruction249) +PUBLIC FN_PREFIX(CryptonightR_instruction250) +PUBLIC FN_PREFIX(CryptonightR_instruction251) +PUBLIC FN_PREFIX(CryptonightR_instruction252) +PUBLIC FN_PREFIX(CryptonightR_instruction253) +PUBLIC FN_PREFIX(CryptonightR_instruction254) +PUBLIC FN_PREFIX(CryptonightR_instruction255) +PUBLIC FN_PREFIX(CryptonightR_instruction256) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov0) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov1) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov2) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov3) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov4) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov5) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov6) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov7) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov8) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov9) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov10) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov11) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov12) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov13) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov14) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov15) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov16) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov17) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov18) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov19) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov20) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov21) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov22) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov23) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov24) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov25) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov26) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov27) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov28) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov29) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov30) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov31) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov32) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov33) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov34) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov35) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov36) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov37) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov38) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov39) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov40) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov41) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov42) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov43) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov44) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov45) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov46) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov47) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov48) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov49) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov50) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov51) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov52) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov53) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov54) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov55) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov56) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov57) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov58) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov59) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov60) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov61) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov62) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov63) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov64) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov65) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov66) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov67) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov68) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov69) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov70) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov71) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov72) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov73) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov74) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov75) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov76) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov77) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov78) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov79) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov80) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov81) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov82) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov83) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov84) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov85) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov86) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov87) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov88) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov89) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov90) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov91) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov92) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov93) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov94) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov95) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov96) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov97) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov98) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov99) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov100) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov101) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov102) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov103) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov104) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov105) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov106) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov107) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov108) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov109) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov110) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov111) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov112) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov113) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov114) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov115) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov116) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov117) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov118) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov119) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov120) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov121) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov122) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov123) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov124) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov125) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov126) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov127) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov128) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov129) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov130) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov131) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov132) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov133) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov134) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov135) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov136) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov137) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov138) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov139) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov140) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov141) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov142) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov143) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov144) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov145) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov146) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov147) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov148) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov149) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov150) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov151) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov152) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov153) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov154) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov155) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov156) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov157) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov158) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov159) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov160) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov161) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov162) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov163) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov164) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov165) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov166) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov167) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov168) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov169) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov170) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov171) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov172) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov173) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov174) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov175) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov176) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov177) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov178) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov179) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov180) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov181) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov182) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov183) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov184) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov185) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov186) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov187) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov188) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov189) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov190) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov191) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov192) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov193) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov194) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov195) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov196) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov197) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov198) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov199) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov200) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov201) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov202) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov203) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov204) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov205) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov206) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov207) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov208) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov209) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov210) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov211) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov212) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov213) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov214) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov215) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov216) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov217) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov218) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov219) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov220) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov221) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov222) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov223) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov224) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov225) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov226) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov227) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov228) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov229) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov230) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov231) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov232) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov233) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov234) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov235) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov236) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov237) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov238) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov239) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov240) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov241) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov242) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov243) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov244) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov245) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov246) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov247) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov248) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov249) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov250) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov251) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov252) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov253) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov254) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov255) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov256) + +#include "CryptonightR_template.inc" +#include "CryptonightR_soft_aes_template.inc" + +FN_PREFIX(CryptonightR_instruction0): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction1): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction2): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction3): + add rbx, r9 + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction4): + sub rbx, r9 +FN_PREFIX(CryptonightR_instruction5): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction6): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction7): + xor rbx, r9 +FN_PREFIX(CryptonightR_instruction8): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction9): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction10): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction11): + add rsi, rbx + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction12): + sub rsi, rbx +FN_PREFIX(CryptonightR_instruction13): + ror esi, cl +FN_PREFIX(CryptonightR_instruction14): + rol esi, cl +FN_PREFIX(CryptonightR_instruction15): + xor rsi, rbx +FN_PREFIX(CryptonightR_instruction16): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction17): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction18): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction19): + add rdi, rbx + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction20): + sub rdi, rbx +FN_PREFIX(CryptonightR_instruction21): + ror edi, cl +FN_PREFIX(CryptonightR_instruction22): + rol edi, cl +FN_PREFIX(CryptonightR_instruction23): + xor rdi, rbx +FN_PREFIX(CryptonightR_instruction24): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction25): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction26): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction27): + add rbp, rbx + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction28): + sub rbp, rbx +FN_PREFIX(CryptonightR_instruction29): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction30): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction31): + xor rbp, rbx +FN_PREFIX(CryptonightR_instruction32): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction33): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction34): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction35): + add rbx, rsi + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction36): + sub rbx, rsi +FN_PREFIX(CryptonightR_instruction37): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction38): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction39): + xor rbx, rsi +FN_PREFIX(CryptonightR_instruction40): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction41): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction42): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction43): + add rsi, r9 + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction44): + sub rsi, r9 +FN_PREFIX(CryptonightR_instruction45): + ror esi, cl +FN_PREFIX(CryptonightR_instruction46): + rol esi, cl +FN_PREFIX(CryptonightR_instruction47): + xor rsi, r9 +FN_PREFIX(CryptonightR_instruction48): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction49): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction50): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction51): + add rdi, rsi + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction52): + sub rdi, rsi +FN_PREFIX(CryptonightR_instruction53): + ror edi, cl +FN_PREFIX(CryptonightR_instruction54): + rol edi, cl +FN_PREFIX(CryptonightR_instruction55): + xor rdi, rsi +FN_PREFIX(CryptonightR_instruction56): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction57): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction58): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction59): + add rbp, rsi + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction60): + sub rbp, rsi +FN_PREFIX(CryptonightR_instruction61): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction62): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction63): + xor rbp, rsi +FN_PREFIX(CryptonightR_instruction64): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction65): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction66): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction67): + add rbx, rdi + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction68): + sub rbx, rdi +FN_PREFIX(CryptonightR_instruction69): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction70): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction71): + xor rbx, rdi +FN_PREFIX(CryptonightR_instruction72): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction73): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction74): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction75): + add rsi, rdi + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction76): + sub rsi, rdi +FN_PREFIX(CryptonightR_instruction77): + ror esi, cl +FN_PREFIX(CryptonightR_instruction78): + rol esi, cl +FN_PREFIX(CryptonightR_instruction79): + xor rsi, rdi +FN_PREFIX(CryptonightR_instruction80): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction81): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction82): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction83): + add rdi, r9 + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction84): + sub rdi, r9 +FN_PREFIX(CryptonightR_instruction85): + ror edi, cl +FN_PREFIX(CryptonightR_instruction86): + rol edi, cl +FN_PREFIX(CryptonightR_instruction87): + xor rdi, r9 +FN_PREFIX(CryptonightR_instruction88): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction89): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction90): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction91): + add rbp, rdi + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction92): + sub rbp, rdi +FN_PREFIX(CryptonightR_instruction93): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction94): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction95): + xor rbp, rdi +FN_PREFIX(CryptonightR_instruction96): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction97): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction98): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction99): + add rbx, rbp + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction100): + sub rbx, rbp +FN_PREFIX(CryptonightR_instruction101): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction102): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction103): + xor rbx, rbp +FN_PREFIX(CryptonightR_instruction104): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction105): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction106): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction107): + add rsi, rbp + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction108): + sub rsi, rbp +FN_PREFIX(CryptonightR_instruction109): + ror esi, cl +FN_PREFIX(CryptonightR_instruction110): + rol esi, cl +FN_PREFIX(CryptonightR_instruction111): + xor rsi, rbp +FN_PREFIX(CryptonightR_instruction112): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction113): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction114): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction115): + add rdi, rbp + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction116): + sub rdi, rbp +FN_PREFIX(CryptonightR_instruction117): + ror edi, cl +FN_PREFIX(CryptonightR_instruction118): + rol edi, cl +FN_PREFIX(CryptonightR_instruction119): + xor rdi, rbp +FN_PREFIX(CryptonightR_instruction120): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction121): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction122): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction123): + add rbp, r9 + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction124): + sub rbp, r9 +FN_PREFIX(CryptonightR_instruction125): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction126): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction127): + xor rbp, r9 +FN_PREFIX(CryptonightR_instruction128): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction129): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction130): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction131): + add rbx, rsp + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction132): + sub rbx, rsp +FN_PREFIX(CryptonightR_instruction133): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction134): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction135): + xor rbx, rsp +FN_PREFIX(CryptonightR_instruction136): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction137): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction138): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction139): + add rsi, rsp + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction140): + sub rsi, rsp +FN_PREFIX(CryptonightR_instruction141): + ror esi, cl +FN_PREFIX(CryptonightR_instruction142): + rol esi, cl +FN_PREFIX(CryptonightR_instruction143): + xor rsi, rsp +FN_PREFIX(CryptonightR_instruction144): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction145): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction146): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction147): + add rdi, rsp + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction148): + sub rdi, rsp +FN_PREFIX(CryptonightR_instruction149): + ror edi, cl +FN_PREFIX(CryptonightR_instruction150): + rol edi, cl +FN_PREFIX(CryptonightR_instruction151): + xor rdi, rsp +FN_PREFIX(CryptonightR_instruction152): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction153): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction154): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction155): + add rbp, rsp + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction156): + sub rbp, rsp +FN_PREFIX(CryptonightR_instruction157): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction158): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction159): + xor rbp, rsp +FN_PREFIX(CryptonightR_instruction160): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction161): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction162): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction163): + add rbx, r15 + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction164): + sub rbx, r15 +FN_PREFIX(CryptonightR_instruction165): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction166): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction167): + xor rbx, r15 +FN_PREFIX(CryptonightR_instruction168): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction169): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction170): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction171): + add rsi, r15 + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction172): + sub rsi, r15 +FN_PREFIX(CryptonightR_instruction173): + ror esi, cl +FN_PREFIX(CryptonightR_instruction174): + rol esi, cl +FN_PREFIX(CryptonightR_instruction175): + xor rsi, r15 +FN_PREFIX(CryptonightR_instruction176): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction177): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction178): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction179): + add rdi, r15 + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction180): + sub rdi, r15 +FN_PREFIX(CryptonightR_instruction181): + ror edi, cl +FN_PREFIX(CryptonightR_instruction182): + rol edi, cl +FN_PREFIX(CryptonightR_instruction183): + xor rdi, r15 +FN_PREFIX(CryptonightR_instruction184): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction185): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction186): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction187): + add rbp, r15 + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction188): + sub rbp, r15 +FN_PREFIX(CryptonightR_instruction189): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction190): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction191): + xor rbp, r15 +FN_PREFIX(CryptonightR_instruction192): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction193): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction194): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction195): + add rbx, rax + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction196): + sub rbx, rax +FN_PREFIX(CryptonightR_instruction197): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction198): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction199): + xor rbx, rax +FN_PREFIX(CryptonightR_instruction200): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction201): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction202): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction203): + add rsi, rax + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction204): + sub rsi, rax +FN_PREFIX(CryptonightR_instruction205): + ror esi, cl +FN_PREFIX(CryptonightR_instruction206): + rol esi, cl +FN_PREFIX(CryptonightR_instruction207): + xor rsi, rax +FN_PREFIX(CryptonightR_instruction208): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction209): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction210): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction211): + add rdi, rax + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction212): + sub rdi, rax +FN_PREFIX(CryptonightR_instruction213): + ror edi, cl +FN_PREFIX(CryptonightR_instruction214): + rol edi, cl +FN_PREFIX(CryptonightR_instruction215): + xor rdi, rax +FN_PREFIX(CryptonightR_instruction216): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction217): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction218): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction219): + add rbp, rax + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction220): + sub rbp, rax +FN_PREFIX(CryptonightR_instruction221): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction222): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction223): + xor rbp, rax +FN_PREFIX(CryptonightR_instruction224): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction225): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction226): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction227): + add rbx, rdx + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction228): + sub rbx, rdx +FN_PREFIX(CryptonightR_instruction229): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction230): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction231): + xor rbx, rdx +FN_PREFIX(CryptonightR_instruction232): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction233): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction234): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction235): + add rsi, rdx + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction236): + sub rsi, rdx +FN_PREFIX(CryptonightR_instruction237): + ror esi, cl +FN_PREFIX(CryptonightR_instruction238): + rol esi, cl +FN_PREFIX(CryptonightR_instruction239): + xor rsi, rdx +FN_PREFIX(CryptonightR_instruction240): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction241): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction242): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction243): + add rdi, rdx + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction244): + sub rdi, rdx +FN_PREFIX(CryptonightR_instruction245): + ror edi, cl +FN_PREFIX(CryptonightR_instruction246): + rol edi, cl +FN_PREFIX(CryptonightR_instruction247): + xor rdi, rdx +FN_PREFIX(CryptonightR_instruction248): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction249): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction250): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction251): + add rbp, rdx + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction252): + sub rbp, rdx +FN_PREFIX(CryptonightR_instruction253): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction254): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction255): + xor rbp, rdx +FN_PREFIX(CryptonightR_instruction256): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction_mov0): + +FN_PREFIX(CryptonightR_instruction_mov1): + +FN_PREFIX(CryptonightR_instruction_mov2): + +FN_PREFIX(CryptonightR_instruction_mov3): + +FN_PREFIX(CryptonightR_instruction_mov4): + +FN_PREFIX(CryptonightR_instruction_mov5): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov6): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov7): + +FN_PREFIX(CryptonightR_instruction_mov8): + +FN_PREFIX(CryptonightR_instruction_mov9): + +FN_PREFIX(CryptonightR_instruction_mov10): + +FN_PREFIX(CryptonightR_instruction_mov11): + +FN_PREFIX(CryptonightR_instruction_mov12): + +FN_PREFIX(CryptonightR_instruction_mov13): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov14): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov15): + +FN_PREFIX(CryptonightR_instruction_mov16): + +FN_PREFIX(CryptonightR_instruction_mov17): + +FN_PREFIX(CryptonightR_instruction_mov18): + +FN_PREFIX(CryptonightR_instruction_mov19): + +FN_PREFIX(CryptonightR_instruction_mov20): + +FN_PREFIX(CryptonightR_instruction_mov21): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov22): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov23): + +FN_PREFIX(CryptonightR_instruction_mov24): + +FN_PREFIX(CryptonightR_instruction_mov25): + +FN_PREFIX(CryptonightR_instruction_mov26): + +FN_PREFIX(CryptonightR_instruction_mov27): + +FN_PREFIX(CryptonightR_instruction_mov28): + +FN_PREFIX(CryptonightR_instruction_mov29): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov30): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov31): + +FN_PREFIX(CryptonightR_instruction_mov32): + +FN_PREFIX(CryptonightR_instruction_mov33): + +FN_PREFIX(CryptonightR_instruction_mov34): + +FN_PREFIX(CryptonightR_instruction_mov35): + +FN_PREFIX(CryptonightR_instruction_mov36): + +FN_PREFIX(CryptonightR_instruction_mov37): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov38): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov39): + +FN_PREFIX(CryptonightR_instruction_mov40): + +FN_PREFIX(CryptonightR_instruction_mov41): + +FN_PREFIX(CryptonightR_instruction_mov42): + +FN_PREFIX(CryptonightR_instruction_mov43): + +FN_PREFIX(CryptonightR_instruction_mov44): + +FN_PREFIX(CryptonightR_instruction_mov45): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov46): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov47): + +FN_PREFIX(CryptonightR_instruction_mov48): + +FN_PREFIX(CryptonightR_instruction_mov49): + +FN_PREFIX(CryptonightR_instruction_mov50): + +FN_PREFIX(CryptonightR_instruction_mov51): + +FN_PREFIX(CryptonightR_instruction_mov52): + +FN_PREFIX(CryptonightR_instruction_mov53): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov54): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov55): + +FN_PREFIX(CryptonightR_instruction_mov56): + +FN_PREFIX(CryptonightR_instruction_mov57): + +FN_PREFIX(CryptonightR_instruction_mov58): + +FN_PREFIX(CryptonightR_instruction_mov59): + +FN_PREFIX(CryptonightR_instruction_mov60): + +FN_PREFIX(CryptonightR_instruction_mov61): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov62): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov63): + +FN_PREFIX(CryptonightR_instruction_mov64): + +FN_PREFIX(CryptonightR_instruction_mov65): + +FN_PREFIX(CryptonightR_instruction_mov66): + +FN_PREFIX(CryptonightR_instruction_mov67): + +FN_PREFIX(CryptonightR_instruction_mov68): + +FN_PREFIX(CryptonightR_instruction_mov69): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov70): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov71): + +FN_PREFIX(CryptonightR_instruction_mov72): + +FN_PREFIX(CryptonightR_instruction_mov73): + +FN_PREFIX(CryptonightR_instruction_mov74): + +FN_PREFIX(CryptonightR_instruction_mov75): + +FN_PREFIX(CryptonightR_instruction_mov76): + +FN_PREFIX(CryptonightR_instruction_mov77): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov78): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov79): + +FN_PREFIX(CryptonightR_instruction_mov80): + +FN_PREFIX(CryptonightR_instruction_mov81): + +FN_PREFIX(CryptonightR_instruction_mov82): + +FN_PREFIX(CryptonightR_instruction_mov83): + +FN_PREFIX(CryptonightR_instruction_mov84): + +FN_PREFIX(CryptonightR_instruction_mov85): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov86): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov87): + +FN_PREFIX(CryptonightR_instruction_mov88): + +FN_PREFIX(CryptonightR_instruction_mov89): + +FN_PREFIX(CryptonightR_instruction_mov90): + +FN_PREFIX(CryptonightR_instruction_mov91): + +FN_PREFIX(CryptonightR_instruction_mov92): + +FN_PREFIX(CryptonightR_instruction_mov93): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov94): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov95): + +FN_PREFIX(CryptonightR_instruction_mov96): + +FN_PREFIX(CryptonightR_instruction_mov97): + +FN_PREFIX(CryptonightR_instruction_mov98): + +FN_PREFIX(CryptonightR_instruction_mov99): + +FN_PREFIX(CryptonightR_instruction_mov100): + +FN_PREFIX(CryptonightR_instruction_mov101): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov102): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov103): + +FN_PREFIX(CryptonightR_instruction_mov104): + +FN_PREFIX(CryptonightR_instruction_mov105): + +FN_PREFIX(CryptonightR_instruction_mov106): + +FN_PREFIX(CryptonightR_instruction_mov107): + +FN_PREFIX(CryptonightR_instruction_mov108): + +FN_PREFIX(CryptonightR_instruction_mov109): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov110): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov111): + +FN_PREFIX(CryptonightR_instruction_mov112): + +FN_PREFIX(CryptonightR_instruction_mov113): + +FN_PREFIX(CryptonightR_instruction_mov114): + +FN_PREFIX(CryptonightR_instruction_mov115): + +FN_PREFIX(CryptonightR_instruction_mov116): + +FN_PREFIX(CryptonightR_instruction_mov117): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov118): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov119): + +FN_PREFIX(CryptonightR_instruction_mov120): + +FN_PREFIX(CryptonightR_instruction_mov121): + +FN_PREFIX(CryptonightR_instruction_mov122): + +FN_PREFIX(CryptonightR_instruction_mov123): + +FN_PREFIX(CryptonightR_instruction_mov124): + +FN_PREFIX(CryptonightR_instruction_mov125): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov126): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov127): + +FN_PREFIX(CryptonightR_instruction_mov128): + +FN_PREFIX(CryptonightR_instruction_mov129): + +FN_PREFIX(CryptonightR_instruction_mov130): + +FN_PREFIX(CryptonightR_instruction_mov131): + +FN_PREFIX(CryptonightR_instruction_mov132): + +FN_PREFIX(CryptonightR_instruction_mov133): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov134): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov135): + +FN_PREFIX(CryptonightR_instruction_mov136): + +FN_PREFIX(CryptonightR_instruction_mov137): + +FN_PREFIX(CryptonightR_instruction_mov138): + +FN_PREFIX(CryptonightR_instruction_mov139): + +FN_PREFIX(CryptonightR_instruction_mov140): + +FN_PREFIX(CryptonightR_instruction_mov141): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov142): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov143): + +FN_PREFIX(CryptonightR_instruction_mov144): + +FN_PREFIX(CryptonightR_instruction_mov145): + +FN_PREFIX(CryptonightR_instruction_mov146): + +FN_PREFIX(CryptonightR_instruction_mov147): + +FN_PREFIX(CryptonightR_instruction_mov148): + +FN_PREFIX(CryptonightR_instruction_mov149): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov150): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov151): + +FN_PREFIX(CryptonightR_instruction_mov152): + +FN_PREFIX(CryptonightR_instruction_mov153): + +FN_PREFIX(CryptonightR_instruction_mov154): + +FN_PREFIX(CryptonightR_instruction_mov155): + +FN_PREFIX(CryptonightR_instruction_mov156): + +FN_PREFIX(CryptonightR_instruction_mov157): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov158): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov159): + +FN_PREFIX(CryptonightR_instruction_mov160): + +FN_PREFIX(CryptonightR_instruction_mov161): + +FN_PREFIX(CryptonightR_instruction_mov162): + +FN_PREFIX(CryptonightR_instruction_mov163): + +FN_PREFIX(CryptonightR_instruction_mov164): + +FN_PREFIX(CryptonightR_instruction_mov165): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov166): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov167): + +FN_PREFIX(CryptonightR_instruction_mov168): + +FN_PREFIX(CryptonightR_instruction_mov169): + +FN_PREFIX(CryptonightR_instruction_mov170): + +FN_PREFIX(CryptonightR_instruction_mov171): + +FN_PREFIX(CryptonightR_instruction_mov172): + +FN_PREFIX(CryptonightR_instruction_mov173): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov174): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov175): + +FN_PREFIX(CryptonightR_instruction_mov176): + +FN_PREFIX(CryptonightR_instruction_mov177): + +FN_PREFIX(CryptonightR_instruction_mov178): + +FN_PREFIX(CryptonightR_instruction_mov179): + +FN_PREFIX(CryptonightR_instruction_mov180): + +FN_PREFIX(CryptonightR_instruction_mov181): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov182): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov183): + +FN_PREFIX(CryptonightR_instruction_mov184): + +FN_PREFIX(CryptonightR_instruction_mov185): + +FN_PREFIX(CryptonightR_instruction_mov186): + +FN_PREFIX(CryptonightR_instruction_mov187): + +FN_PREFIX(CryptonightR_instruction_mov188): + +FN_PREFIX(CryptonightR_instruction_mov189): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov190): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov191): + +FN_PREFIX(CryptonightR_instruction_mov192): + +FN_PREFIX(CryptonightR_instruction_mov193): + +FN_PREFIX(CryptonightR_instruction_mov194): + +FN_PREFIX(CryptonightR_instruction_mov195): + +FN_PREFIX(CryptonightR_instruction_mov196): + +FN_PREFIX(CryptonightR_instruction_mov197): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov198): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov199): + +FN_PREFIX(CryptonightR_instruction_mov200): + +FN_PREFIX(CryptonightR_instruction_mov201): + +FN_PREFIX(CryptonightR_instruction_mov202): + +FN_PREFIX(CryptonightR_instruction_mov203): + +FN_PREFIX(CryptonightR_instruction_mov204): + +FN_PREFIX(CryptonightR_instruction_mov205): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov206): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov207): + +FN_PREFIX(CryptonightR_instruction_mov208): + +FN_PREFIX(CryptonightR_instruction_mov209): + +FN_PREFIX(CryptonightR_instruction_mov210): + +FN_PREFIX(CryptonightR_instruction_mov211): + +FN_PREFIX(CryptonightR_instruction_mov212): + +FN_PREFIX(CryptonightR_instruction_mov213): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov214): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov215): + +FN_PREFIX(CryptonightR_instruction_mov216): + +FN_PREFIX(CryptonightR_instruction_mov217): + +FN_PREFIX(CryptonightR_instruction_mov218): + +FN_PREFIX(CryptonightR_instruction_mov219): + +FN_PREFIX(CryptonightR_instruction_mov220): + +FN_PREFIX(CryptonightR_instruction_mov221): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov222): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov223): + +FN_PREFIX(CryptonightR_instruction_mov224): + +FN_PREFIX(CryptonightR_instruction_mov225): + +FN_PREFIX(CryptonightR_instruction_mov226): + +FN_PREFIX(CryptonightR_instruction_mov227): + +FN_PREFIX(CryptonightR_instruction_mov228): + +FN_PREFIX(CryptonightR_instruction_mov229): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov230): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov231): + +FN_PREFIX(CryptonightR_instruction_mov232): + +FN_PREFIX(CryptonightR_instruction_mov233): + +FN_PREFIX(CryptonightR_instruction_mov234): + +FN_PREFIX(CryptonightR_instruction_mov235): + +FN_PREFIX(CryptonightR_instruction_mov236): + +FN_PREFIX(CryptonightR_instruction_mov237): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov238): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov239): + +FN_PREFIX(CryptonightR_instruction_mov240): + +FN_PREFIX(CryptonightR_instruction_mov241): + +FN_PREFIX(CryptonightR_instruction_mov242): + +FN_PREFIX(CryptonightR_instruction_mov243): + +FN_PREFIX(CryptonightR_instruction_mov244): + +FN_PREFIX(CryptonightR_instruction_mov245): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov246): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov247): + +FN_PREFIX(CryptonightR_instruction_mov248): + +FN_PREFIX(CryptonightR_instruction_mov249): + +FN_PREFIX(CryptonightR_instruction_mov250): + +FN_PREFIX(CryptonightR_instruction_mov251): + +FN_PREFIX(CryptonightR_instruction_mov252): + +FN_PREFIX(CryptonightR_instruction_mov253): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov254): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov255): + +FN_PREFIX(CryptonightR_instruction_mov256): diff --git a/crypto/asm/CryptonightR_template.h b/crypto/asm/CryptonightR_template.h new file mode 100644 index 000000000..e76759469 --- /dev/null +++ b/crypto/asm/CryptonightR_template.h @@ -0,0 +1,1060 @@ +typedef void(*void_func)(); + +void CryptonightR_template_part1(); +void CryptonightR_template_mainloop(); +void CryptonightR_template_part2(); +void CryptonightR_template_part3(); +void CryptonightR_template_end(); +void CryptonightR_template_double_part1(); +void CryptonightR_template_double_mainloop(); +void CryptonightR_template_double_part2(); +void CryptonightR_template_double_part3(); +void CryptonightR_template_double_part4(); +void CryptonightR_template_double_end(); + +void CryptonightR_soft_aes_template_part1(); +void CryptonightR_soft_aes_template_mainloop(); +void CryptonightR_soft_aes_template_part2(); +void CryptonightR_soft_aes_template_part3(); +void CryptonightR_soft_aes_template_end(); +void CryptonightR_soft_aes_template_double_part1(); +void CryptonightR_soft_aes_template_double_mainloop(); +void CryptonightR_soft_aes_template_double_part2(); +void CryptonightR_soft_aes_template_double_part3(); +void CryptonightR_soft_aes_template_double_part4(); +void CryptonightR_soft_aes_template_double_end(); + +void CryptonightR_instruction0(); +void CryptonightR_instruction1(); +void CryptonightR_instruction2(); +void CryptonightR_instruction3(); +void CryptonightR_instruction4(); +void CryptonightR_instruction5(); +void CryptonightR_instruction6(); +void CryptonightR_instruction7(); +void CryptonightR_instruction8(); +void CryptonightR_instruction9(); +void CryptonightR_instruction10(); +void CryptonightR_instruction11(); +void CryptonightR_instruction12(); +void CryptonightR_instruction13(); +void CryptonightR_instruction14(); +void CryptonightR_instruction15(); +void CryptonightR_instruction16(); +void CryptonightR_instruction17(); +void CryptonightR_instruction18(); +void CryptonightR_instruction19(); +void CryptonightR_instruction20(); +void CryptonightR_instruction21(); +void CryptonightR_instruction22(); +void CryptonightR_instruction23(); +void CryptonightR_instruction24(); +void CryptonightR_instruction25(); +void CryptonightR_instruction26(); +void CryptonightR_instruction27(); +void CryptonightR_instruction28(); +void CryptonightR_instruction29(); +void CryptonightR_instruction30(); +void CryptonightR_instruction31(); +void CryptonightR_instruction32(); +void CryptonightR_instruction33(); +void CryptonightR_instruction34(); +void CryptonightR_instruction35(); +void CryptonightR_instruction36(); +void CryptonightR_instruction37(); +void CryptonightR_instruction38(); +void CryptonightR_instruction39(); +void CryptonightR_instruction40(); +void CryptonightR_instruction41(); +void CryptonightR_instruction42(); +void CryptonightR_instruction43(); +void CryptonightR_instruction44(); +void CryptonightR_instruction45(); +void CryptonightR_instruction46(); +void CryptonightR_instruction47(); +void CryptonightR_instruction48(); +void CryptonightR_instruction49(); +void CryptonightR_instruction50(); +void CryptonightR_instruction51(); +void CryptonightR_instruction52(); +void CryptonightR_instruction53(); +void CryptonightR_instruction54(); +void CryptonightR_instruction55(); +void CryptonightR_instruction56(); +void CryptonightR_instruction57(); +void CryptonightR_instruction58(); +void CryptonightR_instruction59(); +void CryptonightR_instruction60(); +void CryptonightR_instruction61(); +void CryptonightR_instruction62(); +void CryptonightR_instruction63(); +void CryptonightR_instruction64(); +void CryptonightR_instruction65(); +void CryptonightR_instruction66(); +void CryptonightR_instruction67(); +void CryptonightR_instruction68(); +void CryptonightR_instruction69(); +void CryptonightR_instruction70(); +void CryptonightR_instruction71(); +void CryptonightR_instruction72(); +void CryptonightR_instruction73(); +void CryptonightR_instruction74(); +void CryptonightR_instruction75(); +void CryptonightR_instruction76(); +void CryptonightR_instruction77(); +void CryptonightR_instruction78(); +void CryptonightR_instruction79(); +void CryptonightR_instruction80(); +void CryptonightR_instruction81(); +void CryptonightR_instruction82(); +void CryptonightR_instruction83(); +void CryptonightR_instruction84(); +void CryptonightR_instruction85(); +void CryptonightR_instruction86(); +void CryptonightR_instruction87(); +void CryptonightR_instruction88(); +void CryptonightR_instruction89(); +void CryptonightR_instruction90(); +void CryptonightR_instruction91(); +void CryptonightR_instruction92(); +void CryptonightR_instruction93(); +void CryptonightR_instruction94(); +void CryptonightR_instruction95(); +void CryptonightR_instruction96(); +void CryptonightR_instruction97(); +void CryptonightR_instruction98(); +void CryptonightR_instruction99(); +void CryptonightR_instruction100(); +void CryptonightR_instruction101(); +void CryptonightR_instruction102(); +void CryptonightR_instruction103(); +void CryptonightR_instruction104(); +void CryptonightR_instruction105(); +void CryptonightR_instruction106(); +void CryptonightR_instruction107(); +void CryptonightR_instruction108(); +void CryptonightR_instruction109(); +void CryptonightR_instruction110(); +void CryptonightR_instruction111(); +void CryptonightR_instruction112(); +void CryptonightR_instruction113(); +void CryptonightR_instruction114(); +void CryptonightR_instruction115(); +void CryptonightR_instruction116(); +void CryptonightR_instruction117(); +void CryptonightR_instruction118(); +void CryptonightR_instruction119(); +void CryptonightR_instruction120(); +void CryptonightR_instruction121(); +void CryptonightR_instruction122(); +void CryptonightR_instruction123(); +void CryptonightR_instruction124(); +void CryptonightR_instruction125(); +void CryptonightR_instruction126(); +void CryptonightR_instruction127(); +void CryptonightR_instruction128(); +void CryptonightR_instruction129(); +void CryptonightR_instruction130(); +void CryptonightR_instruction131(); +void CryptonightR_instruction132(); +void CryptonightR_instruction133(); +void CryptonightR_instruction134(); +void CryptonightR_instruction135(); +void CryptonightR_instruction136(); +void CryptonightR_instruction137(); +void CryptonightR_instruction138(); +void CryptonightR_instruction139(); +void CryptonightR_instruction140(); +void CryptonightR_instruction141(); +void CryptonightR_instruction142(); +void CryptonightR_instruction143(); +void CryptonightR_instruction144(); +void CryptonightR_instruction145(); +void CryptonightR_instruction146(); +void CryptonightR_instruction147(); +void CryptonightR_instruction148(); +void CryptonightR_instruction149(); +void CryptonightR_instruction150(); +void CryptonightR_instruction151(); +void CryptonightR_instruction152(); +void CryptonightR_instruction153(); +void CryptonightR_instruction154(); +void CryptonightR_instruction155(); +void CryptonightR_instruction156(); +void CryptonightR_instruction157(); +void CryptonightR_instruction158(); +void CryptonightR_instruction159(); +void CryptonightR_instruction160(); +void CryptonightR_instruction161(); +void CryptonightR_instruction162(); +void CryptonightR_instruction163(); +void CryptonightR_instruction164(); +void CryptonightR_instruction165(); +void CryptonightR_instruction166(); +void CryptonightR_instruction167(); +void CryptonightR_instruction168(); +void CryptonightR_instruction169(); +void CryptonightR_instruction170(); +void CryptonightR_instruction171(); +void CryptonightR_instruction172(); +void CryptonightR_instruction173(); +void CryptonightR_instruction174(); +void CryptonightR_instruction175(); +void CryptonightR_instruction176(); +void CryptonightR_instruction177(); +void CryptonightR_instruction178(); +void CryptonightR_instruction179(); +void CryptonightR_instruction180(); +void CryptonightR_instruction181(); +void CryptonightR_instruction182(); +void CryptonightR_instruction183(); +void CryptonightR_instruction184(); +void CryptonightR_instruction185(); +void CryptonightR_instruction186(); +void CryptonightR_instruction187(); +void CryptonightR_instruction188(); +void CryptonightR_instruction189(); +void CryptonightR_instruction190(); +void CryptonightR_instruction191(); +void CryptonightR_instruction192(); +void CryptonightR_instruction193(); +void CryptonightR_instruction194(); +void CryptonightR_instruction195(); +void CryptonightR_instruction196(); +void CryptonightR_instruction197(); +void CryptonightR_instruction198(); +void CryptonightR_instruction199(); +void CryptonightR_instruction200(); +void CryptonightR_instruction201(); +void CryptonightR_instruction202(); +void CryptonightR_instruction203(); +void CryptonightR_instruction204(); +void CryptonightR_instruction205(); +void CryptonightR_instruction206(); +void CryptonightR_instruction207(); +void CryptonightR_instruction208(); +void CryptonightR_instruction209(); +void CryptonightR_instruction210(); +void CryptonightR_instruction211(); +void CryptonightR_instruction212(); +void CryptonightR_instruction213(); +void CryptonightR_instruction214(); +void CryptonightR_instruction215(); +void CryptonightR_instruction216(); +void CryptonightR_instruction217(); +void CryptonightR_instruction218(); +void CryptonightR_instruction219(); +void CryptonightR_instruction220(); +void CryptonightR_instruction221(); +void CryptonightR_instruction222(); +void CryptonightR_instruction223(); +void CryptonightR_instruction224(); +void CryptonightR_instruction225(); +void CryptonightR_instruction226(); +void CryptonightR_instruction227(); +void CryptonightR_instruction228(); +void CryptonightR_instruction229(); +void CryptonightR_instruction230(); +void CryptonightR_instruction231(); +void CryptonightR_instruction232(); +void CryptonightR_instruction233(); +void CryptonightR_instruction234(); +void CryptonightR_instruction235(); +void CryptonightR_instruction236(); +void CryptonightR_instruction237(); +void CryptonightR_instruction238(); +void CryptonightR_instruction239(); +void CryptonightR_instruction240(); +void CryptonightR_instruction241(); +void CryptonightR_instruction242(); +void CryptonightR_instruction243(); +void CryptonightR_instruction244(); +void CryptonightR_instruction245(); +void CryptonightR_instruction246(); +void CryptonightR_instruction247(); +void CryptonightR_instruction248(); +void CryptonightR_instruction249(); +void CryptonightR_instruction250(); +void CryptonightR_instruction251(); +void CryptonightR_instruction252(); +void CryptonightR_instruction253(); +void CryptonightR_instruction254(); +void CryptonightR_instruction255(); +void CryptonightR_instruction256(); +void CryptonightR_instruction_mov0(); +void CryptonightR_instruction_mov1(); +void CryptonightR_instruction_mov2(); +void CryptonightR_instruction_mov3(); +void CryptonightR_instruction_mov4(); +void CryptonightR_instruction_mov5(); +void CryptonightR_instruction_mov6(); +void CryptonightR_instruction_mov7(); +void CryptonightR_instruction_mov8(); +void CryptonightR_instruction_mov9(); +void CryptonightR_instruction_mov10(); +void CryptonightR_instruction_mov11(); +void CryptonightR_instruction_mov12(); +void CryptonightR_instruction_mov13(); +void CryptonightR_instruction_mov14(); +void CryptonightR_instruction_mov15(); +void CryptonightR_instruction_mov16(); +void CryptonightR_instruction_mov17(); +void CryptonightR_instruction_mov18(); +void CryptonightR_instruction_mov19(); +void CryptonightR_instruction_mov20(); +void CryptonightR_instruction_mov21(); +void CryptonightR_instruction_mov22(); +void CryptonightR_instruction_mov23(); +void CryptonightR_instruction_mov24(); +void CryptonightR_instruction_mov25(); +void CryptonightR_instruction_mov26(); +void CryptonightR_instruction_mov27(); +void CryptonightR_instruction_mov28(); +void CryptonightR_instruction_mov29(); +void CryptonightR_instruction_mov30(); +void CryptonightR_instruction_mov31(); +void CryptonightR_instruction_mov32(); +void CryptonightR_instruction_mov33(); +void CryptonightR_instruction_mov34(); +void CryptonightR_instruction_mov35(); +void CryptonightR_instruction_mov36(); +void CryptonightR_instruction_mov37(); +void CryptonightR_instruction_mov38(); +void CryptonightR_instruction_mov39(); +void CryptonightR_instruction_mov40(); +void CryptonightR_instruction_mov41(); +void CryptonightR_instruction_mov42(); +void CryptonightR_instruction_mov43(); +void CryptonightR_instruction_mov44(); +void CryptonightR_instruction_mov45(); +void CryptonightR_instruction_mov46(); +void CryptonightR_instruction_mov47(); +void CryptonightR_instruction_mov48(); +void CryptonightR_instruction_mov49(); +void CryptonightR_instruction_mov50(); +void CryptonightR_instruction_mov51(); +void CryptonightR_instruction_mov52(); +void CryptonightR_instruction_mov53(); +void CryptonightR_instruction_mov54(); +void CryptonightR_instruction_mov55(); +void CryptonightR_instruction_mov56(); +void CryptonightR_instruction_mov57(); +void CryptonightR_instruction_mov58(); +void CryptonightR_instruction_mov59(); +void CryptonightR_instruction_mov60(); +void CryptonightR_instruction_mov61(); +void CryptonightR_instruction_mov62(); +void CryptonightR_instruction_mov63(); +void CryptonightR_instruction_mov64(); +void CryptonightR_instruction_mov65(); +void CryptonightR_instruction_mov66(); +void CryptonightR_instruction_mov67(); +void CryptonightR_instruction_mov68(); +void CryptonightR_instruction_mov69(); +void CryptonightR_instruction_mov70(); +void CryptonightR_instruction_mov71(); +void CryptonightR_instruction_mov72(); +void CryptonightR_instruction_mov73(); +void CryptonightR_instruction_mov74(); +void CryptonightR_instruction_mov75(); +void CryptonightR_instruction_mov76(); +void CryptonightR_instruction_mov77(); +void CryptonightR_instruction_mov78(); +void CryptonightR_instruction_mov79(); +void CryptonightR_instruction_mov80(); +void CryptonightR_instruction_mov81(); +void CryptonightR_instruction_mov82(); +void CryptonightR_instruction_mov83(); +void CryptonightR_instruction_mov84(); +void CryptonightR_instruction_mov85(); +void CryptonightR_instruction_mov86(); +void CryptonightR_instruction_mov87(); +void CryptonightR_instruction_mov88(); +void CryptonightR_instruction_mov89(); +void CryptonightR_instruction_mov90(); +void CryptonightR_instruction_mov91(); +void CryptonightR_instruction_mov92(); +void CryptonightR_instruction_mov93(); +void CryptonightR_instruction_mov94(); +void CryptonightR_instruction_mov95(); +void CryptonightR_instruction_mov96(); +void CryptonightR_instruction_mov97(); +void CryptonightR_instruction_mov98(); +void CryptonightR_instruction_mov99(); +void CryptonightR_instruction_mov100(); +void CryptonightR_instruction_mov101(); +void CryptonightR_instruction_mov102(); +void CryptonightR_instruction_mov103(); +void CryptonightR_instruction_mov104(); +void CryptonightR_instruction_mov105(); +void CryptonightR_instruction_mov106(); +void CryptonightR_instruction_mov107(); +void CryptonightR_instruction_mov108(); +void CryptonightR_instruction_mov109(); +void CryptonightR_instruction_mov110(); +void CryptonightR_instruction_mov111(); +void CryptonightR_instruction_mov112(); +void CryptonightR_instruction_mov113(); +void CryptonightR_instruction_mov114(); +void CryptonightR_instruction_mov115(); +void CryptonightR_instruction_mov116(); +void CryptonightR_instruction_mov117(); +void CryptonightR_instruction_mov118(); +void CryptonightR_instruction_mov119(); +void CryptonightR_instruction_mov120(); +void CryptonightR_instruction_mov121(); +void CryptonightR_instruction_mov122(); +void CryptonightR_instruction_mov123(); +void CryptonightR_instruction_mov124(); +void CryptonightR_instruction_mov125(); +void CryptonightR_instruction_mov126(); +void CryptonightR_instruction_mov127(); +void CryptonightR_instruction_mov128(); +void CryptonightR_instruction_mov129(); +void CryptonightR_instruction_mov130(); +void CryptonightR_instruction_mov131(); +void CryptonightR_instruction_mov132(); +void CryptonightR_instruction_mov133(); +void CryptonightR_instruction_mov134(); +void CryptonightR_instruction_mov135(); +void CryptonightR_instruction_mov136(); +void CryptonightR_instruction_mov137(); +void CryptonightR_instruction_mov138(); +void CryptonightR_instruction_mov139(); +void CryptonightR_instruction_mov140(); +void CryptonightR_instruction_mov141(); +void CryptonightR_instruction_mov142(); +void CryptonightR_instruction_mov143(); +void CryptonightR_instruction_mov144(); +void CryptonightR_instruction_mov145(); +void CryptonightR_instruction_mov146(); +void CryptonightR_instruction_mov147(); +void CryptonightR_instruction_mov148(); +void CryptonightR_instruction_mov149(); +void CryptonightR_instruction_mov150(); +void CryptonightR_instruction_mov151(); +void CryptonightR_instruction_mov152(); +void CryptonightR_instruction_mov153(); +void CryptonightR_instruction_mov154(); +void CryptonightR_instruction_mov155(); +void CryptonightR_instruction_mov156(); +void CryptonightR_instruction_mov157(); +void CryptonightR_instruction_mov158(); +void CryptonightR_instruction_mov159(); +void CryptonightR_instruction_mov160(); +void CryptonightR_instruction_mov161(); +void CryptonightR_instruction_mov162(); +void CryptonightR_instruction_mov163(); +void CryptonightR_instruction_mov164(); +void CryptonightR_instruction_mov165(); +void CryptonightR_instruction_mov166(); +void CryptonightR_instruction_mov167(); +void CryptonightR_instruction_mov168(); +void CryptonightR_instruction_mov169(); +void CryptonightR_instruction_mov170(); +void CryptonightR_instruction_mov171(); +void CryptonightR_instruction_mov172(); +void CryptonightR_instruction_mov173(); +void CryptonightR_instruction_mov174(); +void CryptonightR_instruction_mov175(); +void CryptonightR_instruction_mov176(); +void CryptonightR_instruction_mov177(); +void CryptonightR_instruction_mov178(); +void CryptonightR_instruction_mov179(); +void CryptonightR_instruction_mov180(); +void CryptonightR_instruction_mov181(); +void CryptonightR_instruction_mov182(); +void CryptonightR_instruction_mov183(); +void CryptonightR_instruction_mov184(); +void CryptonightR_instruction_mov185(); +void CryptonightR_instruction_mov186(); +void CryptonightR_instruction_mov187(); +void CryptonightR_instruction_mov188(); +void CryptonightR_instruction_mov189(); +void CryptonightR_instruction_mov190(); +void CryptonightR_instruction_mov191(); +void CryptonightR_instruction_mov192(); +void CryptonightR_instruction_mov193(); +void CryptonightR_instruction_mov194(); +void CryptonightR_instruction_mov195(); +void CryptonightR_instruction_mov196(); +void CryptonightR_instruction_mov197(); +void CryptonightR_instruction_mov198(); +void CryptonightR_instruction_mov199(); +void CryptonightR_instruction_mov200(); +void CryptonightR_instruction_mov201(); +void CryptonightR_instruction_mov202(); +void CryptonightR_instruction_mov203(); +void CryptonightR_instruction_mov204(); +void CryptonightR_instruction_mov205(); +void CryptonightR_instruction_mov206(); +void CryptonightR_instruction_mov207(); +void CryptonightR_instruction_mov208(); +void CryptonightR_instruction_mov209(); +void CryptonightR_instruction_mov210(); +void CryptonightR_instruction_mov211(); +void CryptonightR_instruction_mov212(); +void CryptonightR_instruction_mov213(); +void CryptonightR_instruction_mov214(); +void CryptonightR_instruction_mov215(); +void CryptonightR_instruction_mov216(); +void CryptonightR_instruction_mov217(); +void CryptonightR_instruction_mov218(); +void CryptonightR_instruction_mov219(); +void CryptonightR_instruction_mov220(); +void CryptonightR_instruction_mov221(); +void CryptonightR_instruction_mov222(); +void CryptonightR_instruction_mov223(); +void CryptonightR_instruction_mov224(); +void CryptonightR_instruction_mov225(); +void CryptonightR_instruction_mov226(); +void CryptonightR_instruction_mov227(); +void CryptonightR_instruction_mov228(); +void CryptonightR_instruction_mov229(); +void CryptonightR_instruction_mov230(); +void CryptonightR_instruction_mov231(); +void CryptonightR_instruction_mov232(); +void CryptonightR_instruction_mov233(); +void CryptonightR_instruction_mov234(); +void CryptonightR_instruction_mov235(); +void CryptonightR_instruction_mov236(); +void CryptonightR_instruction_mov237(); +void CryptonightR_instruction_mov238(); +void CryptonightR_instruction_mov239(); +void CryptonightR_instruction_mov240(); +void CryptonightR_instruction_mov241(); +void CryptonightR_instruction_mov242(); +void CryptonightR_instruction_mov243(); +void CryptonightR_instruction_mov244(); +void CryptonightR_instruction_mov245(); +void CryptonightR_instruction_mov246(); +void CryptonightR_instruction_mov247(); +void CryptonightR_instruction_mov248(); +void CryptonightR_instruction_mov249(); +void CryptonightR_instruction_mov250(); +void CryptonightR_instruction_mov251(); +void CryptonightR_instruction_mov252(); +void CryptonightR_instruction_mov253(); +void CryptonightR_instruction_mov254(); +void CryptonightR_instruction_mov255(); +void CryptonightR_instruction_mov256(); + +const void_func instructions[257] = { + CryptonightR_instruction0, + CryptonightR_instruction1, + CryptonightR_instruction2, + CryptonightR_instruction3, + CryptonightR_instruction4, + CryptonightR_instruction5, + CryptonightR_instruction6, + CryptonightR_instruction7, + CryptonightR_instruction8, + CryptonightR_instruction9, + CryptonightR_instruction10, + CryptonightR_instruction11, + CryptonightR_instruction12, + CryptonightR_instruction13, + CryptonightR_instruction14, + CryptonightR_instruction15, + CryptonightR_instruction16, + CryptonightR_instruction17, + CryptonightR_instruction18, + CryptonightR_instruction19, + CryptonightR_instruction20, + CryptonightR_instruction21, + CryptonightR_instruction22, + CryptonightR_instruction23, + CryptonightR_instruction24, + CryptonightR_instruction25, + CryptonightR_instruction26, + CryptonightR_instruction27, + CryptonightR_instruction28, + CryptonightR_instruction29, + CryptonightR_instruction30, + CryptonightR_instruction31, + CryptonightR_instruction32, + CryptonightR_instruction33, + CryptonightR_instruction34, + CryptonightR_instruction35, + CryptonightR_instruction36, + CryptonightR_instruction37, + CryptonightR_instruction38, + CryptonightR_instruction39, + CryptonightR_instruction40, + CryptonightR_instruction41, + CryptonightR_instruction42, + CryptonightR_instruction43, + CryptonightR_instruction44, + CryptonightR_instruction45, + CryptonightR_instruction46, + CryptonightR_instruction47, + CryptonightR_instruction48, + CryptonightR_instruction49, + CryptonightR_instruction50, + CryptonightR_instruction51, + CryptonightR_instruction52, + CryptonightR_instruction53, + CryptonightR_instruction54, + CryptonightR_instruction55, + CryptonightR_instruction56, + CryptonightR_instruction57, + CryptonightR_instruction58, + CryptonightR_instruction59, + CryptonightR_instruction60, + CryptonightR_instruction61, + CryptonightR_instruction62, + CryptonightR_instruction63, + CryptonightR_instruction64, + CryptonightR_instruction65, + CryptonightR_instruction66, + CryptonightR_instruction67, + CryptonightR_instruction68, + CryptonightR_instruction69, + CryptonightR_instruction70, + CryptonightR_instruction71, + CryptonightR_instruction72, + CryptonightR_instruction73, + CryptonightR_instruction74, + CryptonightR_instruction75, + CryptonightR_instruction76, + CryptonightR_instruction77, + CryptonightR_instruction78, + CryptonightR_instruction79, + CryptonightR_instruction80, + CryptonightR_instruction81, + CryptonightR_instruction82, + CryptonightR_instruction83, + CryptonightR_instruction84, + CryptonightR_instruction85, + CryptonightR_instruction86, + CryptonightR_instruction87, + CryptonightR_instruction88, + CryptonightR_instruction89, + CryptonightR_instruction90, + CryptonightR_instruction91, + CryptonightR_instruction92, + CryptonightR_instruction93, + CryptonightR_instruction94, + CryptonightR_instruction95, + CryptonightR_instruction96, + CryptonightR_instruction97, + CryptonightR_instruction98, + CryptonightR_instruction99, + CryptonightR_instruction100, + CryptonightR_instruction101, + CryptonightR_instruction102, + CryptonightR_instruction103, + CryptonightR_instruction104, + CryptonightR_instruction105, + CryptonightR_instruction106, + CryptonightR_instruction107, + CryptonightR_instruction108, + CryptonightR_instruction109, + CryptonightR_instruction110, + CryptonightR_instruction111, + CryptonightR_instruction112, + CryptonightR_instruction113, + CryptonightR_instruction114, + CryptonightR_instruction115, + CryptonightR_instruction116, + CryptonightR_instruction117, + CryptonightR_instruction118, + CryptonightR_instruction119, + CryptonightR_instruction120, + CryptonightR_instruction121, + CryptonightR_instruction122, + CryptonightR_instruction123, + CryptonightR_instruction124, + CryptonightR_instruction125, + CryptonightR_instruction126, + CryptonightR_instruction127, + CryptonightR_instruction128, + CryptonightR_instruction129, + CryptonightR_instruction130, + CryptonightR_instruction131, + CryptonightR_instruction132, + CryptonightR_instruction133, + CryptonightR_instruction134, + CryptonightR_instruction135, + CryptonightR_instruction136, + CryptonightR_instruction137, + CryptonightR_instruction138, + CryptonightR_instruction139, + CryptonightR_instruction140, + CryptonightR_instruction141, + CryptonightR_instruction142, + CryptonightR_instruction143, + CryptonightR_instruction144, + CryptonightR_instruction145, + CryptonightR_instruction146, + CryptonightR_instruction147, + CryptonightR_instruction148, + CryptonightR_instruction149, + CryptonightR_instruction150, + CryptonightR_instruction151, + CryptonightR_instruction152, + CryptonightR_instruction153, + CryptonightR_instruction154, + CryptonightR_instruction155, + CryptonightR_instruction156, + CryptonightR_instruction157, + CryptonightR_instruction158, + CryptonightR_instruction159, + CryptonightR_instruction160, + CryptonightR_instruction161, + CryptonightR_instruction162, + CryptonightR_instruction163, + CryptonightR_instruction164, + CryptonightR_instruction165, + CryptonightR_instruction166, + CryptonightR_instruction167, + CryptonightR_instruction168, + CryptonightR_instruction169, + CryptonightR_instruction170, + CryptonightR_instruction171, + CryptonightR_instruction172, + CryptonightR_instruction173, + CryptonightR_instruction174, + CryptonightR_instruction175, + CryptonightR_instruction176, + CryptonightR_instruction177, + CryptonightR_instruction178, + CryptonightR_instruction179, + CryptonightR_instruction180, + CryptonightR_instruction181, + CryptonightR_instruction182, + CryptonightR_instruction183, + CryptonightR_instruction184, + CryptonightR_instruction185, + CryptonightR_instruction186, + CryptonightR_instruction187, + CryptonightR_instruction188, + CryptonightR_instruction189, + CryptonightR_instruction190, + CryptonightR_instruction191, + CryptonightR_instruction192, + CryptonightR_instruction193, + CryptonightR_instruction194, + CryptonightR_instruction195, + CryptonightR_instruction196, + CryptonightR_instruction197, + CryptonightR_instruction198, + CryptonightR_instruction199, + CryptonightR_instruction200, + CryptonightR_instruction201, + CryptonightR_instruction202, + CryptonightR_instruction203, + CryptonightR_instruction204, + CryptonightR_instruction205, + CryptonightR_instruction206, + CryptonightR_instruction207, + CryptonightR_instruction208, + CryptonightR_instruction209, + CryptonightR_instruction210, + CryptonightR_instruction211, + CryptonightR_instruction212, + CryptonightR_instruction213, + CryptonightR_instruction214, + CryptonightR_instruction215, + CryptonightR_instruction216, + CryptonightR_instruction217, + CryptonightR_instruction218, + CryptonightR_instruction219, + CryptonightR_instruction220, + CryptonightR_instruction221, + CryptonightR_instruction222, + CryptonightR_instruction223, + CryptonightR_instruction224, + CryptonightR_instruction225, + CryptonightR_instruction226, + CryptonightR_instruction227, + CryptonightR_instruction228, + CryptonightR_instruction229, + CryptonightR_instruction230, + CryptonightR_instruction231, + CryptonightR_instruction232, + CryptonightR_instruction233, + CryptonightR_instruction234, + CryptonightR_instruction235, + CryptonightR_instruction236, + CryptonightR_instruction237, + CryptonightR_instruction238, + CryptonightR_instruction239, + CryptonightR_instruction240, + CryptonightR_instruction241, + CryptonightR_instruction242, + CryptonightR_instruction243, + CryptonightR_instruction244, + CryptonightR_instruction245, + CryptonightR_instruction246, + CryptonightR_instruction247, + CryptonightR_instruction248, + CryptonightR_instruction249, + CryptonightR_instruction250, + CryptonightR_instruction251, + CryptonightR_instruction252, + CryptonightR_instruction253, + CryptonightR_instruction254, + CryptonightR_instruction255, + CryptonightR_instruction256, +}; + +const void_func instructions_mov[257] = { + CryptonightR_instruction_mov0, + CryptonightR_instruction_mov1, + CryptonightR_instruction_mov2, + CryptonightR_instruction_mov3, + CryptonightR_instruction_mov4, + CryptonightR_instruction_mov5, + CryptonightR_instruction_mov6, + CryptonightR_instruction_mov7, + CryptonightR_instruction_mov8, + CryptonightR_instruction_mov9, + CryptonightR_instruction_mov10, + CryptonightR_instruction_mov11, + CryptonightR_instruction_mov12, + CryptonightR_instruction_mov13, + CryptonightR_instruction_mov14, + CryptonightR_instruction_mov15, + CryptonightR_instruction_mov16, + CryptonightR_instruction_mov17, + CryptonightR_instruction_mov18, + CryptonightR_instruction_mov19, + CryptonightR_instruction_mov20, + CryptonightR_instruction_mov21, + CryptonightR_instruction_mov22, + CryptonightR_instruction_mov23, + CryptonightR_instruction_mov24, + CryptonightR_instruction_mov25, + CryptonightR_instruction_mov26, + CryptonightR_instruction_mov27, + CryptonightR_instruction_mov28, + CryptonightR_instruction_mov29, + CryptonightR_instruction_mov30, + CryptonightR_instruction_mov31, + CryptonightR_instruction_mov32, + CryptonightR_instruction_mov33, + CryptonightR_instruction_mov34, + CryptonightR_instruction_mov35, + CryptonightR_instruction_mov36, + CryptonightR_instruction_mov37, + CryptonightR_instruction_mov38, + CryptonightR_instruction_mov39, + CryptonightR_instruction_mov40, + CryptonightR_instruction_mov41, + CryptonightR_instruction_mov42, + CryptonightR_instruction_mov43, + CryptonightR_instruction_mov44, + CryptonightR_instruction_mov45, + CryptonightR_instruction_mov46, + CryptonightR_instruction_mov47, + CryptonightR_instruction_mov48, + CryptonightR_instruction_mov49, + CryptonightR_instruction_mov50, + CryptonightR_instruction_mov51, + CryptonightR_instruction_mov52, + CryptonightR_instruction_mov53, + CryptonightR_instruction_mov54, + CryptonightR_instruction_mov55, + CryptonightR_instruction_mov56, + CryptonightR_instruction_mov57, + CryptonightR_instruction_mov58, + CryptonightR_instruction_mov59, + CryptonightR_instruction_mov60, + CryptonightR_instruction_mov61, + CryptonightR_instruction_mov62, + CryptonightR_instruction_mov63, + CryptonightR_instruction_mov64, + CryptonightR_instruction_mov65, + CryptonightR_instruction_mov66, + CryptonightR_instruction_mov67, + CryptonightR_instruction_mov68, + CryptonightR_instruction_mov69, + CryptonightR_instruction_mov70, + CryptonightR_instruction_mov71, + CryptonightR_instruction_mov72, + CryptonightR_instruction_mov73, + CryptonightR_instruction_mov74, + CryptonightR_instruction_mov75, + CryptonightR_instruction_mov76, + CryptonightR_instruction_mov77, + CryptonightR_instruction_mov78, + CryptonightR_instruction_mov79, + CryptonightR_instruction_mov80, + CryptonightR_instruction_mov81, + CryptonightR_instruction_mov82, + CryptonightR_instruction_mov83, + CryptonightR_instruction_mov84, + CryptonightR_instruction_mov85, + CryptonightR_instruction_mov86, + CryptonightR_instruction_mov87, + CryptonightR_instruction_mov88, + CryptonightR_instruction_mov89, + CryptonightR_instruction_mov90, + CryptonightR_instruction_mov91, + CryptonightR_instruction_mov92, + CryptonightR_instruction_mov93, + CryptonightR_instruction_mov94, + CryptonightR_instruction_mov95, + CryptonightR_instruction_mov96, + CryptonightR_instruction_mov97, + CryptonightR_instruction_mov98, + CryptonightR_instruction_mov99, + CryptonightR_instruction_mov100, + CryptonightR_instruction_mov101, + CryptonightR_instruction_mov102, + CryptonightR_instruction_mov103, + CryptonightR_instruction_mov104, + CryptonightR_instruction_mov105, + CryptonightR_instruction_mov106, + CryptonightR_instruction_mov107, + CryptonightR_instruction_mov108, + CryptonightR_instruction_mov109, + CryptonightR_instruction_mov110, + CryptonightR_instruction_mov111, + CryptonightR_instruction_mov112, + CryptonightR_instruction_mov113, + CryptonightR_instruction_mov114, + CryptonightR_instruction_mov115, + CryptonightR_instruction_mov116, + CryptonightR_instruction_mov117, + CryptonightR_instruction_mov118, + CryptonightR_instruction_mov119, + CryptonightR_instruction_mov120, + CryptonightR_instruction_mov121, + CryptonightR_instruction_mov122, + CryptonightR_instruction_mov123, + CryptonightR_instruction_mov124, + CryptonightR_instruction_mov125, + CryptonightR_instruction_mov126, + CryptonightR_instruction_mov127, + CryptonightR_instruction_mov128, + CryptonightR_instruction_mov129, + CryptonightR_instruction_mov130, + CryptonightR_instruction_mov131, + CryptonightR_instruction_mov132, + CryptonightR_instruction_mov133, + CryptonightR_instruction_mov134, + CryptonightR_instruction_mov135, + CryptonightR_instruction_mov136, + CryptonightR_instruction_mov137, + CryptonightR_instruction_mov138, + CryptonightR_instruction_mov139, + CryptonightR_instruction_mov140, + CryptonightR_instruction_mov141, + CryptonightR_instruction_mov142, + CryptonightR_instruction_mov143, + CryptonightR_instruction_mov144, + CryptonightR_instruction_mov145, + CryptonightR_instruction_mov146, + CryptonightR_instruction_mov147, + CryptonightR_instruction_mov148, + CryptonightR_instruction_mov149, + CryptonightR_instruction_mov150, + CryptonightR_instruction_mov151, + CryptonightR_instruction_mov152, + CryptonightR_instruction_mov153, + CryptonightR_instruction_mov154, + CryptonightR_instruction_mov155, + CryptonightR_instruction_mov156, + CryptonightR_instruction_mov157, + CryptonightR_instruction_mov158, + CryptonightR_instruction_mov159, + CryptonightR_instruction_mov160, + CryptonightR_instruction_mov161, + CryptonightR_instruction_mov162, + CryptonightR_instruction_mov163, + CryptonightR_instruction_mov164, + CryptonightR_instruction_mov165, + CryptonightR_instruction_mov166, + CryptonightR_instruction_mov167, + CryptonightR_instruction_mov168, + CryptonightR_instruction_mov169, + CryptonightR_instruction_mov170, + CryptonightR_instruction_mov171, + CryptonightR_instruction_mov172, + CryptonightR_instruction_mov173, + CryptonightR_instruction_mov174, + CryptonightR_instruction_mov175, + CryptonightR_instruction_mov176, + CryptonightR_instruction_mov177, + CryptonightR_instruction_mov178, + CryptonightR_instruction_mov179, + CryptonightR_instruction_mov180, + CryptonightR_instruction_mov181, + CryptonightR_instruction_mov182, + CryptonightR_instruction_mov183, + CryptonightR_instruction_mov184, + CryptonightR_instruction_mov185, + CryptonightR_instruction_mov186, + CryptonightR_instruction_mov187, + CryptonightR_instruction_mov188, + CryptonightR_instruction_mov189, + CryptonightR_instruction_mov190, + CryptonightR_instruction_mov191, + CryptonightR_instruction_mov192, + CryptonightR_instruction_mov193, + CryptonightR_instruction_mov194, + CryptonightR_instruction_mov195, + CryptonightR_instruction_mov196, + CryptonightR_instruction_mov197, + CryptonightR_instruction_mov198, + CryptonightR_instruction_mov199, + CryptonightR_instruction_mov200, + CryptonightR_instruction_mov201, + CryptonightR_instruction_mov202, + CryptonightR_instruction_mov203, + CryptonightR_instruction_mov204, + CryptonightR_instruction_mov205, + CryptonightR_instruction_mov206, + CryptonightR_instruction_mov207, + CryptonightR_instruction_mov208, + CryptonightR_instruction_mov209, + CryptonightR_instruction_mov210, + CryptonightR_instruction_mov211, + CryptonightR_instruction_mov212, + CryptonightR_instruction_mov213, + CryptonightR_instruction_mov214, + CryptonightR_instruction_mov215, + CryptonightR_instruction_mov216, + CryptonightR_instruction_mov217, + CryptonightR_instruction_mov218, + CryptonightR_instruction_mov219, + CryptonightR_instruction_mov220, + CryptonightR_instruction_mov221, + CryptonightR_instruction_mov222, + CryptonightR_instruction_mov223, + CryptonightR_instruction_mov224, + CryptonightR_instruction_mov225, + CryptonightR_instruction_mov226, + CryptonightR_instruction_mov227, + CryptonightR_instruction_mov228, + CryptonightR_instruction_mov229, + CryptonightR_instruction_mov230, + CryptonightR_instruction_mov231, + CryptonightR_instruction_mov232, + CryptonightR_instruction_mov233, + CryptonightR_instruction_mov234, + CryptonightR_instruction_mov235, + CryptonightR_instruction_mov236, + CryptonightR_instruction_mov237, + CryptonightR_instruction_mov238, + CryptonightR_instruction_mov239, + CryptonightR_instruction_mov240, + CryptonightR_instruction_mov241, + CryptonightR_instruction_mov242, + CryptonightR_instruction_mov243, + CryptonightR_instruction_mov244, + CryptonightR_instruction_mov245, + CryptonightR_instruction_mov246, + CryptonightR_instruction_mov247, + CryptonightR_instruction_mov248, + CryptonightR_instruction_mov249, + CryptonightR_instruction_mov250, + CryptonightR_instruction_mov251, + CryptonightR_instruction_mov252, + CryptonightR_instruction_mov253, + CryptonightR_instruction_mov254, + CryptonightR_instruction_mov255, + CryptonightR_instruction_mov256, +}; diff --git a/crypto/asm/CryptonightR_template.inc b/crypto/asm/CryptonightR_template.inc new file mode 100644 index 000000000..8ecab7247 --- /dev/null +++ b/crypto/asm/CryptonightR_template.inc @@ -0,0 +1,531 @@ +PUBLIC FN_PREFIX(CryptonightR_template_part1) +PUBLIC FN_PREFIX(CryptonightR_template_mainloop) +PUBLIC FN_PREFIX(CryptonightR_template_part2) +PUBLIC FN_PREFIX(CryptonightR_template_part3) +PUBLIC FN_PREFIX(CryptonightR_template_end) +PUBLIC FN_PREFIX(CryptonightR_template_double_part1) +PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightR_template_double_part2) +PUBLIC FN_PREFIX(CryptonightR_template_double_part3) +PUBLIC FN_PREFIX(CryptonightR_template_double_part4) +PUBLIC FN_PREFIX(CryptonightR_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightR_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movq xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movq xmm0, r12 + movq xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movq xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightR_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movq xmm0, r15 + movq xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + + mov r13d, r9d + mov eax, r9d + xor r9d, 48 + xor r13d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [r13+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + + movq r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r13+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + +FN_PREFIX(CryptonightR_template_part2): + lea rcx, [r10+r11] + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov rax, r13 + mul r12 + add r15, rax + add rsp, rdx + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + movaps xmm3, xmm1 + movdqa xmm2, XMMWORD PTR [r9+r11] + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 + + movdqa xmm7, xmm6 + mov QWORD PTR [rcx], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [rcx+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightR_template_mainloop) + +FN_PREFIX(CryptonightR_template_part3): + movq rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightR_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightR_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movq xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movq xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movq xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movq xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movq xmm0, rcx + mov r11d, 524288 + movq xmm10, rax + punpcklqdq xmm10, xmm0 + + movq xmm14, QWORD PTR [rsp+128] + movq xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightR_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movq xmm0, r12 + mov ecx, ebx + movq xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movq xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movq rdx, xmm6 + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movq xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movq rdi, xmm5 + movq rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movq xmm0, rsp + movq xmm1, rsi + movq xmm2, rdi + movq xmm11, rbp + movq xmm12, r15 + movq xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightR_template_double_part2): + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + + movq rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movq rsi, xmm1 + movq rdi, xmm2 + movq rbp, xmm11 + movq r15, xmm12 + movq rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + pxor xmm6, xmm2 + paddq xmm2, xmm3 + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movq rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movq xmm0, rsp + movq xmm1, rbx + movq xmm2, rsi + movq xmm11, rdi + movq xmm12, rbp + movq xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movq xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightR_template_double_part3): + + movq r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + + movq rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movq rbx, xmm1 + movq rsi, xmm2 + movq rdi, xmm11 + movq rbp, xmm12 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + mov rdi, rcx + mov r8, rax + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 + xor ebp, 48 + paddq xmm1, xmm8 + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movq rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightR_template_double_mainloop) + +FN_PREFIX(CryptonightR_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightR_template_double_end): diff --git a/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc similarity index 99% rename from crypto/asm/cnv2_double_main_loop_sandybridge.inc rename to crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc index e8251bc7f..aa5101a83 100644 --- a/crypto/asm/cnv2_double_main_loop_sandybridge.inc +++ b/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc @@ -94,7 +94,7 @@ lea r9, QWORD PTR [rdx+r13] movdqu xmm15, XMMWORD PTR [r9] - ALIGN 16 + ALIGN(64) main_loop_double_sandybridge: movdqu xmm9, xmm15 mov eax, edx diff --git a/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc b/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc new file mode 100644 index 000000000..c764501db --- /dev/null +++ b/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN(64) +cnv2_main_loop_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movq r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movq xmm0, rax + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_bulldozer + shr rdi, 19 + +sqrt_fixup_bulldozer_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_bulldozer_endp + +sqrt_fixup_bulldozer: + movq r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_bulldozer_ret + +cnv2_main_loop_bulldozer_endp: diff --git a/crypto/asm/cnv2_main_loop_ivybridge.inc b/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc similarity index 99% rename from crypto/asm/cnv2_main_loop_ivybridge.inc rename to crypto/asm/cn2/cnv2_main_loop_ivybridge.inc index 8c2c2d3b0..06f1d28be 100644 --- a/crypto/asm/cnv2_main_loop_ivybridge.inc +++ b/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc @@ -50,7 +50,7 @@ punpcklqdq xmm5, xmm0 movdqu xmm6, XMMWORD PTR [r10+rbx] - ALIGN 16 + ALIGN(64) main_loop_ivybridge: lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d diff --git a/crypto/asm/cnv2_main_loop_ryzen.inc b/crypto/asm/cn2/cnv2_main_loop_ryzen.inc similarity index 99% rename from crypto/asm/cnv2_main_loop_ryzen.inc rename to crypto/asm/cn2/cnv2_main_loop_ryzen.inc index d386aa2df..5dbf5917f 100644 --- a/crypto/asm/cnv2_main_loop_ryzen.inc +++ b/crypto/asm/cn2/cnv2_main_loop_ryzen.inc @@ -45,7 +45,7 @@ movq xmm0, rcx punpcklqdq xmm4, xmm0 - ALIGN 16 + ALIGN(64) main_loop_ryzen: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm0, r11 diff --git a/crypto/asm/cnv2_main_loop.S b/crypto/asm/cn_main_loop.S similarity index 51% rename from crypto/asm/cnv2_main_loop.S rename to crypto/asm/cn_main_loop.S index 4dbcbbda7..a792337f0 100644 --- a/crypto/asm/cnv2_main_loop.S +++ b/crypto/asm/cn_main_loop.S @@ -1,4 +1,8 @@ -#define ALIGN .align +#ifdef __APPLE__ +# define ALIGN(x) .align 6 +#else +# define ALIGN(x) .align 64 +#endif .intel_syntax noprefix #ifdef __APPLE__ # define FN_PREFIX(fn) _ ## fn @@ -9,29 +13,42 @@ #endif .global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) -ALIGN 16 +ALIGN(64) FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cnv2_main_loop_ivybridge.inc" + #include "cn2/cnv2_main_loop_ivybridge.inc" add rsp, 48 ret 0 + mov eax, 3735929054 -ALIGN 16 +ALIGN(64) FN_PREFIX(cnv2_mainloop_ryzen_asm): sub rsp, 48 mov rcx, rdi - #include "cnv2_main_loop_ryzen.inc" + #include "cn2/cnv2_main_loop_ryzen.inc" add rsp, 48 ret 0 + mov eax, 3735929054 -ALIGN 16 +ALIGN(64) +FN_PREFIX(cnv2_mainloop_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn2/cnv2_main_loop_bulldozer.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): sub rsp, 48 mov rcx, rdi mov rdx, rsi - #include "cnv2_double_main_loop_sandybridge.inc" + #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 + mov eax, 3735929054 diff --git a/crypto/asm/cnv2_main_loop.asm b/crypto/asm/cnv2_main_loop.asm deleted file mode 100644 index d95222675..000000000 --- a/crypto/asm/cnv2_main_loop.asm +++ /dev/null @@ -1,25 +0,0 @@ -_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE -PUBLIC cnv2_mainloop_ivybridge_asm -PUBLIC cnv2_mainloop_ryzen_asm -PUBLIC cnv2_double_mainloop_sandybridge_asm - -ALIGN 64 -cnv2_mainloop_ivybridge_asm PROC - INCLUDE cnv2_main_loop_ivybridge.inc - ret 0 -cnv2_mainloop_ivybridge_asm ENDP - -ALIGN 64 -cnv2_mainloop_ryzen_asm PROC - INCLUDE cnv2_main_loop_ryzen.inc - ret 0 -cnv2_mainloop_ryzen_asm ENDP - -ALIGN 64 -cnv2_double_mainloop_sandybridge_asm PROC - INCLUDE cnv2_double_main_loop_sandybridge.inc - ret 0 -cnv2_double_mainloop_sandybridge_asm ENDP - -_TEXT_CNV2_MAINLOOP ENDS -END diff --git a/crypto/asm/win64/cn_main_loop.S b/crypto/asm/win64/cn_main_loop.S new file mode 100644 index 000000000..1200c4dfe --- /dev/null +++ b/crypto/asm/win64/cn_main_loop.S @@ -0,0 +1,31 @@ +#define ALIGN(x) .align 64 +.intel_syntax noprefix +.section .text +.global cnv2_mainloop_ivybridge_asm +.global cnv2_mainloop_ryzen_asm +.global cnv2_mainloop_bulldozer_asm +.global cnv2_double_mainloop_sandybridge_asm + +ALIGN(64) +cnv2_mainloop_ivybridge_asm: + #include "../cn2/cnv2_main_loop_ivybridge.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv2_mainloop_ryzen_asm: + #include "../cn2/cnv2_main_loop_ryzen.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv2_mainloop_bulldozer_asm: + #include "../cn2/cnv2_main_loop_bulldozer.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv2_double_mainloop_sandybridge_asm: + #include "../cn2/cnv2_double_main_loop_sandybridge.inc" + ret 0 + mov eax, 3735929054 diff --git a/crypto/asm/win64/cnv2_main_loop.S b/crypto/asm/win64/cnv2_main_loop.S deleted file mode 100644 index 78eb11859..000000000 --- a/crypto/asm/win64/cnv2_main_loop.S +++ /dev/null @@ -1,21 +0,0 @@ -#define ALIGN .align -.intel_syntax noprefix -.section .text -.global cnv2_mainloop_ivybridge_asm -.global cnv2_mainloop_ryzen_asm -.global cnv2_double_mainloop_sandybridge_asm - -ALIGN 16 -cnv2_mainloop_ivybridge_asm: - #include "../cnv2_main_loop_ivybridge.inc" - ret 0 - -ALIGN 16 -cnv2_mainloop_ryzen_asm: - #include "../cnv2_main_loop_ryzen.inc" - ret 0 - -ALIGN 16 -cnv2_double_mainloop_sandybridge_asm: - #include "../cnv2_double_main_loop_sandybridge.inc" - ret 0 diff --git a/crypto/soft_aes.c b/crypto/soft_aes.c deleted file mode 100644 index 6904c5261..000000000 --- a/crypto/soft_aes.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Additional permission under GNU GPL version 3 section 7 - * - * If you modify this Program, or any covered work, by linking or combining - * it with OpenSSL (or a modified version of that library), containing parts - * covered by the terms of OpenSSL License and SSLeay License, the licensors - * of this Program grant you additional permission to convey the resulting work. - * - */ - -/* - * The orginal author of this AES implementation is Karl Malbrain. - */ - -#ifdef __GNUC__ -#include -#else -#include -#endif // __GNUC__ - -#include - -#define TABLE_ALIGN 32 -#define WPOLY 0x011b -#define N_COLS 4 -#define AES_BLOCK_SIZE 16 -#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) - -#if defined(_MSC_VER) -#define ALIGN __declspec(align(TABLE_ALIGN)) -#elif defined(__GNUC__) -#define ALIGN __attribute__ ((aligned(16))) -#else -#define ALIGN -#endif - -#define rf1(r,c) (r) -#define word_in(x,c) (*((uint32_t*)(x)+(c))) -#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v)) - -#define s(x,c) x[c] -#define si(y,x,c) (s(y,c) = word_in(x, c)) -#define so(y,x,c) word_out(y, c, s(x,c)) -#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) -#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) -#define round(y,x,k) \ -y[0] = (k)[0] ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \ -y[1] = (k)[1] ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \ -y[2] = (k)[2] ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \ -y[3] = (k)[3] ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]); -#define to_byte(x) ((x) & 0xff) -#define bval(x,n) to_byte((x) >> (8 * (n))) - -#define fwd_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))) - -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c)) - -#define sb_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } - -#define rc_data(w) {\ - w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ - w(0x1b), w(0x36) } - -#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ - ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) - -#define h0(x) (x) -#define w0(p) bytes2word(p, 0, 0, 0) -#define w1(p) bytes2word(0, p, 0, 0) -#define w2(p) bytes2word(0, 0, p, 0) -#define w3(p) bytes2word(0, 0, 0, p) - -#define u0(p) bytes2word(f2(p), p, p, f3(p)) -#define u1(p) bytes2word(f3(p), f2(p), p, p) -#define u2(p) bytes2word(p, f3(p), f2(p), p) -#define u3(p) bytes2word(p, p, f3(p), f2(p)) - -#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) -#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) -#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) -#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) - -#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) -#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) -#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY)) -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#define t_dec(m,n) t_##m##n -#define t_set(m,n) t_##m##n -#define t_use(m,n) t_##m##n - -#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) } - -#define four_tables(x,tab,vf,rf,c) \ - (tab[0][bval(vf(x,0,c),rf(0,c))] \ - ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ - ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ - ^ tab[3][bval(vf(x,3,c),rf(3,c))]) - -d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3); - -__m128i soft_aesenc(__m128i in, __m128i key) -{ - uint32_t x0, x1, x2, x3; - x0 = _mm_cvtsi128_si32(in); - x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); - x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); - x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); - - __m128i out = _mm_set_epi32( - (t_fn[0][x3 & 0xff] ^ t_fn[1][(x0 >> 8) & 0xff] ^ t_fn[2][(x1 >> 16) & 0xff] ^ t_fn[3][x2 >> 24]), - (t_fn[0][x2 & 0xff] ^ t_fn[1][(x3 >> 8) & 0xff] ^ t_fn[2][(x0 >> 16) & 0xff] ^ t_fn[3][x1 >> 24]), - (t_fn[0][x1 & 0xff] ^ t_fn[1][(x2 >> 8) & 0xff] ^ t_fn[2][(x3 >> 16) & 0xff] ^ t_fn[3][x0 >> 24]), - (t_fn[0][x0 & 0xff] ^ t_fn[1][(x1 >> 8) & 0xff] ^ t_fn[2][(x2 >> 16) & 0xff] ^ t_fn[3][x3 >> 24])); - - return _mm_xor_si128(out, key); -} - -uint8_t Sbox[256] = { // forward s-box -0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, -0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, -0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, -0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, -0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, -0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, -0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, -0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, -0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, -0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, -0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, -0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, -0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, -0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, -0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, -0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; - -static inline void sub_word(uint8_t* key) -{ - key[0] = Sbox[key[0]]; - key[1] = Sbox[key[1]]; - key[2] = Sbox[key[2]]; - key[3] = Sbox[key[3]]; -} - -#ifdef __clang__ -uint32_t _rotr(uint32_t value, uint32_t amount) -{ - return (value >> amount) | (value << ((32 - amount) & 31)); -} -#endif - -__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) -{ - uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); - uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); - sub_word((uint8_t*)&X1); - sub_word((uint8_t*)&X3); - return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1); -} diff --git a/crypto/soft_aes.h b/crypto/soft_aes.h new file mode 100644 index 000000000..3479b547a --- /dev/null +++ b/crypto/soft_aes.h @@ -0,0 +1,131 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +/* + * Parts of this file are originally copyright (c) 2014-2017, The Monero Project + */ +#pragma once + + +#if defined(XMRIG_ARM) +# include "crypto/SSE2NEON.h" +#elif defined(__GNUC__) +# include +#else +# include +#endif + +#include + + +#define saes_data(w) {\ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ + w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } + +#define SAES_WPOLY 0x011b + +#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ + ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) + +#define saes_f2(x) ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY)) +#define saes_f3(x) (saes_f2(x) ^ x) +#define saes_h0(x) (x) + +#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) +#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) +#define saes_u2(p) saes_b2w( p, saes_f3(p), saes_f2(p), p) +#define saes_u3(p) saes_b2w( p, p, saes_f3(p), saes_f2(p)) + +__attribute__((aligned(16))) const static uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; +__attribute__((aligned(16))) const static uint8_t saes_sbox[256] = saes_data(saes_h0); + + +static inline __m128i soft_aesenc(__m128i in, __m128i key) +{ + uint32_t x0, x1, x2, x3; + x0 = _mm_cvtsi128_si32(in); + x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); + x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); + x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); + + __m128i out = _mm_set_epi32( + (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]), + (saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]), + (saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]), + (saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, key); +} + +static inline uint32_t sub_word(uint32_t key) +{ + return (saes_sbox[key >> 24 ] << 24) | + (saes_sbox[(key >> 16) & 0xff] << 16 ) | + (saes_sbox[(key >> 8) & 0xff] << 8 ) | + saes_sbox[key & 0xff]; +} + +#if defined(__clang__) || defined(XMRIG_ARM) +static inline uint32_t _rotr(uint32_t value, uint32_t amount) +{ + return (value >> amount) | (value << ((32 - amount) & 31)); +} +#endif + + +static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) +{ + const uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55))); + const uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF))); + return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1); +} diff --git a/mac/memory_mac.c b/mac/memory_mac.c index 3f5f714cb..949b0ea6a 100644 --- a/mac/memory_mac.c +++ b/mac/memory_mac.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -74,3 +75,21 @@ void persistent_memory_free() { _mm_free(persistent_memory); } } + + +void *allocate_executable_memory(size_t size) +{ + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); +} + + +void protect_executable_memory(void *p, size_t size) +{ + mprotect(p, size, PROT_READ | PROT_EXEC); +} + + +void flush_instruction_cache(void *p, size_t size) +{ + __builtin___clear_cache((char*) p, (char*)(p) + size); +} diff --git a/memory.c b/memory.c index f40646e69..0ae4b69ff 100644 --- a/memory.c +++ b/memory.c @@ -4,7 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2018 XMRig + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -41,14 +43,29 @@ void * persistent_calloc(size_t num, size_t size) { } +void init_cn_r(struct cryptonight_ctx *ctx) +{ + uint8_t *p = allocate_executable_memory(0x4000); + + ctx->generated_code = (cn_mainloop_fun_ms_abi) p; + ctx->generated_code_double = (cn_mainloop_double_fun_ms_abi)(p + 0x2000); + ctx->generated_code_height = ctx->generated_code_double_height = (uint64_t)(-1); + ctx->height = 0; +} + + void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id) { const int ratio = (opt_double_hash && opt_algo == ALGO_CRYPTONIGHT) ? 2 : 1; ctx[0] = persistent_calloc(1, sizeof(struct cryptonight_ctx)); ctx[0]->memory = &persistent_memory[MEMORY * (thr_id * ratio + 1)]; + init_cn_r(ctx[0]); + if (opt_double_hash) { ctx[1] = persistent_calloc(1, sizeof(struct cryptonight_ctx)); ctx[1]->memory = ctx[0]->memory + (opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE); + + init_cn_r(ctx[1]); } } diff --git a/options.c b/options.c index 8fe9bff81..cfe1bfe05 100644 --- a/options.c +++ b/options.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -74,6 +75,8 @@ static struct AlgoData const algorithms[] = { { "cryptonight/0", "cn/0", ALGO_CRYPTONIGHT, VARIANT_0 }, { "cryptonight/1", "cn/1", ALGO_CRYPTONIGHT, VARIANT_1 }, { "cryptonight/2", "cn/2", ALGO_CRYPTONIGHT, VARIANT_2 }, + { "cryptonight/4", "cn/4", ALGO_CRYPTONIGHT, VARIANT_4 }, + { "cryptonight/r", "cn/r", ALGO_CRYPTONIGHT, VARIANT_4 }, # ifndef XMRIG_NO_AEON { "cryptonight-lite", "cn-lite", ALGO_CRYPTONIGHT_LITE, VARIANT_AUTO }, @@ -88,7 +91,7 @@ static char const usage[] = "\ Usage: " APP_ID " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO cryptonight (default) or cryptonight-lite\n\ - --variant=N cryptonight variant: 0-2\n\ + --variant=N cryptonight variant: 0-4\n\ -o, --url=URL URL of mining server\n\ -b, --backup-url=URL URL of backup mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -124,7 +127,7 @@ static struct option const options[] = { { "cpu-affinity", 1, NULL, 1020 }, { "donate-level", 1, NULL, 1003 }, { "help", 0, NULL, 'h' }, - { "keepalive", 0, NULL ,'k' }, + { "keepalive", 0, NULL, 'k' }, { "max-cpu-usage", 1, NULL, 1004 }, { "nicehash", 0, NULL, 1006 }, { "no-color", 0, NULL, 1002 }, @@ -156,6 +159,7 @@ static const char *variant_names[] = { "0", "1", "2", + "4" }; @@ -163,7 +167,8 @@ static const char *asm_names[] = { "none", "auto", "intel", - "ryzen" + "ryzen", + "bulldozer" }; @@ -380,9 +385,13 @@ static void parse_arg(int key, char *arg) { case 1021: /* --variant */ v = atoi(arg); - if (v > VARIANT_AUTO && v < VARIANT_MAX) { + if (v == 4 || strcasecmp(arg, "r") == 0) { + opt_variant = VARIANT_4; + } + else if (v > VARIANT_AUTO && v < VARIANT_MAX) { opt_variant = v; } + break; case 1006: /* --nicehash */ @@ -397,7 +406,7 @@ static void parse_arg(int key, char *arg) { static void parse_config(json_t *config, char *ref) { - int i; + size_t i; char buf[16]; json_t *val; diff --git a/options.h b/options.h index 7117130b8..f29a7fc8f 100644 --- a/options.h +++ b/options.h @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -44,6 +45,7 @@ enum Variant { VARIANT_0 = 0, VARIANT_1 = 1, VARIANT_2 = 2, + VARIANT_4 = 3, VARIANT_MAX }; @@ -63,6 +65,7 @@ enum Assembly { ASM_AUTO, ASM_INTEL, ASM_RYZEN, + ASM_BULLDOZER, ASM_MAX }; diff --git a/persistent_memory.h b/persistent_memory.h index 171a86acb..a04995740 100644 --- a/persistent_memory.h +++ b/persistent_memory.h @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -47,8 +48,15 @@ extern int persistent_memory_flags; const char * persistent_memory_allocate(); void persistent_memory_free(); -void * persistent_calloc(size_t num, size_t size); +void *persistent_calloc(size_t num, size_t size); void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id); +void *allocate_executable_memory(size_t size); +void flush_instruction_cache(void *p, size_t size); +void init_cn_r(struct cryptonight_ctx *ctx); +void protect_executable_memory(void *p, size_t size); + + + #endif /* XMRIG_PERSISTENT_MEMORY_H */ diff --git a/stratum.c b/stratum.c index e10904762..672b6b583 100644 --- a/stratum.c +++ b/stratum.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -709,6 +710,8 @@ static bool job_decode(const json_t *job) { memset(work.job_id, 0, sizeof(work.job_id)); memcpy(work.job_id, job_id, strlen(job_id)); + work.height = (uint64_t) json_integer_value(json_object_get(job, "height")); + return true; } diff --git a/stratum.h b/stratum.h index 00fd426fd..13f8f513f 100644 --- a/stratum.h +++ b/stratum.h @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,6 +43,7 @@ struct work { uint32_t target __attribute__((aligned(16))); uint32_t hash[8] __attribute__((aligned(16))); char job_id[64] __attribute__((aligned(16))); + uint64_t height; }; diff --git a/unix/memory_unix.c b/unix/memory_unix.c index 2e794a037..8954aebe8 100644 --- a/unix/memory_unix.c +++ b/unix/memory_unix.c @@ -74,3 +74,23 @@ void persistent_memory_free() { _mm_free(persistent_memory); } } + + +void *allocate_executable_memory(size_t size) +{ + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +} + + +void protect_executable_memory(void *p, size_t size) +{ + mprotect(p, size, PROT_READ | PROT_EXEC); +} + + +void flush_instruction_cache(void *p, size_t size) +{ +# ifndef __FreeBSD__ + __builtin___clear_cache((char*) p, (char*)(p) + size); +# endif +} diff --git a/version.h b/version.h index c99cea7ea..b49ccd92a 100644 --- a/version.h +++ b/version.h @@ -5,7 +5,8 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,13 +28,13 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "Monero (XMR) CPU miner" -#define APP_VERSION "0.9.0-dev" +#define APP_VERSION "0.10.0-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" -#define APP_COPYRIGHT "Copyright (C) 2016-2018 xmrig.com" +#define APP_COPYRIGHT "Copyright (C) 2016-2019 xmrig.com" #define APP_VER_MAJOR 0 -#define APP_VER_MINOR 9 +#define APP_VER_MINOR 10 #define APP_VER_BUILD 0 #define APP_VER_REV 0 diff --git a/win/memory_win.c b/win/memory_win.c index 9ff63dade..34c3c62f1 100644 --- a/win/memory_win.c +++ b/win/memory_win.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,9 +22,6 @@ * along with this program. If not, see . */ -#ifndef __MEMORY_H__ -#define __MEMORY_H__ - #include #include #include @@ -172,4 +170,21 @@ void persistent_memory_free() { } } -#endif /* __MEMORY_H__ */ + +void *allocate_executable_memory(size_t size) +{ + return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); +} + + +void protect_executable_memory(void *p, size_t size) +{ + DWORD oldProtect; + VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect); +} + + +void flush_instruction_cache(void *p, size_t size) +{ + FlushInstructionCache(GetCurrentProcess(), p, size); +} diff --git a/xmrig.c b/xmrig.c index d79808db0..c1c3865f6 100644 --- a/xmrig.c +++ b/xmrig.c @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -306,8 +307,10 @@ static void *miner_thread(void *userdata) { struct timeval tv_start; gettimeofday(&tv_start, NULL); + persistentctx[0]->height = work.height; + /* scan nonces for a proof-of-work hash */ - const int rc = scanhash_cryptonight(thr_id, hash, (const uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); + const int rc = scanhash_cryptonight(thr_id, hash, (uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); stats_add_hashes(thr_id, &tv_start, hashes_done); if (!rc) { @@ -391,6 +394,9 @@ static void *miner_thread_double(void *userdata) { struct timeval tv_start; gettimeofday(&tv_start, NULL); + persistentctx[0]->height = work.height; + persistentctx[1]->height = work.height; + /* scan nonces for a proof-of-work hash */ const int rc = scanhash_cryptonight_double(thr_id, (uint32_t *) double_hash, double_blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); stats_add_hashes(thr_id, &tv_start, hashes_done);