Merge branch 'feature-cn4' of https://github.com/SChernykh/xmrig into evo

This commit is contained in:
XMRig 2019-02-19 08:15:36 +07:00
commit 41feb756bf
28 changed files with 2623 additions and 266 deletions

View file

@ -54,8 +54,10 @@ MemInfo Mem::create(cryptonight_ctx **ctx, xmrig::Algo algorithm, size_t count)
uint8_t* p = reinterpret_cast<uint8_t*>(allocateExecutableMemory(0x4000));
c->generated_code = reinterpret_cast<cn_mainloop_fun_ms_abi>(p);
c->generated_code_double = reinterpret_cast<cn_mainloop_double_fun_ms_abi>(p + 0x2000);
c->generated_code_height = (uint64_t)(-1);
c->generated_code_double_height = (uint64_t)(-1);
c->generated_code_data.variant = xmrig::VARIANT_MAX;
c->generated_code_data.height = (uint64_t)(-1);
c->generated_code_double_data = c->generated_code_data;
ctx[i] = c;
}

View file

@ -479,18 +479,19 @@ void xmrig::Pool::rebuild()
m_algorithms.push_back(m_algorithm);
# ifndef XMRIG_PROXY_PROJECT
addVariant(VARIANT_WOW);
addVariant(VARIANT_2);
addVariant(VARIANT_1);
addVariant(VARIANT_0);
addVariant(VARIANT_HALF);
addVariant(VARIANT_XTL);
addVariant(VARIANT_TUBE);
addVariant(VARIANT_MSR);
addVariant(VARIANT_XHV);
addVariant(VARIANT_XAO);
addVariant(VARIANT_RTO);
addVariant(VARIANT_GPU);
addVariant(VARIANT_AUTO);
addVariant(xmrig::VARIANT_4);
addVariant(xmrig::VARIANT_WOW);
addVariant(xmrig::VARIANT_2);
addVariant(xmrig::VARIANT_1);
addVariant(xmrig::VARIANT_0);
addVariant(xmrig::VARIANT_HALF);
addVariant(xmrig::VARIANT_XTL);
addVariant(xmrig::VARIANT_TUBE);
addVariant(xmrig::VARIANT_MSR);
addVariant(xmrig::VARIANT_XHV);
addVariant(xmrig::VARIANT_XAO);
addVariant(xmrig::VARIANT_RTO);
addVariant(xmrig::VARIANT_GPU);
addVariant(xmrig::VARIANT_AUTO);
# endif
}

View file

@ -65,6 +65,7 @@ static AlgoData const algorithms[] = {
{ "cryptonight/half", "cn/half", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF },
{ "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF },
{ "cryptonight/wow", "cn/wow", xmrig::CRYPTONIGHT, xmrig::VARIANT_WOW },
{ "cryptonight/r", "cn/r", xmrig::CRYPTONIGHT, xmrig::VARIANT_4 },
# ifndef XMRIG_NO_AEON
{ "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO },
@ -130,6 +131,7 @@ static const char *variants[] = {
"trtl",
"gpu",
"wow",
"r",
};

View file

@ -244,7 +244,7 @@ xmrig::Variant xmrig::Job::variant() const
{
switch (m_algorithm.algo()) {
case CRYPTONIGHT:
return (m_blob[0] >= 8) ? VARIANT_2 : VARIANT_1;
return (m_blob[0] >= 10) ? VARIANT_4 : ((m_blob[0] >= 8) ? VARIANT_2 : VARIANT_1);
case CRYPTONIGHT_LITE:
return VARIANT_1;

View file

@ -75,6 +75,7 @@ enum Variant {
VARIANT_TRTL = 10, // CryptoNight Turtle (TRTL)
VARIANT_GPU = 11, // CryptoNight-GPU (Ryo)
VARIANT_WOW = 12, // CryptoNightR (Wownero)
VARIANT_4 = 13, // CryptoNightR (Monero's variant 4)
VARIANT_MAX
};

View file

@ -39,13 +39,20 @@ struct cryptonight_ctx;
typedef void(*cn_mainloop_fun_ms_abi)(cryptonight_ctx*) ABI_ATTRIBUTE;
typedef void(*cn_mainloop_double_fun_ms_abi)(cryptonight_ctx*, cryptonight_ctx*) ABI_ATTRIBUTE;
struct cryptonight_r_data {
int variant;
uint64_t height;
bool match(const int v, const uint64_t h) const { return (v == variant) && (h == height); }
};
struct cryptonight_ctx {
alignas(16) uint8_t state[224];
alignas(16) uint8_t *memory;
cn_mainloop_fun_ms_abi generated_code;
cn_mainloop_double_fun_ms_abi generated_code_double;
uint64_t generated_code_height;
uint64_t generated_code_double_height;
cryptonight_r_data generated_code_data;
cryptonight_r_data generated_code_double_data;
};

View file

@ -431,12 +431,12 @@ static inline __m128i aes_round_tweak_div(const __m128i &in, const __m128i &key)
template<xmrig::Variant VARIANT, xmrig::Variant BASE>
static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i cx)
static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx)
{
uint64_t* mem_out = (uint64_t*)&l[idx];
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1);
VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx);
_mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx));
} else {
__m128i tmp = _mm_xor_si128(bx0, cx);
@ -515,8 +515,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
if (VARIANT == xmrig::VARIANT_WOW) {
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
if (VARIANT == xmrig::VARIANT_4) {
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
}
} else {
VARIANT2_INTEGER_MATH(0, cl, cx);
}
@ -525,7 +529,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
if (VARIANT == xmrig::VARIANT_4) {
VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx);
} else {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
}
}
al0 += hi;
@ -686,8 +694,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
if (VARIANT == xmrig::VARIANT_WOW) {
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
if (VARIANT == xmrig::VARIANT_4) {
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
}
} else {
VARIANT2_INTEGER_MATH(0, cl, cx0);
}
@ -696,7 +708,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo);
if (VARIANT == xmrig::VARIANT_4) {
VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0);
} else {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo);
}
}
al0 += hi;
@ -736,8 +752,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l1[idx1 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
if (VARIANT == xmrig::VARIANT_WOW) {
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
if (VARIANT == xmrig::VARIANT_4) {
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
}
} else {
VARIANT2_INTEGER_MATH(1, cl, cx1);
}
@ -746,7 +766,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
lo = __umul128(idx1, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo);
if (VARIANT == xmrig::VARIANT_4) {
VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1);
} else {
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo);
}
}
al1 += hi;

View file

@ -127,6 +127,7 @@ template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_0>()
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_1>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_2>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_WOW>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_4>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_XTL>() { return CRYPTONIGHT_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_HALF>() { return CRYPTONIGHT_HALF_ITER; }
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_MSR>() { return CRYPTONIGHT_HALF_ITER; }
@ -197,8 +198,13 @@ template<> inline constexpr Variant cn_base_variant<VARIANT_HALF>() { return VA
template<> inline constexpr Variant cn_base_variant<VARIANT_TRTL>() { return VARIANT_2; }
template<> inline constexpr Variant cn_base_variant<VARIANT_GPU>() { return VARIANT_GPU; }
template<> inline constexpr Variant cn_base_variant<VARIANT_WOW>() { return VARIANT_2; }
template<> inline constexpr Variant cn_base_variant<VARIANT_4>() { return VARIANT_2; }
template<Variant variant> inline constexpr bool cn_is_cryptonight_r() { return false; }
template<> inline constexpr bool cn_is_cryptonight_r<VARIANT_WOW>() { return true; }
template<> inline constexpr bool cn_is_cryptonight_r<VARIANT_4>() { return true; }
} /* namespace xmrig */

View file

@ -83,7 +83,7 @@
sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
} while (0)
# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
do { \
const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
@ -91,6 +91,9 @@
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
if (VARIANT == xmrig::VARIANT_4) { \
_c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
} \
} while (0)
# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
@ -125,7 +128,7 @@
sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \
} while (0)
# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
do { \
const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))); \
const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \
@ -133,6 +136,9 @@
vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
if (VARIANT == xmrig::VARIANT_4) { \
_c = veorq_u64(veorq_u64(_c, chunk3), veorq_u64(chunk1, chunk2)); \
} \
} while (0)
# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
@ -152,26 +158,28 @@
#define SWAP64LE(x) x
#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
#include "common/xmrig.h"
#include "variant4_random_math.h"
#define VARIANT4_RANDOM_MATH_INIT(part) \
uint32_t r##part[8]; \
uint32_t r##part[9]; \
struct V4_Instruction code##part[256]; \
if (VARIANT == xmrig::VARIANT_WOW) { \
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \
r##part[0] = (uint32_t)(h##part[12]); \
r##part[1] = (uint32_t)(h##part[12] >> 32); \
r##part[2] = (uint32_t)(h##part[13]); \
r##part[3] = (uint32_t)(h##part[13] >> 32); \
} \
v4_random_math_init(code##part, height);
v4_random_math_init<VARIANT>(code##part, height);
#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
if (VARIANT == xmrig::VARIANT_WOW) { \
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \
cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
r##part[4] = static_cast<uint32_t>(al); \
r##part[5] = static_cast<uint32_t>(ah); \
r##part[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \
r##part[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \
r##part[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
v4_random_math(code##part, r##part); \
}

View file

@ -58,8 +58,7 @@ const static uint8_t test_input[380] = {
0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04
};
const static char* test_input_WOW = R"===(
9d47bf4c41b7e8e727e681715acb47fa1677cdba9ca7bcb05ad8cc8abd5daa66 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260
const static char* test_input_WOW = R"===(9d47bf4c41b7e8e727e681715acb47fa1677cdba9ca7bcb05ad8cc8abd5daa66 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260
0d4a495cb844a3ca8ba4edb8e6bcf829ef1c06d9cdea2b62ca46c2a21b8b0a79 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261
a1d6d848b5c5915fccd2f64cf216c6b1a02cf7c77bc80d8d4e51b419e88ff0dd 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262
af3a8544a0221a148c2ac90484b19861e3afca33fe17021efb8ad6496b567915 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263
@ -68,8 +67,18 @@ af3a8544a0221a148c2ac90484b19861e3afca33fe17021efb8ad6496b567915 657420646f6c6f7
2b13000535f3db5f9b9b84a65c4351f386cd2cdedebb8c3ad2eab086e6a3fee5 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266
fc0e1dad8e895749dc90eb690bc1ba059a1cd772afaaf65a106bf9e5e6b80503 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267
b60b0afe144deff7d903ed2d5545e77ebe66a3c51fee7016eeb8fee9eb630c0f 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268
64774b27e7d5fec862fc4c0c13ac6bf09123b6f05bb0e4b75c97f379a2b3a679 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269
)===";
64774b27e7d5fec862fc4c0c13ac6bf09123b6f05bb0e4b75c97f379a2b3a679 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269)===";
const static char* test_input_R = R"===(f759588ad57e758467295443a9bd71490abff8e9dad1b95b6bf2f5d0d78387bc 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260
5bb833deca2bdd7252a9ccd7b4ce0b6a4854515794b56c207262f7a5b9bdb566 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261
1ee6728da60fbd8d7d55b2b1ade487a3cf52a2c3ac6f520db12c27d8921f6cab 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262
6969fe2ddfb758438d48049f302fc2108a4fcc93e37669170e6db4b0b9b4c4cb 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263
7f3048b4e90d0cbe7a57c0394f37338a01fae3adfdc0e5126d863a895eb04e02 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264
1d290443a4b542af04a82f6b2494a6ee7f20f2754c58e0849032483a56e8e2ef 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265
c43cc6567436a86afbd6aa9eaa7c276e9806830334b614b2bee23cc76634f6fd 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266
87be2479c0c4e8edfdfaa5603e93f4265b3f8224c1c5946feb424819d18990a4 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267
dd9d6a6d8e47465cceac0877ef889b93e7eba979557e3935d7f86dce11b070f3 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268
75c6f2ae49a20521de97285b431e717125847fb8935ed84a61e7f8d36a2c3d8e 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269)===";
// "cn/0"
const static uint8_t test_output_v0[160] = {

View file

@ -457,10 +457,10 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
template<xmrig::Variant VARIANT, xmrig::Variant BASE>
static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i cx)
static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx)
{
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1);
VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx);
_mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx));
} else {
__m128i tmp = _mm_xor_si128(bx0, cx);
@ -543,8 +543,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
if (VARIANT == xmrig::VARIANT_WOW) {
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
if (VARIANT == xmrig::VARIANT_4) {
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
}
} else {
VARIANT2_INTEGER_MATH(0, cl, cx);
}
@ -553,7 +557,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
if (VARIANT == xmrig::VARIANT_4) {
VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx);
} else {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo);
}
}
al0 += hi;
@ -658,21 +666,46 @@ extern xmrig::CpuThread::cn_mainloop_fun cn_trtl_mainloop_ryzen_asm;
extern xmrig::CpuThread::cn_mainloop_fun cn_trtl_mainloop_bulldozer_asm;
extern xmrig::CpuThread::cn_mainloop_double_fun cn_trtl_double_mainloop_sandybridge_asm;
void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_64_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_64_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
template<xmrig::Variant VARIANT>
void cn_r_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
v4_compile_code(code, code_size, machine_code, ASM);
}
template<xmrig::Variant VARIANT>
void cn_r_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
v4_compile_code_double(code, code_size, machine_code, ASM);
}
template<>
void cn_r_compile_code<xmrig::VARIANT_WOW>(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
wow_compile_code(code, code_size, machine_code, ASM);
}
template<>
void cn_r_compile_code_double<xmrig::VARIANT_WOW>(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
wow_compile_code_double(code, code_size, machine_code, ASM);
}
template<xmrig::Algo ALGO, xmrig::Variant VARIANT, xmrig::Assembly ASM>
inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
if ((VARIANT == xmrig::VARIANT_WOW) && (height != ctx[0]->generated_code_height)) {
if (xmrig::cn_is_cryptonight_r<VARIANT>() && !ctx[0]->generated_code_data.match(VARIANT, height)) {
V4_Instruction code[256];
const int code_size = v4_random_math_init(code, height);
v4_compile_code(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), ASM);
ctx[0]->generated_code_height = height;
const int code_size = v4_random_math_init<VARIANT>(code, height);
cn_r_compile_code<VARIANT>(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), ASM);
ctx[0]->generated_code_data.variant = VARIANT;
ctx[0]->generated_code_data.height = height;
}
xmrig::keccak(input, size, ctx[0]->state);
@ -711,7 +744,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
cn_trtl_mainloop_bulldozer_asm(ctx[0]);
}
}
else if (VARIANT == xmrig::VARIANT_WOW) {
else if (xmrig::cn_is_cryptonight_r<VARIANT>()) {
ctx[0]->generated_code(ctx[0]);
}
@ -726,11 +759,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
{
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
if ((VARIANT == xmrig::VARIANT_WOW) && (height != ctx[0]->generated_code_double_height)) {
if (xmrig::cn_is_cryptonight_r<VARIANT>() && !ctx[0]->generated_code_double_data.match(VARIANT, height)) {
V4_Instruction code[256];
const int code_size = v4_random_math_init(code, height);
v4_compile_code_double(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code_double), ASM);
ctx[0]->generated_code_double_height = height;
const int code_size = v4_random_math_init<VARIANT>(code, height);
cn_r_compile_code_double<VARIANT>(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code_double), ASM);
ctx[0]->generated_code_double_data.variant = VARIANT;
ctx[0]->generated_code_double_data.height = height;
}
xmrig::keccak(input, size, ctx[0]->state);
@ -748,7 +782,7 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
else if (VARIANT == xmrig::VARIANT_TRTL) {
cn_trtl_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
}
else if (VARIANT == xmrig::VARIANT_WOW) {
else if (xmrig::cn_is_cryptonight_r<VARIANT>()) {
ctx[0]->generated_code_double(ctx[0], ctx[1]);
}
@ -847,8 +881,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
if (VARIANT == xmrig::VARIANT_WOW) {
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
if (VARIANT == xmrig::VARIANT_4) {
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
}
} else {
VARIANT2_INTEGER_MATH(0, cl, cx0);
}
@ -857,7 +895,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
lo = __umul128(idx0, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo);
if (VARIANT == xmrig::VARIANT_4) {
VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0);
} else {
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo);
}
}
al0 += hi;
@ -895,8 +937,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
ch = ((uint64_t*) &l1[idx1 & MASK])[1];
if (BASE == xmrig::VARIANT_2) {
if (VARIANT == xmrig::VARIANT_WOW) {
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
if (VARIANT == xmrig::VARIANT_4) {
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
}
} else {
VARIANT2_INTEGER_MATH(1, cl, cx1);
}
@ -905,7 +951,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
lo = __umul128(idx1, cl, &hi);
if (BASE == xmrig::VARIANT_2) {
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo);
if (VARIANT == xmrig::VARIANT_4) {
VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1);
} else {
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo);
}
}
al1 += hi;
@ -989,18 +1039,30 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
#define CN_STEP4(part, a, b0, b1, c, l, mc, ptr, idx) \
uint64_t al##part, ah##part; \
if (BASE == xmrig::VARIANT_2) { \
if (VARIANT == xmrig::VARIANT_WOW) { \
const uint64_t al = _mm_cvtsi128_si64(a); \
const uint64_t ah = _mm_cvtsi128_si64(_mm_srli_si128(a, 8)); \
VARIANT4_RANDOM_MATH(part, al, ah, cl##part, b0, b1); \
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \
al##part = _mm_cvtsi128_si64(a); \
ah##part = _mm_cvtsi128_si64(_mm_srli_si128(a, 8)); \
VARIANT4_RANDOM_MATH(part, al##part, ah##part, cl##part, b0, b1); \
if (VARIANT == xmrig::VARIANT_4) { \
al##part ^= r##part[2] | ((uint64_t)(r##part[3]) << 32); \
ah##part ^= r##part[0] | ((uint64_t)(r##part[1]) << 32); \
} \
} else { \
VARIANT2_INTEGER_MATH(part, cl##part, c); \
} \
} \
lo = __umul128(idx, cl##part, &hi); \
if (BASE == xmrig::VARIANT_2) { \
VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \
if (VARIANT == xmrig::VARIANT_4) { \
VARIANT2_SHUFFLE(l, idx & MASK, a, b0, b1, c); \
} else { \
VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \
} \
} \
if (VARIANT == xmrig::VARIANT_4) { \
a = _mm_set_epi64x(ah##part, al##part); \
} \
a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
\

View file

@ -58,7 +58,7 @@ static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int
const uint32_t a = inst.dst_index;
const uint32_t b = inst.src_index;
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (src_index << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
switch (inst.opcode) {
case ROR:
@ -99,6 +99,20 @@ static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int
}
}
void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
uint8_t* p = p0;
add_code(p, CryptonightWOW_template_part1, CryptonightWOW_template_part2);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightWOW_template_part2, CryptonightWOW_template_part3);
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_template_mainloop) - ((const uint8_t*)CryptonightWOW_template_part1)) - (p - p0));
add_code(p, CryptonightWOW_template_part3, CryptonightWOW_template_end);
Mem::flushInstructionCache(machine_code, p - p0);
}
void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
@ -113,6 +127,22 @@ void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_co
Mem::flushInstructionCache(machine_code, p - p0);
}
void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
uint8_t* p = p0;
add_code(p, CryptonightWOW_template_double_part1, CryptonightWOW_template_double_part2);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightWOW_template_double_part2, CryptonightWOW_template_double_part3);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightWOW_template_double_part3, CryptonightWOW_template_double_part4);
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_template_double_mainloop) - ((const uint8_t*)CryptonightWOW_template_double_part1)) - (p - p0));
add_code(p, CryptonightWOW_template_double_part4, CryptonightWOW_template_double_end);
Mem::flushInstructionCache(machine_code, p - p0);
}
void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);

View file

@ -529,6 +529,7 @@ PUBLIC FN_PREFIX(CryptonightR_instruction_mov254)
PUBLIC FN_PREFIX(CryptonightR_instruction_mov255)
PUBLIC FN_PREFIX(CryptonightR_instruction_mov256)
#include "CryptonightWOW_template.inc"
#include "CryptonightR_template.inc"
FN_PREFIX(CryptonightR_instruction0):
@ -538,16 +539,16 @@ FN_PREFIX(CryptonightR_instruction1):
FN_PREFIX(CryptonightR_instruction2):
imul rbx, rbx
FN_PREFIX(CryptonightR_instruction3):
add rbx, rbx
add rbx, r9
add rbx, 2147483647
FN_PREFIX(CryptonightR_instruction4):
sub rbx, rbx
sub rbx, r9
FN_PREFIX(CryptonightR_instruction5):
ror ebx, cl
FN_PREFIX(CryptonightR_instruction6):
rol ebx, cl
FN_PREFIX(CryptonightR_instruction7):
xor rbx, rbx
xor rbx, r9
FN_PREFIX(CryptonightR_instruction8):
imul rsi, rbx
FN_PREFIX(CryptonightR_instruction9):
@ -623,16 +624,16 @@ FN_PREFIX(CryptonightR_instruction41):
FN_PREFIX(CryptonightR_instruction42):
imul rsi, rsi
FN_PREFIX(CryptonightR_instruction43):
add rsi, rsi
add rsi, r9
add rsi, 2147483647
FN_PREFIX(CryptonightR_instruction44):
sub rsi, rsi
sub rsi, r9
FN_PREFIX(CryptonightR_instruction45):
ror esi, cl
FN_PREFIX(CryptonightR_instruction46):
rol esi, cl
FN_PREFIX(CryptonightR_instruction47):
xor rsi, rsi
xor rsi, r9
FN_PREFIX(CryptonightR_instruction48):
imul rdi, rsi
FN_PREFIX(CryptonightR_instruction49):
@ -708,16 +709,16 @@ FN_PREFIX(CryptonightR_instruction81):
FN_PREFIX(CryptonightR_instruction82):
imul rdi, rdi
FN_PREFIX(CryptonightR_instruction83):
add rdi, rdi
add rdi, r9
add rdi, 2147483647
FN_PREFIX(CryptonightR_instruction84):
sub rdi, rdi
sub rdi, r9
FN_PREFIX(CryptonightR_instruction85):
ror edi, cl
FN_PREFIX(CryptonightR_instruction86):
rol edi, cl
FN_PREFIX(CryptonightR_instruction87):
xor rdi, rdi
xor rdi, r9
FN_PREFIX(CryptonightR_instruction88):
imul rbp, rdi
FN_PREFIX(CryptonightR_instruction89):
@ -793,16 +794,16 @@ FN_PREFIX(CryptonightR_instruction121):
FN_PREFIX(CryptonightR_instruction122):
imul rbp, rbp
FN_PREFIX(CryptonightR_instruction123):
add rbp, rbp
add rbp, r9
add rbp, 2147483647
FN_PREFIX(CryptonightR_instruction124):
sub rbp, rbp
sub rbp, r9
FN_PREFIX(CryptonightR_instruction125):
ror ebp, cl
FN_PREFIX(CryptonightR_instruction126):
rol ebp, cl
FN_PREFIX(CryptonightR_instruction127):
xor rbp, rbp
xor rbp, r9
FN_PREFIX(CryptonightR_instruction128):
imul rbx, rsp
FN_PREFIX(CryptonightR_instruction129):

View file

@ -516,6 +516,7 @@ PUBLIC CryptonightR_instruction_mov254
PUBLIC CryptonightR_instruction_mov255
PUBLIC CryptonightR_instruction_mov256
INCLUDE CryptonightWOW_template_win.inc
INCLUDE CryptonightR_template_win.inc
CryptonightR_instruction0:
@ -525,16 +526,16 @@ CryptonightR_instruction1:
CryptonightR_instruction2:
imul rbx, rbx
CryptonightR_instruction3:
add rbx, rbx
add rbx, r9
add rbx, 2147483647
CryptonightR_instruction4:
sub rbx, rbx
sub rbx, r9
CryptonightR_instruction5:
ror ebx, cl
CryptonightR_instruction6:
rol ebx, cl
CryptonightR_instruction7:
xor rbx, rbx
xor rbx, r9
CryptonightR_instruction8:
imul rsi, rbx
CryptonightR_instruction9:
@ -610,16 +611,16 @@ CryptonightR_instruction41:
CryptonightR_instruction42:
imul rsi, rsi
CryptonightR_instruction43:
add rsi, rsi
add rsi, r9
add rsi, 2147483647
CryptonightR_instruction44:
sub rsi, rsi
sub rsi, r9
CryptonightR_instruction45:
ror esi, cl
CryptonightR_instruction46:
rol esi, cl
CryptonightR_instruction47:
xor rsi, rsi
xor rsi, r9
CryptonightR_instruction48:
imul rdi, rsi
CryptonightR_instruction49:
@ -695,16 +696,16 @@ CryptonightR_instruction81:
CryptonightR_instruction82:
imul rdi, rdi
CryptonightR_instruction83:
add rdi, rdi
add rdi, r9
add rdi, 2147483647
CryptonightR_instruction84:
sub rdi, rdi
sub rdi, r9
CryptonightR_instruction85:
ror edi, cl
CryptonightR_instruction86:
rol edi, cl
CryptonightR_instruction87:
xor rdi, rdi
xor rdi, r9
CryptonightR_instruction88:
imul rbp, rdi
CryptonightR_instruction89:
@ -780,16 +781,16 @@ CryptonightR_instruction121:
CryptonightR_instruction122:
imul rbp, rbp
CryptonightR_instruction123:
add rbp, rbp
add rbp, r9
add rbp, 2147483647
CryptonightR_instruction124:
sub rbp, rbp
sub rbp, r9
CryptonightR_instruction125:
ror ebp, cl
CryptonightR_instruction126:
rol ebp, cl
CryptonightR_instruction127:
xor rbp, rbp
xor rbp, r9
CryptonightR_instruction128:
imul rbx, rsp
CryptonightR_instruction129:

View file

@ -2,6 +2,18 @@
extern "C"
{
void CryptonightWOW_template_part1();
void CryptonightWOW_template_mainloop();
void CryptonightWOW_template_part2();
void CryptonightWOW_template_part3();
void CryptonightWOW_template_end();
void CryptonightWOW_template_double_part1();
void CryptonightWOW_template_double_mainloop();
void CryptonightWOW_template_double_part2();
void CryptonightWOW_template_double_part3();
void CryptonightWOW_template_double_part4();
void CryptonightWOW_template_double_end();
void CryptonightR_template_part1();
void CryptonightR_template_mainloop();
void CryptonightR_template_part2();
@ -13,6 +25,7 @@ extern "C"
void CryptonightR_template_double_part3();
void CryptonightR_template_double_part4();
void CryptonightR_template_double_end();
void CryptonightR_instruction0();
void CryptonightR_instruction1();
void CryptonightR_instruction2();

View file

@ -10,6 +10,7 @@ PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightR_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
@ -68,8 +69,6 @@ FN_PREFIX(CryptonightR_template_mainloop):
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
@ -77,16 +76,23 @@ FN_PREFIX(CryptonightR_template_mainloop):
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -101,13 +107,23 @@ FN_PREFIX(CryptonightR_template_mainloop):
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
FN_PREFIX(CryptonightR_template_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
@ -115,16 +131,18 @@ FN_PREFIX(CryptonightR_template_part2):
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
@ -247,18 +265,21 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movq rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
@ -274,15 +295,18 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
@ -303,7 +327,8 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+112], rcx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
@ -320,9 +345,22 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightR_template_double_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
@ -334,28 +372,27 @@ FN_PREFIX(CryptonightR_template_double_part2):
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+112]
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
@ -383,6 +420,7 @@ FN_PREFIX(CryptonightR_template_double_part2):
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
@ -401,9 +439,24 @@ FN_PREFIX(CryptonightR_template_double_part2):
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightR_template_double_part3):
movq r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
@ -414,23 +467,20 @@ FN_PREFIX(CryptonightR_template_double_part3):
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
@ -438,6 +488,7 @@ FN_PREFIX(CryptonightR_template_double_part3):
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0

View file

@ -10,6 +10,7 @@ PUBLIC CryptonightR_template_double_part3
PUBLIC CryptonightR_template_double_part4
PUBLIC CryptonightR_template_double_end
ALIGN(64)
CryptonightR_template_part1:
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
@ -68,8 +69,6 @@ CryptonightR_template_mainloop:
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
@ -77,16 +76,23 @@ CryptonightR_template_mainloop:
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -101,13 +107,23 @@ CryptonightR_template_mainloop:
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
CryptonightR_template_part2:
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
@ -115,16 +131,18 @@ CryptonightR_template_part2:
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
@ -247,18 +265,21 @@ CryptonightR_template_double_mainloop:
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movq rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
@ -274,15 +295,18 @@ CryptonightR_template_double_mainloop:
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
@ -303,7 +327,8 @@ CryptonightR_template_double_mainloop:
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+112], rcx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
@ -320,9 +345,22 @@ CryptonightR_template_double_mainloop:
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightR_template_double_part2:
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
@ -334,28 +372,27 @@ CryptonightR_template_double_part2:
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+112]
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
@ -383,6 +420,7 @@ CryptonightR_template_double_part2:
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
@ -401,9 +439,24 @@ CryptonightR_template_double_part2:
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightR_template_double_part3:
movq r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
@ -414,23 +467,20 @@ CryptonightR_template_double_part3:
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
@ -438,6 +488,7 @@ CryptonightR_template_double_part3:
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0

View file

@ -0,0 +1,486 @@
PUBLIC FN_PREFIX(CryptonightWOW_template_part1)
PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_template_part2)
PUBLIC FN_PREFIX(CryptonightWOW_template_part3)
PUBLIC FN_PREFIX(CryptonightWOW_template_end)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
FN_PREFIX(CryptonightWOW_template_part2):
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightWOW_template_mainloop)
FN_PREFIX(CryptonightWOW_template_part3):
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightWOW_template_end):
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_double_part1):
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightWOW_template_double_part2):
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightWOW_template_double_part3):
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightWOW_template_double_mainloop)
FN_PREFIX(CryptonightWOW_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightWOW_template_double_end):

View file

@ -0,0 +1,486 @@
PUBLIC CryptonightWOW_template_part1
PUBLIC CryptonightWOW_template_mainloop
PUBLIC CryptonightWOW_template_part2
PUBLIC CryptonightWOW_template_part3
PUBLIC CryptonightWOW_template_end
PUBLIC CryptonightWOW_template_double_part1
PUBLIC CryptonightWOW_template_double_mainloop
PUBLIC CryptonightWOW_template_double_part2
PUBLIC CryptonightWOW_template_double_part3
PUBLIC CryptonightWOW_template_double_part4
PUBLIC CryptonightWOW_template_double_end
ALIGN(64)
CryptonightWOW_template_part1:
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightWOW_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
CryptonightWOW_template_part2:
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightWOW_template_mainloop
CryptonightWOW_template_part3:
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightWOW_template_end:
ALIGN(64)
CryptonightWOW_template_double_part1:
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightWOW_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightWOW_template_double_part2:
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightWOW_template_double_part3:
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightWOW_template_double_mainloop
CryptonightWOW_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightWOW_template_double_end:

View file

@ -538,16 +538,16 @@ FN_PREFIX(CryptonightR_instruction1):
FN_PREFIX(CryptonightR_instruction2):
imul rbx, rbx
FN_PREFIX(CryptonightR_instruction3):
add rbx, rbx
add rbx, r9
add rbx, 2147483647
FN_PREFIX(CryptonightR_instruction4):
sub rbx, rbx
sub rbx, r9
FN_PREFIX(CryptonightR_instruction5):
ror ebx, cl
FN_PREFIX(CryptonightR_instruction6):
rol ebx, cl
FN_PREFIX(CryptonightR_instruction7):
xor rbx, rbx
xor rbx, r9
FN_PREFIX(CryptonightR_instruction8):
imul rsi, rbx
FN_PREFIX(CryptonightR_instruction9):
@ -623,16 +623,16 @@ FN_PREFIX(CryptonightR_instruction41):
FN_PREFIX(CryptonightR_instruction42):
imul rsi, rsi
FN_PREFIX(CryptonightR_instruction43):
add rsi, rsi
add rsi, r9
add rsi, 2147483647
FN_PREFIX(CryptonightR_instruction44):
sub rsi, rsi
sub rsi, r9
FN_PREFIX(CryptonightR_instruction45):
ror esi, cl
FN_PREFIX(CryptonightR_instruction46):
rol esi, cl
FN_PREFIX(CryptonightR_instruction47):
xor rsi, rsi
xor rsi, r9
FN_PREFIX(CryptonightR_instruction48):
imul rdi, rsi
FN_PREFIX(CryptonightR_instruction49):
@ -708,16 +708,16 @@ FN_PREFIX(CryptonightR_instruction81):
FN_PREFIX(CryptonightR_instruction82):
imul rdi, rdi
FN_PREFIX(CryptonightR_instruction83):
add rdi, rdi
add rdi, r9
add rdi, 2147483647
FN_PREFIX(CryptonightR_instruction84):
sub rdi, rdi
sub rdi, r9
FN_PREFIX(CryptonightR_instruction85):
ror edi, cl
FN_PREFIX(CryptonightR_instruction86):
rol edi, cl
FN_PREFIX(CryptonightR_instruction87):
xor rdi, rdi
xor rdi, r9
FN_PREFIX(CryptonightR_instruction88):
imul rbp, rdi
FN_PREFIX(CryptonightR_instruction89):
@ -793,16 +793,16 @@ FN_PREFIX(CryptonightR_instruction121):
FN_PREFIX(CryptonightR_instruction122):
imul rbp, rbp
FN_PREFIX(CryptonightR_instruction123):
add rbp, rbp
add rbp, r9
add rbp, 2147483647
FN_PREFIX(CryptonightR_instruction124):
sub rbp, rbp
sub rbp, r9
FN_PREFIX(CryptonightR_instruction125):
ror ebp, cl
FN_PREFIX(CryptonightR_instruction126):
rol ebp, cl
FN_PREFIX(CryptonightR_instruction127):
xor rbp, rbp
xor rbp, r9
FN_PREFIX(CryptonightR_instruction128):
imul rbx, rsp
FN_PREFIX(CryptonightR_instruction129):

View file

@ -525,16 +525,16 @@ CryptonightR_instruction1:
CryptonightR_instruction2:
imul rbx, rbx
CryptonightR_instruction3:
add rbx, rbx
add rbx, r9
add rbx, 2147483647
CryptonightR_instruction4:
sub rbx, rbx
sub rbx, r9
CryptonightR_instruction5:
ror ebx, cl
CryptonightR_instruction6:
rol ebx, cl
CryptonightR_instruction7:
xor rbx, rbx
xor rbx, r9
CryptonightR_instruction8:
imul rsi, rbx
CryptonightR_instruction9:
@ -610,16 +610,16 @@ CryptonightR_instruction41:
CryptonightR_instruction42:
imul rsi, rsi
CryptonightR_instruction43:
add rsi, rsi
add rsi, r9
add rsi, 2147483647
CryptonightR_instruction44:
sub rsi, rsi
sub rsi, r9
CryptonightR_instruction45:
ror esi, cl
CryptonightR_instruction46:
rol esi, cl
CryptonightR_instruction47:
xor rsi, rsi
xor rsi, r9
CryptonightR_instruction48:
imul rdi, rsi
CryptonightR_instruction49:
@ -695,16 +695,16 @@ CryptonightR_instruction81:
CryptonightR_instruction82:
imul rdi, rdi
CryptonightR_instruction83:
add rdi, rdi
add rdi, r9
add rdi, 2147483647
CryptonightR_instruction84:
sub rdi, rdi
sub rdi, r9
CryptonightR_instruction85:
ror edi, cl
CryptonightR_instruction86:
rol edi, cl
CryptonightR_instruction87:
xor rdi, rdi
xor rdi, r9
CryptonightR_instruction88:
imul rbp, rdi
CryptonightR_instruction89:
@ -780,16 +780,16 @@ CryptonightR_instruction121:
CryptonightR_instruction122:
imul rbp, rbp
CryptonightR_instruction123:
add rbp, rbp
add rbp, r9
add rbp, 2147483647
CryptonightR_instruction124:
sub rbp, rbp
sub rbp, r9
CryptonightR_instruction125:
ror ebp, cl
CryptonightR_instruction126:
rol ebp, cl
CryptonightR_instruction127:
xor rbp, rbp
xor rbp, r9
CryptonightR_instruction128:
imul rbx, rsp
CryptonightR_instruction129:

View file

@ -10,6 +10,7 @@ PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightR_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
@ -68,8 +69,6 @@ FN_PREFIX(CryptonightR_template_mainloop):
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
@ -77,16 +76,23 @@ FN_PREFIX(CryptonightR_template_mainloop):
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movd r12, xmm5
movd r10d, xmm5
and r10d, 2097136
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -101,13 +107,23 @@ FN_PREFIX(CryptonightR_template_mainloop):
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
FN_PREFIX(CryptonightR_template_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
@ -115,16 +131,18 @@ FN_PREFIX(CryptonightR_template_part2):
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
@ -247,18 +265,21 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movd rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
@ -274,15 +295,18 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
@ -303,7 +327,8 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+112], rcx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
@ -320,9 +345,22 @@ FN_PREFIX(CryptonightR_template_double_mainloop):
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightR_template_double_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
@ -334,28 +372,27 @@ FN_PREFIX(CryptonightR_template_double_part2):
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+112]
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
@ -383,6 +420,7 @@ FN_PREFIX(CryptonightR_template_double_part2):
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
@ -401,9 +439,24 @@ FN_PREFIX(CryptonightR_template_double_part2):
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightR_template_double_part3):
movd r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
@ -414,23 +467,20 @@ FN_PREFIX(CryptonightR_template_double_part3):
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
@ -438,6 +488,7 @@ FN_PREFIX(CryptonightR_template_double_part3):
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0

View file

@ -10,6 +10,7 @@ PUBLIC CryptonightR_template_double_part3
PUBLIC CryptonightR_template_double_part4
PUBLIC CryptonightR_template_double_end
ALIGN(64)
CryptonightR_template_part1:
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
@ -68,8 +69,6 @@ CryptonightR_template_mainloop:
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
@ -77,16 +76,23 @@ CryptonightR_template_mainloop:
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movd r12, xmm5
movd r10d, xmm5
and r10d, 2097136
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
@ -101,13 +107,23 @@ CryptonightR_template_mainloop:
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
CryptonightR_template_part2:
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
@ -115,16 +131,18 @@ CryptonightR_template_part2:
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
@ -247,18 +265,21 @@ CryptonightR_template_double_mainloop:
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movd rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
@ -274,15 +295,18 @@ CryptonightR_template_double_mainloop:
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
@ -303,7 +327,8 @@ CryptonightR_template_double_mainloop:
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+112], rcx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
@ -320,9 +345,22 @@ CryptonightR_template_double_mainloop:
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightR_template_double_part2:
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
@ -334,28 +372,27 @@ CryptonightR_template_double_part2:
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+112]
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
@ -383,6 +420,7 @@ CryptonightR_template_double_part2:
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
@ -401,9 +439,24 @@ CryptonightR_template_double_part2:
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightR_template_double_part3:
movd r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
@ -414,23 +467,20 @@ CryptonightR_template_double_part3:
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
@ -438,6 +488,7 @@ CryptonightR_template_double_part3:
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0

View file

@ -0,0 +1,486 @@
PUBLIC FN_PREFIX(CryptonightWOW_template_part1)
PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_template_part2)
PUBLIC FN_PREFIX(CryptonightWOW_template_part3)
PUBLIC FN_PREFIX(CryptonightWOW_template_end)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movd xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movd xmm0, r12
movd xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movd xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movd xmm0, r15
movd xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
FN_PREFIX(CryptonightWOW_template_part2):
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightWOW_template_mainloop)
FN_PREFIX(CryptonightWOW_template_part3):
movd rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightWOW_template_end):
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_double_part1):
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movd xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movd xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movd xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movd xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movd xmm0, rcx
mov r11d, 524288
movd xmm10, rax
punpcklqdq xmm10, xmm0
movd xmm14, QWORD PTR [rsp+128]
movd xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movd xmm0, r12
mov ecx, ebx
movd xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movd xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movd rdi, xmm5
movd rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movd xmm0, rsp
movd xmm1, rsi
movd xmm2, rdi
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightWOW_template_double_part2):
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movd rsi, xmm1
movd rdi, xmm2
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movd rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movd xmm0, rsp
movd xmm1, rbx
movd xmm2, rsi
movd xmm11, rdi
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightWOW_template_double_part3):
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movd rbx, xmm1
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movd rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightWOW_template_double_mainloop)
FN_PREFIX(CryptonightWOW_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightWOW_template_double_end):

View file

@ -0,0 +1,486 @@
PUBLIC CryptonightWOW_template_part1
PUBLIC CryptonightWOW_template_mainloop
PUBLIC CryptonightWOW_template_part2
PUBLIC CryptonightWOW_template_part3
PUBLIC CryptonightWOW_template_end
PUBLIC CryptonightWOW_template_double_part1
PUBLIC CryptonightWOW_template_double_mainloop
PUBLIC CryptonightWOW_template_double_part2
PUBLIC CryptonightWOW_template_double_part3
PUBLIC CryptonightWOW_template_double_part4
PUBLIC CryptonightWOW_template_double_end
ALIGN(64)
CryptonightWOW_template_part1:
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movd xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movd xmm0, r12
movd xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movd xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightWOW_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movd xmm0, r15
movd xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
CryptonightWOW_template_part2:
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightWOW_template_mainloop
CryptonightWOW_template_part3:
movd rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightWOW_template_end:
ALIGN(64)
CryptonightWOW_template_double_part1:
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movd xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movd xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movd xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movd xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movd xmm0, rcx
mov r11d, 524288
movd xmm10, rax
punpcklqdq xmm10, xmm0
movd xmm14, QWORD PTR [rsp+128]
movd xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightWOW_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movd xmm0, r12
mov ecx, ebx
movd xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movd xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movd rdi, xmm5
movd rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movd xmm0, rsp
movd xmm1, rsi
movd xmm2, rdi
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightWOW_template_double_part2:
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movd rsi, xmm1
movd rdi, xmm2
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movd rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movd xmm0, rsp
movd xmm1, rbx
movd xmm2, rsi
movd xmm11, rdi
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightWOW_template_double_part3:
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movd rbx, xmm1
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movd rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightWOW_template_double_mainloop
CryptonightWOW_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightWOW_template_double_end:

View file

@ -12,8 +12,11 @@ enum V4_Settings
TOTAL_LATENCY = 15 * 3,
// Always generate at least 60 instructions
NUM_INSTRUCTIONS = 60,
NUM_INSTRUCTIONS_MIN = 60,
// Never generate more than 70 instructions (final RET instruction doesn't count here)
NUM_INSTRUCTIONS_MAX = 70,
// Available ALUs for MUL
// Modern CPUs typically have only 1 ALU which can do multiplications
ALU_COUNT_MUL = 1,
@ -38,10 +41,9 @@ enum V4_InstructionList
// V4_InstructionDefinition is used to generate code from random data
// Every random sequence of bytes is a valid code
//
// There are 8 registers in total:
// There are 9 registers in total:
// - 4 variable registers
// - 4 constant registers initialized from loop variables
//
// - 5 constant registers initialized from loop variables
// This is why dst_index is 2 bits
enum V4_InstructionDefinition
{
@ -144,16 +146,16 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
//
// 60 28495
// 61 106077
// 62 2455855
// 63 5114930
// 64 1020868
// 65 1109026
// 66 151756
// 67 8429
// 68 4477
// 69 87
// 60 27960
// 61 105054
// 62 2452759
// 63 5115997
// 64 1022269
// 65 1109635
// 66 153145
// 67 8550
// 68 4529
// 69 102
// Unroll 70 instructions here
V4_EXEC_10(0); // instructions 0-9
@ -179,6 +181,8 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed
}
// Generates as many random math operations as possible with given latency and ALU restrictions
// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
template<xmrig::Variant VARIANT>
static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
{
// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
@ -200,6 +204,10 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
memset(data, 0, sizeof(data));
uint64_t tmp = SWAP64LE(height);
memcpy(data, &tmp, sizeof(uint64_t));
if (VARIANT == xmrig::VARIANT_4)
{
data[20] = -38;
}
// Set data_index past the last byte in data
// to trigger full data update with blake hash
@ -207,18 +215,22 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
size_t data_index = sizeof(data);
int code_size;
// There is a small chance (1.8%) that register R8 won't be used in the generated program
// So we keep track of it and try again if it's not used
bool r8_used;
do {
int latency[8];
int asic_latency[8];
int latency[9];
int asic_latency[9];
// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
// byte 0: current value of the destination register
// byte 1: instruction opcode
// byte 2: current value of the source register
//
// Registers R4-R7 are constant and are treated as having the same value because when we do
// Registers R4-R8 are constant and are treated as having the same value because when we do
// the same operation twice with two constant source registers, it can be optimized into a single operation
uint32_t inst_data[8] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
bool is_rotation[V4_INSTRUCTION_COUNT];
@ -237,6 +249,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
code_size = 0;
int total_iterations = 0;
r8_used = (VARIANT == xmrig::VARIANT_WOW);
// Generate random code to achieve minimal required latency for our abstract CPU
// Try to get this latency for all 4 registers
@ -281,7 +294,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
{
// a is always < 4, so we don't need to check bounds here
b = a + 4;
b = (VARIANT == xmrig::VARIANT_WOW) ? (a + 4) : 8;
src_index = b;
}
@ -362,6 +375,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
code[code_size].src_index = src_index;
code[code_size].C = 0;
if (src_index == 8)
{
r8_used = true;
}
if (opcode == ADD)
{
// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
@ -376,7 +394,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
}
++code_size;
if (code_size >= NUM_INSTRUCTIONS)
if (code_size >= NUM_INSTRUCTIONS_MIN)
{
break;
}
@ -391,7 +409,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
// Get this latency for at least 1 of the 4 registers
const int prev_code_size = code_size;
while ((asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
{
int min_idx = 0;
int max_idx = 0;
@ -413,9 +431,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
++code_size;
}
// There is ~99.8% chance that code_size >= NUM_INSTRUCTIONS here, so second iteration is required rarely
} while (code_size < NUM_INSTRUCTIONS);
// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
// It never does more than 4 iterations for all block heights < 10,000,000
} while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
// Add final instruction to stop the interpreter
code[code_size].opcode = RET;
code[code_size].dst_index = 0;

View file

@ -174,6 +174,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
add_asm_func<CRYPTONIGHT, VARIANT_2>(asm_func_map);
add_asm_func<CRYPTONIGHT, VARIANT_HALF>(asm_func_map);
add_asm_func<CRYPTONIGHT, VARIANT_WOW>(asm_func_map);
add_asm_func<CRYPTONIGHT, VARIANT_4>(asm_func_map);
add_asm_func<CRYPTONIGHT_PICO, VARIANT_HALF>(asm_func_map);
@ -309,6 +310,17 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
cryptonight_quad_hash<CRYPTONIGHT, true, VARIANT_WOW>,
cryptonight_penta_hash<CRYPTONIGHT, true, VARIANT_WOW>,
cryptonight_single_hash<CRYPTONIGHT, false, VARIANT_4>,
cryptonight_double_hash<CRYPTONIGHT, false, VARIANT_4>,
cryptonight_single_hash<CRYPTONIGHT, true, VARIANT_4>,
cryptonight_double_hash<CRYPTONIGHT, true, VARIANT_4>,
cryptonight_triple_hash<CRYPTONIGHT, false, VARIANT_4>,
cryptonight_quad_hash<CRYPTONIGHT, false, VARIANT_4>,
cryptonight_penta_hash<CRYPTONIGHT, false, VARIANT_4>,
cryptonight_triple_hash<CRYPTONIGHT, true, VARIANT_4>,
cryptonight_quad_hash<CRYPTONIGHT, true, VARIANT_4>,
cryptonight_penta_hash<CRYPTONIGHT, true, VARIANT_4>,
# ifndef XMRIG_NO_AEON
cryptonight_single_hash<CRYPTONIGHT_LITE, false, VARIANT_0>,
cryptonight_double_hash<CRYPTONIGHT_LITE, false, VARIANT_0>,
@ -343,6 +355,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4
# else
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -357,6 +370,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4
# endif
# ifndef XMRIG_NO_SUMO
@ -405,6 +419,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4
# else
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -419,6 +434,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4
# endif
# ifndef XMRIG_NO_CN_PICO
@ -446,6 +462,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4
# else
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1
@ -460,6 +477,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4
# endif
};

View file

@ -60,6 +60,10 @@ bool MultiWorker<N>::selfTest()
LOG_WARN("CryptonightR (Wownero) self-test failed");
return false;
}
if (!verify2(VARIANT_4, test_input_R)) {
LOG_WARN("CryptonightR self-test failed");
return false;
}
const bool rc = verify(VARIANT_0, test_output_v0) &&
verify(VARIANT_1, test_output_v1) &&