Optimized soft AES implementations

cn-pico: +6.7%
cn/half: +6.2%
cn/2: +4.3%
cn-heavy: +9.1%
cn/wow, cn/r: 2.4-2.6 times faster
This commit is contained in:
SChernykh 2019-02-24 20:04:09 +01:00
parent a5dcd6dd1f
commit 488cec09dd
18 changed files with 2380 additions and 1090 deletions

View file

@ -49,6 +49,10 @@ struct cryptonight_r_data {
struct cryptonight_ctx {
alignas(16) uint8_t state[224];
alignas(16) uint8_t *memory;
uint8_t unused[40];
const uint32_t* saes_table;
cn_mainloop_fun_ms_abi generated_code;
cn_mainloop_double_fun_ms_abi generated_code_double;
cryptonight_r_data generated_code_data;

View file

@ -158,6 +158,16 @@
#define SWAP64LE(x) x
#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
#ifndef NOINLINE
#ifdef __GNUC__
#define NOINLINE __attribute__ ((noinline))
#elif _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif
#endif
#include "common/xmrig.h"
#include "variant4_random_math.h"

View file

@ -192,20 +192,93 @@ static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, _
}
template<bool SOFT_AES>
static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
static FORCEINLINE void soft_aesenc(void* __restrict ptr, const void* __restrict key, const uint32_t* __restrict t)
{
if (SOFT_AES) {
*x0 = soft_aesenc((uint32_t*)x0, key);
*x1 = soft_aesenc((uint32_t*)x1, key);
*x2 = soft_aesenc((uint32_t*)x2, key);
*x3 = soft_aesenc((uint32_t*)x3, key);
*x4 = soft_aesenc((uint32_t*)x4, key);
*x5 = soft_aesenc((uint32_t*)x5, key);
*x6 = soft_aesenc((uint32_t*)x6, key);
*x7 = soft_aesenc((uint32_t*)x7, key);
uint32_t x0 = ((const uint32_t*)(ptr))[0];
uint32_t x1 = ((const uint32_t*)(ptr))[1];
uint32_t x2 = ((const uint32_t*)(ptr))[2];
uint32_t x3 = ((const uint32_t*)(ptr))[3];
uint32_t y0 = t[x0 & 0xff]; x0 >>= 8;
uint32_t y1 = t[x1 & 0xff]; x1 >>= 8;
uint32_t y2 = t[x2 & 0xff]; x2 >>= 8;
uint32_t y3 = t[x3 & 0xff]; x3 >>= 8;
t += 256;
y0 ^= t[x1 & 0xff]; x1 >>= 8;
y1 ^= t[x2 & 0xff]; x2 >>= 8;
y2 ^= t[x3 & 0xff]; x3 >>= 8;
y3 ^= t[x0 & 0xff]; x0 >>= 8;
t += 256;
y0 ^= t[x2 & 0xff]; x2 >>= 8;
y1 ^= t[x3 & 0xff]; x3 >>= 8;
y2 ^= t[x0 & 0xff]; x0 >>= 8;
y3 ^= t[x1 & 0xff]; x1 >>= 8;
t += 256;
y0 ^= t[x3];
y1 ^= t[x0];
y2 ^= t[x1];
y3 ^= t[x2];
((uint32_t*)ptr)[0] = y0 ^ ((uint32_t*)key)[0];
((uint32_t*)ptr)[1] = y1 ^ ((uint32_t*)key)[1];
((uint32_t*)ptr)[2] = y2 ^ ((uint32_t*)key)[2];
((uint32_t*)ptr)[3] = y3 ^ ((uint32_t*)key)[3];
}
else {
static FORCEINLINE __m128i soft_aesenc(const void* __restrict ptr, const __m128i key, const uint32_t* __restrict t)
{
uint32_t x0 = ((const uint32_t*)(ptr))[0];
uint32_t x1 = ((const uint32_t*)(ptr))[1];
uint32_t x2 = ((const uint32_t*)(ptr))[2];
uint32_t x3 = ((const uint32_t*)(ptr))[3];
uint32_t y0 = t[x0 & 0xff]; x0 >>= 8;
uint32_t y1 = t[x1 & 0xff]; x1 >>= 8;
uint32_t y2 = t[x2 & 0xff]; x2 >>= 8;
uint32_t y3 = t[x3 & 0xff]; x3 >>= 8;
t += 256;
y0 ^= t[x1 & 0xff]; x1 >>= 8;
y1 ^= t[x2 & 0xff]; x2 >>= 8;
y2 ^= t[x3 & 0xff]; x3 >>= 8;
y3 ^= t[x0 & 0xff]; x0 >>= 8;
t += 256;
y0 ^= t[x2 & 0xff]; x2 >>= 8;
y1 ^= t[x3 & 0xff]; x3 >>= 8;
y2 ^= t[x0 & 0xff]; x0 >>= 8;
y3 ^= t[x1 & 0xff]; x1 >>= 8;
y0 ^= t[x3 + 256];
y1 ^= t[x0 + 256];
y2 ^= t[x1 + 256];
y3 ^= t[x2 + 256];
return _mm_xor_si128(_mm_set_epi32(y3, y2, y1, y0), key);
}
template<bool SOFT_AES>
void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7);
template<>
static NOINLINE void aes_round<true>(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{
*x0 = soft_aesenc((uint32_t*)x0, key, (const uint32_t*)saes_table);
*x1 = soft_aesenc((uint32_t*)x1, key, (const uint32_t*)saes_table);
*x2 = soft_aesenc((uint32_t*)x2, key, (const uint32_t*)saes_table);
*x3 = soft_aesenc((uint32_t*)x3, key, (const uint32_t*)saes_table);
*x4 = soft_aesenc((uint32_t*)x4, key, (const uint32_t*)saes_table);
*x5 = soft_aesenc((uint32_t*)x5, key, (const uint32_t*)saes_table);
*x6 = soft_aesenc((uint32_t*)x6, key, (const uint32_t*)saes_table);
*x7 = soft_aesenc((uint32_t*)x7, key, (const uint32_t*)saes_table);
}
template<>
static FORCEINLINE void aes_round<false>(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{
*x0 = _mm_aesenc_si128(*x0, key);
*x1 = _mm_aesenc_si128(*x1, key);
*x2 = _mm_aesenc_si128(*x2, key);
@ -215,8 +288,6 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2,
*x6 = _mm_aesenc_si128(*x6, key);
*x7 = _mm_aesenc_si128(*x7, key);
}
}
inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
{
@ -478,6 +549,8 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l,
}
}
void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
@ -498,9 +571,31 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
#ifndef XMRIG_NO_ASM
if (SOFT_AES && xmrig::cn_is_cryptonight_r<VARIANT>())
{
if (!ctx[0]->generated_code_data.match(VARIANT, height)) {
V4_Instruction code[256];
const int code_size = v4_random_math_init<VARIANT>(code, height);
if (VARIANT == xmrig::VARIANT_WOW)
wow_soft_aes_compile_code(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), xmrig::ASM_NONE);
else if (VARIANT == xmrig::VARIANT_4)
v4_soft_aes_compile_code(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), xmrig::ASM_NONE);
ctx[0]->generated_code_data.variant = VARIANT;
ctx[0]->generated_code_data.height = height;
}
ctx[0]->saes_table = (const uint32_t*)saes_table;
ctx[0]->generated_code(ctx[0]);
} else {
#endif
const uint8_t* l0 = ctx[0]->memory;
VARIANT1_INIT(0);
VARIANT2_INIT(0);
VARIANT2_SET_ROUNDING_MODE();
@ -524,7 +619,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
cx = aes_round_tweak_div(cx, ax0);
}
else if (SOFT_AES) {
cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0, (const uint32_t*)saes_table);
}
else {
cx = _mm_aesenc_si128(cx, ax0);
@ -602,6 +697,10 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
bx0 = cx;
}
#ifndef XMRIG_NO_ASM
}
#endif
cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
xmrig::keccakf(h0, 24);
@ -857,8 +956,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
cx1 = aes_round_tweak_div(cx1, ax1);
}
else if (SOFT_AES) {
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1);
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0, (const uint32_t*)saes_table);
cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1, (const uint32_t*)saes_table);
}
else {
cx0 = _mm_aesenc_si128(cx0, ax0);
@ -1019,7 +1118,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
c = aes_round_tweak_div(c, a); \
} \
else if (SOFT_AES) { \
c = soft_aesenc(c, a); \
c = soft_aesenc(&c, a, (const uint32_t*)saes_table); \
} else { \
c = _mm_aesenc_si128(c, a); \
} \

View file

@ -159,4 +159,32 @@ void v4_compile_code_double(const V4_Instruction* code, int code_size, void* mac
Mem::flushInstructionCache(machine_code, p - p0);
}
void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
uint8_t* p = p0;
add_code(p, CryptonightWOW_soft_aes_template_part1, CryptonightWOW_soft_aes_template_part2);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightWOW_soft_aes_template_part2, CryptonightWOW_soft_aes_template_part3);
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightWOW_soft_aes_template_part1)) - (p - p0));
add_code(p, CryptonightWOW_soft_aes_template_part3, CryptonightWOW_soft_aes_template_end);
Mem::flushInstructionCache(machine_code, p - p0);
}
void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
{
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
uint8_t* p = p0;
add_code(p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0));
add_code(p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
Mem::flushInstructionCache(machine_code, p - p0);
}
#endif

View file

@ -0,0 +1,279 @@
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_part1):
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
FN_PREFIX(CryptonightR_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
FN_PREFIX(CryptonightR_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightR_soft_aes_template_end):

View file

@ -0,0 +1,279 @@
PUBLIC CryptonightR_soft_aes_template_part1
PUBLIC CryptonightR_soft_aes_template_mainloop
PUBLIC CryptonightR_soft_aes_template_part2
PUBLIC CryptonightR_soft_aes_template_part3
PUBLIC CryptonightR_soft_aes_template_end
ALIGN(64)
CryptonightR_soft_aes_template_part1:
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightR_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
CryptonightR_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightR_soft_aes_template_mainloop
CryptonightR_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightR_soft_aes_template_end:

View file

@ -531,6 +531,8 @@ PUBLIC FN_PREFIX(CryptonightR_instruction_mov256)
#include "CryptonightWOW_template.inc"
#include "CryptonightR_template.inc"
#include "CryptonightWOW_soft_aes_template.inc"
#include "CryptonightR_soft_aes_template.inc"
FN_PREFIX(CryptonightR_instruction0):
imul rbx, rbx

View file

@ -518,6 +518,8 @@ PUBLIC CryptonightR_instruction_mov256
INCLUDE CryptonightWOW_template_win.inc
INCLUDE CryptonightR_template_win.inc
INCLUDE CryptonightWOW_soft_aes_template_win.inc
INCLUDE CryptonightR_soft_aes_template_win.inc
CryptonightR_instruction0:
imul rbx, rbx

View file

@ -26,6 +26,30 @@ extern "C"
void CryptonightR_template_double_part4();
void CryptonightR_template_double_end();
void CryptonightWOW_soft_aes_template_part1();
void CryptonightWOW_soft_aes_template_mainloop();
void CryptonightWOW_soft_aes_template_part2();
void CryptonightWOW_soft_aes_template_part3();
void CryptonightWOW_soft_aes_template_end();
void CryptonightWOW_soft_aes_template_double_part1();
void CryptonightWOW_soft_aes_template_double_mainloop();
void CryptonightWOW_soft_aes_template_double_part2();
void CryptonightWOW_soft_aes_template_double_part3();
void CryptonightWOW_soft_aes_template_double_part4();
void CryptonightWOW_soft_aes_template_double_end();
void CryptonightR_soft_aes_template_part1();
void CryptonightR_soft_aes_template_mainloop();
void CryptonightR_soft_aes_template_part2();
void CryptonightR_soft_aes_template_part3();
void CryptonightR_soft_aes_template_end();
void CryptonightR_soft_aes_template_double_part1();
void CryptonightR_soft_aes_template_double_mainloop();
void CryptonightR_soft_aes_template_double_part2();
void CryptonightR_soft_aes_template_double_part3();
void CryptonightR_soft_aes_template_double_part4();
void CryptonightR_soft_aes_template_double_end();
void CryptonightR_instruction0();
void CryptonightR_instruction1();
void CryptonightR_instruction2();

View file

@ -0,0 +1,266 @@
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightWOW_soft_aes_template_part1):
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
FN_PREFIX(CryptonightWOW_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
FN_PREFIX(CryptonightWOW_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightWOW_soft_aes_template_end):

View file

@ -0,0 +1,266 @@
PUBLIC CryptonightWOW_soft_aes_template_part1
PUBLIC CryptonightWOW_soft_aes_template_mainloop
PUBLIC CryptonightWOW_soft_aes_template_part2
PUBLIC CryptonightWOW_soft_aes_template_part3
PUBLIC CryptonightWOW_soft_aes_template_end
ALIGN(64)
CryptonightWOW_soft_aes_template_part1:
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightWOW_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
CryptonightWOW_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightWOW_soft_aes_template_mainloop
CryptonightWOW_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightWOW_soft_aes_template_end:

View file

@ -0,0 +1,279 @@
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_part1):
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movd xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movd xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movd xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movd xmm10, QWORD PTR [r10+96]
movd xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movd xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movd xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movd xmm0, r9
mov r10d, DWORD PTR [r13+4]
movd xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movd r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movd rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movd rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
FN_PREFIX(CryptonightR_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
FN_PREFIX(CryptonightR_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightR_soft_aes_template_end):

View file

@ -0,0 +1,279 @@
PUBLIC CryptonightR_soft_aes_template_part1
PUBLIC CryptonightR_soft_aes_template_mainloop
PUBLIC CryptonightR_soft_aes_template_part2
PUBLIC CryptonightR_soft_aes_template_part3
PUBLIC CryptonightR_soft_aes_template_end
ALIGN(64)
CryptonightR_soft_aes_template_part1:
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movd xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movd xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movd xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movd xmm10, QWORD PTR [r10+96]
movd xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movd xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movd xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightR_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movd xmm0, r9
mov r10d, DWORD PTR [r13+4]
movd xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movd r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movd rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movd rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
CryptonightR_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightR_soft_aes_template_mainloop
CryptonightR_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightR_soft_aes_template_end:

View file

@ -531,6 +531,8 @@ PUBLIC FN_PREFIX(CryptonightR_instruction_mov256)
#include "CryptonightWOW_template.inc"
#include "CryptonightR_template.inc"
#include "CryptonightWOW_soft_aes_template.inc"
#include "CryptonightR_soft_aes_template.inc"
FN_PREFIX(CryptonightR_instruction0):
imul rbx, rbx

View file

@ -518,6 +518,8 @@ PUBLIC CryptonightR_instruction_mov256
INCLUDE CryptonightWOW_template_win.inc
INCLUDE CryptonightR_template_win.inc
INCLUDE CryptonightWOW_soft_aes_template_win.inc
INCLUDE CryptonightR_soft_aes_template_win.inc
CryptonightR_instruction0:
imul rbx, rbx

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,266 @@
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightWOW_soft_aes_template_part1):
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movd xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movd xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movd xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movd xmm10, QWORD PTR [r10+96]
movd xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movd xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movd xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movd xmm0, r9
mov r10d, DWORD PTR [r13+4]
movd xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movd r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movd rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movd rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
FN_PREFIX(CryptonightWOW_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movd xmm0, rax
movd xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
FN_PREFIX(CryptonightWOW_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightWOW_soft_aes_template_end):

View file

@ -0,0 +1,266 @@
PUBLIC CryptonightWOW_soft_aes_template_part1
PUBLIC CryptonightWOW_soft_aes_template_mainloop
PUBLIC CryptonightWOW_soft_aes_template_part2
PUBLIC CryptonightWOW_soft_aes_template_part3
PUBLIC CryptonightWOW_soft_aes_template_end
ALIGN(64)
CryptonightWOW_soft_aes_template_part1:
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movd xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movd xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movd xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movd xmm10, QWORD PTR [r10+96]
movd xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movd xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movd xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightWOW_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movd xmm0, r9
mov r10d, DWORD PTR [r13+4]
movd xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movd r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movd rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movd rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
CryptonightWOW_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movd xmm0, rax
movd xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightWOW_soft_aes_template_mainloop
CryptonightWOW_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightWOW_soft_aes_template_end: