Merge pull request #2296 from SChernykh/dev

Fixed Zen3 asm for cn/upx2
This commit is contained in:
xmrig 2021-04-21 19:52:52 +07:00 committed by GitHub
commit ae6c536e98
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 14 deletions

View file

@@ -377,12 +377,15 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
_mm_store_si128(output + 1, xin1); _mm_store_si128(output + 1, xin1);
_mm_store_si128(output + 2, xin2); _mm_store_si128(output + 2, xin2);
_mm_store_si128(output + 3, xin3); _mm_store_si128(output + 3, xin3);
output += (64 << interleave) / sizeof(__m128i);
_mm_store_si128(output + 0, xin4); constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
_mm_store_si128(output + 1, xin5);
_mm_store_si128(output + 2, xin6); _mm_store_si128(output + output_increment + 0, xin4);
_mm_store_si128(output + 3, xin7); _mm_store_si128(output + output_increment + 1, xin5);
output += (64 << interleave) / sizeof(__m128i); _mm_store_si128(output + output_increment + 2, xin6);
_mm_store_si128(output + output_increment + 3, xin7);
output += output_increment * 2;
} }
} }
@@ -414,13 +417,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1); xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2); xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3); xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
input += (64 << interleave) / sizeof(__m128i);
xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4);
xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5);
xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6);
xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7);
input += (64 << interleave) / sizeof(__m128i);
constexpr int input_increment = (64 << interleave) / sizeof(__m128i);
xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4);
xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5);
xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6);
xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7);
input += input_increment * 2;
i += 8; i += 8;
if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) { if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) {

View file

@@ -34,7 +34,7 @@
movdqa XMMWORD PTR [rsp+32], xmm0 movdqa XMMWORD PTR [rsp+32], xmm0
stmxcsr DWORD PTR [rsp+24] stmxcsr DWORD PTR [rsp+24]
mov DWORD PTR [rsp+28], 24448 mov DWORD PTR [rsp+28], 16256
ldmxcsr DWORD PTR [rsp+28] ldmxcsr DWORD PTR [rsp+28]
mov rcx, QWORD PTR [rbx+56] mov rcx, QWORD PTR [rbx+56]

View file

@@ -34,7 +34,7 @@
movdqa XMMWORD PTR [rsp+32], xmm0 movdqa XMMWORD PTR [rsp+32], xmm0
stmxcsr DWORD PTR [rsp+24] stmxcsr DWORD PTR [rsp+24]
mov DWORD PTR [rsp+28], 24448 mov DWORD PTR [rsp+28], 16256
ldmxcsr DWORD PTR [rsp+28] ldmxcsr DWORD PTR [rsp+28]
mov rcx, QWORD PTR [rbx+56] mov rcx, QWORD PTR [rbx+56]