diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h index 25eeb908..cc88342b 100644 --- a/src/crypto/cn/CryptoNight_x86.h +++ b/src/crypto/cn/CryptoNight_x86.h @@ -377,12 +377,15 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) _mm_store_si128(output + 1, xin1); _mm_store_si128(output + 2, xin2); _mm_store_si128(output + 3, xin3); - output += (64 << interleave) / sizeof(__m128i); - _mm_store_si128(output + 0, xin4); - _mm_store_si128(output + 1, xin5); - _mm_store_si128(output + 2, xin6); - _mm_store_si128(output + 3, xin7); - output += (64 << interleave) / sizeof(__m128i); + + constexpr int output_increment = (64 << interleave) / sizeof(__m128i); + + _mm_store_si128(output + output_increment + 0, xin4); + _mm_store_si128(output + output_increment + 1, xin5); + _mm_store_si128(output + output_increment + 2, xin6); + _mm_store_si128(output + output_increment + 3, xin7); + + output += output_increment * 2; } } @@ -414,13 +417,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1); xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2); xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3); - input += (64 << interleave) / sizeof(__m128i); - xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4); - xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5); - xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6); - xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7); - input += (64 << interleave) / sizeof(__m128i); + constexpr int input_increment = (64 << interleave) / sizeof(__m128i); + + xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7); + + input += input_increment * 2; i += 8; if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) { diff --git a/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc index 4f6b70a0..14222dac 100644 --- a/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc +++ b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc @@ -34,7 +34,7 @@ movdqa XMMWORD PTR [rsp+32], xmm0 stmxcsr DWORD PTR [rsp+24] - mov DWORD PTR [rsp+28], 24448 + mov DWORD PTR [rsp+28], 16256 ldmxcsr DWORD PTR [rsp+28] mov rcx, QWORD PTR [rbx+56] diff --git a/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc index 854fbf11..00fabd6d 100644 --- a/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc +++ b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc @@ -34,7 +34,7 @@ movdqa XMMWORD PTR [rsp+32], xmm0 stmxcsr DWORD PTR [rsp+24] - mov DWORD PTR [rsp+28], 24448 + mov DWORD PTR [rsp+28], 16256 ldmxcsr DWORD PTR [rsp+28] mov rcx, QWORD PTR [rbx+56]