From 1a9eaaad8f9eb03efa5b139059f12c85677c62cd Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 29 Nov 2021 18:27:52 +0100 Subject: [PATCH 1/2] VAES crash fixes --- src/backend/cpu/platform/BasicCpuInfo.cpp | 2 +- src/crypto/cn/CryptoNight_x86_vaes.cpp | 40 +++++++++++------------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index d64612e57..42a82f6a6 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -140,7 +140,7 @@ static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO, static inline bool has_aes_ni() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 25); } static inline bool has_avx() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); } static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); } -static inline bool has_vaes() { return has_feature(EXTENDED_FEATURES, ECX_Reg, 1 << 9); } +static inline bool has_vaes() { return has_feature(EXTENDED_FEATURES, ECX_Reg, 1 << 9) && has_osxsave() && has_xcr_avx(); } static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); } static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 8); } static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); } diff --git a/src/crypto/cn/CryptoNight_x86_vaes.cpp b/src/crypto/cn/CryptoNight_x86_vaes.cpp index 177da813e..45b26d3a9 100644 --- a/src/crypto/cn/CryptoNight_x86_vaes.cpp +++ b/src/crypto/cn/CryptoNight_x86_vaes.cpp @@ -179,16 +179,16 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) if (props.half_mem() && !ctx->first_half) { const __m256i* p = reinterpret_cast(ctx->save_state); - xin01 = _mm256_load_si256(p + 0); - xin23 = _mm256_load_si256(p + 1); - xin45 = _mm256_load_si256(p + 2); - xin67 = _mm256_load_si256(p + 3); + xin01 = _mm256_loadu_si256(p + 0); + xin23 = _mm256_loadu_si256(p + 1); + xin45 = _mm256_loadu_si256(p + 2); + xin67 = _mm256_loadu_si256(p + 3); } else { - xin01 = _mm256_load_si256(reinterpret_cast(input + 4)); - xin23 = _mm256_load_si256(reinterpret_cast(input + 6)); - xin45 = _mm256_load_si256(reinterpret_cast(input + 8)); - xin67 = _mm256_load_si256(reinterpret_cast(input + 10)); + xin01 = _mm256_loadu_si256(reinterpret_cast(input + 4)); + xin23 = _mm256_loadu_si256(reinterpret_cast(input + 6)); + xin45 = _mm256_loadu_si256(reinterpret_cast(input + 8)); + xin67 = _mm256_loadu_si256(reinterpret_cast(input + 10)); } constexpr int output_increment = 64 / sizeof(__m256i); @@ -228,10 +228,10 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) if (props.half_mem() && ctx->first_half) { __m256i* p = reinterpret_cast<__m256i*>(ctx->save_state); - _mm256_store_si256(p + 0, xin01); - _mm256_store_si256(p + 1, xin23); - _mm256_store_si256(p + 2, xin45); - _mm256_store_si256(p + 3, xin67); + _mm256_storeu_si256(p + 0, xin01); + _mm256_storeu_si256(p + 1, xin23); + _mm256_storeu_si256(p + 2, xin45); + _mm256_storeu_si256(p + 3, xin67); } _mm256_zeroupper(); @@ -347,10 +347,10 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) vaes_genkey(reinterpret_cast<__m128i*>(output) + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); - xout01 = _mm256_load_si256(output + 2); - xout23 = _mm256_load_si256(output + 3); - xout45 = _mm256_load_si256(output + 4); - xout67 = _mm256_load_si256(output + 5); + xout01 = _mm256_loadu_si256(output + 2); + xout23 = _mm256_loadu_si256(output + 3); + xout45 = _mm256_loadu_si256(output + 4); + xout67 = _mm256_loadu_si256(output + 5); const __m256i* input_begin = input; for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) { @@ -390,10 +390,10 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) } } - _mm256_store_si256(output + 2, xout01); - _mm256_store_si256(output + 3, xout23); - _mm256_store_si256(output + 4, xout45); - _mm256_store_si256(output + 5, xout67); + _mm256_storeu_si256(output + 2, xout01); + _mm256_storeu_si256(output + 3, xout23); + _mm256_storeu_si256(output + 4, xout45); + _mm256_storeu_si256(output + 5, xout67); _mm256_zeroupper(); } From a98db529fb08d86bd62665831f3d818a41ceaa85 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 29 Nov 2021 21:58:24 +0100 Subject: [PATCH 2/2] Explicitly use QueryPerformanceCounter() on Windows --- src/crypto/ghostrider/ghostrider.cpp | 29 ++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/crypto/ghostrider/ghostrider.cpp b/src/crypto/ghostrider/ghostrider.cpp index a23150fce..0a526dd41 100644 --- a/src/crypto/ghostrider/ghostrider.cpp +++ b/src/crypto/ghostrider/ghostrider.cpp @@ -61,6 +61,10 @@ # include #endif +#ifdef XMRIG_OS_WIN +# include +#endif + #define CORE_HASH(i, x) static void h##i(const uint8_t* data, size_t size, uint8_t* output) \ { \ sph_##x##_context ctx; \ @@ -328,7 +332,16 @@ void benchmark() LOG_VERBOSE("%24s | N | Hashrate", "Algorithm"); LOG_VERBOSE("-------------------------|-----|-------------"); +# ifdef XMRIG_OS_WIN + LARGE_INTEGER timer_freq; + QueryPerformanceFrequency(&timer_freq); + auto measure_time = []() { LARGE_INTEGER t; QueryPerformanceCounter(&t); return t.QuadPart; }; + auto delta_time = [&timer_freq](LONGLONG t1, LONGLONG t2) { return static_cast(t2 - t1) / timer_freq.QuadPart; }; +# else using namespace std::chrono; + auto measure_time = []() { return high_resolution_clock::now(); }; + auto delta_time = [](const high_resolution_clock::time_point& t1, const high_resolution_clock::time_point& t2) { return duration_cast(t2 - t1).count() / 1e9; }; +# endif for (uint32_t algo = 0; algo < 6; ++algo) { for (uint64_t step : { 1, 2, 4}) { @@ -339,20 +352,20 @@ void benchmark() auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO); - const high_resolution_clock::time_point start_time = high_resolution_clock::now(); + auto start_time = measure_time(); double min_dt = 1e10; for (uint32_t iter = 0;; ++iter) { - const high_resolution_clock::time_point t1 = high_resolution_clock::now(); + auto t1 = measure_time(); // Stop after 15 milliseconds, but only if at least 10 iterations were done - if ((iter >= 10) && (duration_cast(t1 - start_time).count() >= 15)) { + if ((iter >= 10) && (delta_time(start_time, t1) >= 0.015)) { break; } f(buf, sizeof(buf), hash, ctx, 0); - const double dt = duration_cast(high_resolution_clock::now() - t1).count() / 1e9; + const double dt = delta_time(t1, measure_time()); if (dt < min_dt) { min_dt = dt; } @@ -388,14 +401,14 @@ void benchmark() auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO); - const high_resolution_clock::time_point start_time = high_resolution_clock::now(); + auto start_time = measure_time(); double min_dt = 1e10; for (uint32_t iter = 0;; ++iter) { - const high_resolution_clock::time_point t1 = high_resolution_clock::now(); + auto t1 = measure_time(); // Stop after 30 milliseconds, but only if at least 10 iterations were done - if ((iter >= 10) && (duration_cast(t1 - start_time).count() >= 30)) { + if ((iter >= 10) && (delta_time(start_time, t1) >= 0.03)) { break; } @@ -403,7 +416,7 @@ void benchmark() f(buf, sizeof(buf), hash, ctx, 0); helper->wait(); - const double dt = duration_cast(high_resolution_clock::now() - t1).count() / 1e9; + const double dt = delta_time(t1, measure_time()); if (dt < min_dt) { min_dt = dt; }