Merge pull request #2751 from SChernykh/dev

VAES crash fixes
2025-01-09 12:29:24 +00:00 · 2021-11-30 09:49:37 +07:00 · 2021-11-30 09:49:37 +07:00 · e673d541c1
commit e673d541c1
parent be5fbca9b6 a98db529fb
3 changed files with 42 additions and 29 deletions
--- a/src/backend/cpu/platform/BasicCpuInfo.cpp
+++ b/src/backend/cpu/platform/BasicCpuInfo.cpp
@ -140,7 +140,7 @@ static inline bool has_osxsave()    { return has_feature(PROCESSOR_INFO,
 static inline bool has_aes_ni()     { return has_feature(PROCESSOR_INFO,        ECX_Reg, 1 << 25); }
 static inline bool has_avx()        { return has_feature(PROCESSOR_INFO,        ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); }
 static inline bool has_avx2()       { return has_feature(EXTENDED_FEATURES,     EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); }
-static inline bool has_vaes()       { return has_feature(EXTENDED_FEATURES,     ECX_Reg, 1 << 9); }
+static inline bool has_vaes()       { return has_feature(EXTENDED_FEATURES,     ECX_Reg, 1 << 9) && has_osxsave() && has_xcr_avx(); }
 static inline bool has_avx512f()    { return has_feature(EXTENDED_FEATURES,     EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); }
 static inline bool has_bmi2()       { return has_feature(EXTENDED_FEATURES,     EBX_Reg, 1 << 8); }
 static inline bool has_pdpe1gb()    { return has_feature(PROCESSOR_EXT_INFO,    EDX_Reg, 1 << 26); }
--- a/src/crypto/cn/CryptoNight_x86_vaes.cpp
+++ b/src/crypto/cn/CryptoNight_x86_vaes.cpp
@ -179,16 +179,16 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)

    if (props.half_mem() && !ctx->first_half) {
        const __m256i* p = reinterpret_cast<const __m256i*>(ctx->save_state);
-        xin01 = _mm256_load_si256(p + 0);
-        xin23 = _mm256_load_si256(p + 1);
-        xin45 = _mm256_load_si256(p + 2);
-        xin67 = _mm256_load_si256(p + 3);
+        xin01 = _mm256_loadu_si256(p + 0);
+        xin23 = _mm256_loadu_si256(p + 1);
+        xin45 = _mm256_loadu_si256(p + 2);
+        xin67 = _mm256_loadu_si256(p + 3);
    }
    else {
-        xin01 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 4));
-        xin23 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 6));
-        xin45 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 8));
-        xin67 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 10));
+        xin01 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 4));
+        xin23 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 6));
+        xin45 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 8));
+        xin67 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 10));
    }

    constexpr int output_increment = 64 / sizeof(__m256i);
@ -228,10 +228,10 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)

    if (props.half_mem() && ctx->first_half) {
        __m256i* p = reinterpret_cast<__m256i*>(ctx->save_state);
-        _mm256_store_si256(p + 0, xin01);
-        _mm256_store_si256(p + 1, xin23);
-        _mm256_store_si256(p + 2, xin45);
-        _mm256_store_si256(p + 3, xin67);
+        _mm256_storeu_si256(p + 0, xin01);
+        _mm256_storeu_si256(p + 1, xin23);
+        _mm256_storeu_si256(p + 2, xin45);
+        _mm256_storeu_si256(p + 3, xin67);
    }

    _mm256_zeroupper();
@ -347,10 +347,10 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)

    vaes_genkey(reinterpret_cast<__m128i*>(output) + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);

-    xout01 = _mm256_load_si256(output + 2);
-    xout23 = _mm256_load_si256(output + 3);
-    xout45 = _mm256_load_si256(output + 4);
-    xout67 = _mm256_load_si256(output + 5);
+    xout01 = _mm256_loadu_si256(output + 2);
+    xout23 = _mm256_loadu_si256(output + 3);
+    xout45 = _mm256_loadu_si256(output + 4);
+    xout67 = _mm256_loadu_si256(output + 5);

    const __m256i* input_begin = input;
    for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
@ -390,10 +390,10 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)
        }
    }

-    _mm256_store_si256(output + 2, xout01);
-    _mm256_store_si256(output + 3, xout23);
-    _mm256_store_si256(output + 4, xout45);
-    _mm256_store_si256(output + 5, xout67);
+    _mm256_storeu_si256(output + 2, xout01);
+    _mm256_storeu_si256(output + 3, xout23);
+    _mm256_storeu_si256(output + 4, xout45);
+    _mm256_storeu_si256(output + 5, xout67);

    _mm256_zeroupper();
 }
--- a/src/crypto/ghostrider/ghostrider.cpp
+++ b/src/crypto/ghostrider/ghostrider.cpp
@ -61,6 +61,10 @@
 #   include <intrin.h>
 #endif

+#ifdef XMRIG_OS_WIN
+#   include <Windows.h>
+#endif
+
 #define CORE_HASH(i, x) static void h##i(const uint8_t* data, size_t size, uint8_t* output) \
 { \
    sph_##x##_context ctx; \
@ -328,7 +332,16 @@ void benchmark()
        LOG_VERBOSE("%24s |  N  | Hashrate", "Algorithm");
        LOG_VERBOSE("-------------------------|-----|-------------");

+#       ifdef XMRIG_OS_WIN
+        LARGE_INTEGER timer_freq;
+        QueryPerformanceFrequency(&timer_freq);
+        auto measure_time = []() { LARGE_INTEGER t; QueryPerformanceCounter(&t); return t.QuadPart; };
+        auto delta_time = [&timer_freq](LONGLONG t1, LONGLONG t2) { return static_cast<double>(t2 - t1) / timer_freq.QuadPart; };
+#       else
        using namespace std::chrono;
+        auto measure_time = []() { return high_resolution_clock::now(); };
+        auto delta_time = [](const high_resolution_clock::time_point& t1, const high_resolution_clock::time_point& t2) { return duration_cast<nanoseconds>(t2 - t1).count() / 1e9; };
+#       endif

        for (uint32_t algo = 0; algo < 6; ++algo) {
            for (uint64_t step : { 1, 2, 4}) {
@ -339,20 +352,20 @@ void benchmark()

                auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO);

-                const high_resolution_clock::time_point start_time = high_resolution_clock::now();
+                auto start_time = measure_time();

                double min_dt = 1e10;
                for (uint32_t iter = 0;; ++iter) {
-                    const high_resolution_clock::time_point t1 = high_resolution_clock::now();
+                    auto t1 = measure_time();

                    // Stop after 15 milliseconds, but only if at least 10 iterations were done
-                    if ((iter >= 10) && (duration_cast<milliseconds>(t1 - start_time).count() >= 15)) {
+                    if ((iter >= 10) && (delta_time(start_time, t1) >= 0.015)) {
                        break;
                    }

                    f(buf, sizeof(buf), hash, ctx, 0);

-                    const double dt = duration_cast<nanoseconds>(high_resolution_clock::now() - t1).count() / 1e9;
+                    const double dt = delta_time(t1, measure_time());
                    if (dt < min_dt) {
                        min_dt = dt;
                    }
@ -388,14 +401,14 @@ void benchmark()

                auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO);

-                const high_resolution_clock::time_point start_time = high_resolution_clock::now();
+                auto start_time = measure_time();

                double min_dt = 1e10;
                for (uint32_t iter = 0;; ++iter) {
-                    const high_resolution_clock::time_point t1 = high_resolution_clock::now();
+                    auto t1 = measure_time();

                    // Stop after 30 milliseconds, but only if at least 10 iterations were done
-                    if ((iter >= 10) && (duration_cast<milliseconds>(t1 - start_time).count() >= 30)) {
+                    if ((iter >= 10) && (delta_time(start_time, t1) >= 0.03)) {
                        break;
                    }

@ -403,7 +416,7 @@ void benchmark()
                    f(buf, sizeof(buf), hash, ctx, 0);
                    helper->wait();

-                    const double dt = duration_cast<nanoseconds>(high_resolution_clock::now() - t1).count() / 1e9;
+                    const double dt = delta_time(t1, measure_time());
                    if (dt < min_dt) {
                        min_dt = dt;
                    }