Merge pull request #1830 from SChernykh/dev

RandomX: added performance profiler (for developers)
2025-03-29 10:38:57 +00:00 · 2020-09-13 04:38:03 +07:00 · 2020-09-13 04:38:03 +07:00 · fa0bb0e1bf
commit fa0bb0e1bf
parent adf833b60a a05393727c
19 changed files with 390 additions and 481 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -23,6 +23,7 @@ option(WITH_NVML            "Enable NVML (NVIDIA Management Library) support (on
 option(WITH_ADL             "Enable ADL (AMD Display Library) or sysfs support (only if OpenCL backend enabled)" ON)
 option(WITH_STRICT_CACHE    "Enable strict checks for OpenCL cache" ON)
 option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
+option(WITH_PROFILING       "Enable profiling for developers" OFF)

 option(BUILD_STATIC         "Build static binary" OFF)
 option(ARM_TARGET           "Force use specific ARM target 8 or 7" 0)
--- a/src/base/base.cmake
+++ b/src/base/base.cmake
@ -222,3 +222,15 @@ if (WITH_KAWPOW)
        src/base/net/stratum/EthStratumClient.cpp
        )
 endif()
+
+# Developer-only profiler: when WITH_PROFILING is ON, define
+# XMRIG_FEATURE_PROFILING for the whole build and add the profiler
+# header/source to the base library file lists.
+if (WITH_PROFILING)
+    add_definitions(/DXMRIG_FEATURE_PROFILING)
+
+    list(APPEND HEADERS_BASE
+        src/base/tools/Profiler.h
+        )
+
+    list(APPEND SOURCES_BASE
+        src/base/tools/Profiler.cpp
+        )
+endif()
--- a/src/base/io/log/Tags.cpp
+++ b/src/base/io/log/Tags.cpp
@ -101,3 +101,13 @@ const char *xmrig::Tags::opencl()
    return tag;
 }
 #endif
+
+
+#ifdef XMRIG_FEATURE_PROFILING
+const char* xmrig::Tags::profiler()
+{
+    // Log-line prefix used by the profiler output; built once and reused.
+    static const char* const kProfilerTag = CYAN_BG_BOLD(WHITE_BOLD_S " profile ");
+    return kProfilerTag;
+}
+#endif
--- a/src/base/io/log/Tags.h
+++ b/src/base/io/log/Tags.h
@ -53,6 +53,10 @@ public:
 #   ifdef XMRIG_FEATURE_OPENCL
    static const char *opencl();
 #   endif
+
+#   ifdef XMRIG_FEATURE_PROFILING
+    static const char* profiler();
+#   endif
 };


--- a/src/base/tools/Profiler.cpp
+++ b/src/base/tools/Profiler.cpp
@ -0,0 +1,100 @@
+/* XMRig
+ * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "base/tools/Profiler.h"
+#include "base/io/log/Log.h"
+#include "base/io/log/Tags.h"
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+#include <sstream>
+#include <string>
+#include <thread>
+
+
+#ifdef XMRIG_FEATURE_PROFILING
+
+
+ProfileScopeData* ProfileScopeData::s_data[MAX_DATA_COUNT] = {};
+volatile long ProfileScopeData::s_dataCount = 0;
+double ProfileScopeData::s_tscSpeed = 0.0;
+
+
+#ifndef NOINLINE
+#ifdef __GNUC__
+#define NOINLINE __attribute__ ((noinline))
+#elif _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE
+#endif
+#endif
+
+
+// Returns the calling thread's std::thread::id rendered as text, cut to at
+// most ProfileScopeData::MAX_THREAD_ID_LENGTH characters so that the text
+// plus its terminator fits ProfileScopeData::m_threadId.
+static std::string get_thread_id()
+{
+    std::stringstream rendered;
+    rendered << std::this_thread::get_id();
+
+    // substr() clamps the count to the available length, so shorter ids are
+    // returned unchanged.
+    return rendered.str().substr(0, ProfileScopeData::MAX_THREAD_ID_LENGTH);
+}
+
+
+// Adds `data` to the global s_data table so the periodic report can find it.
+//
+// A slot index is reserved with an atomic increment of s_dataCount. The
+// counter is bumped even when the table is already full, so it may exceed
+// MAX_DATA_COUNT; in that case the scope is simply not recorded.
+NOINLINE void ProfileScopeData::Register(ProfileScopeData* data)
+{
+#ifdef _MSC_VER
+    const long id = _InterlockedIncrement(&s_dataCount) - 1;
+#else
+    const long id = __sync_fetch_and_add(&s_dataCount, 1);
+#endif
+
+    if (static_cast<unsigned long>(id) < MAX_DATA_COUNT) {
+        // Fill in the thread id *before* the entry becomes reachable through
+        // s_data: the report runs on another thread and compares m_threadId
+        // with strcmp(), so it should not observe a half-written id.
+        // NOTE(review): there is no explicit fence here; readers must still
+        // tolerate a reserved slot whose pointer has not been stored yet.
+        const std::string s = get_thread_id();
+        memcpy(data->m_threadId, s.c_str(), s.length() + 1);
+
+        s_data[id] = data;
+    }
+}
+
+
+// Calibrates the time-stamp counter: busy-waits for a little over one second
+// of wall time, stores the measured TSC ticks per second in s_tscSpeed and
+// logs the result in GHz. Blocks the calling thread for the whole interval.
+NOINLINE void ProfileScopeData::Init()
+{
+    using namespace std::chrono;
+
+    // steady_clock is monotonic. high_resolution_clock may be an alias of the
+    // adjustable system clock, and a backwards step there would wrap the
+    // unsigned elapsed time and produce a bogus TSC speed.
+    const steady_clock::time_point startTime = steady_clock::now();
+    const uint64_t startCount = ReadTSC();
+
+    for (;;)
+    {
+        const uint64_t elapsedNs = static_cast<uint64_t>(duration_cast<nanoseconds>(steady_clock::now() - startTime).count());
+        const uint64_t endCount = ReadTSC();
+
+        // 1e9 ns == 1 s; scale the tick delta to ticks per second.
+        if (elapsedNs > 1000000000) {
+            s_tscSpeed = (endCount - startCount) * 1e9 / elapsedNs;
+            LOG_INFO("%s TSC speed = %.3f GHz", xmrig::Tags::profiler(), s_tscSpeed / 1e9);
+            return;
+        }
+    }
+}
+
+
+#endif /* XMRIG_FEATURE_PROFILING */
--- a/src/base/tools/Profiler.h
+++ b/src/base/tools/Profiler.h
@ -0,0 +1,132 @@
+/* XMRig
+ * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_PROFILER_H
+#define XMRIG_PROFILER_H
+
+
+#ifndef FORCE_INLINE
+#if defined(_MSC_VER)
+#define FORCE_INLINE __forceinline
+#elif defined(__GNUC__)
+#define FORCE_INLINE __attribute__((always_inline)) inline
+#elif defined(__clang__)
+#define FORCE_INLINE __inline__
+#else
+#define FORCE_INLINE
+#endif
+#endif
+
+
+#ifdef XMRIG_FEATURE_PROFILING
+
+
+#include <cstdint>
+#include <type_traits>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+
+// Reads the CPU time-stamp counter (x86 RDTSC) as a 64-bit tick count.
+static FORCE_INLINE uint64_t ReadTSC()
+{
+#ifdef _MSC_VER
+    return __rdtsc();
+#else
+    // RDTSC delivers the counter split across EDX (high half) and EAX (low half).
+    uint32_t high, low;
+    __asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));
+    return (static_cast<uint64_t>(high) << 32) | low;
+#endif
+}
+
+
+// Accumulator for one profiled scope. PROFILE_SCOPE creates these as
+// `static thread_local` aggregates, ProfileScope updates the counters, and
+// the periodic hashrate report reads them through s_data.
+struct ProfileScopeData
+{
+    // Scope label; PROFILE_SCOPE initializes it with the stringized macro argument.
+    const char* m_name;
+    // Sum of TSC ticks spent inside the scope (added by ~ProfileScope).
+    uint64_t m_totalCycles;
+    // Number of completed passes through the scope (incremented by ~ProfileScope).
+    uint32_t m_totalSamples;
+
+    enum
+    {
+        // Maximum characters of the textual thread id kept in m_threadId.
+        MAX_THREAD_ID_LENGTH = 11,
+        // Not referenced by the code shown in this change.
+        MAX_SAMPLE_COUNT = 128,
+        // Capacity of the s_data registry.
+        MAX_DATA_COUNT = 1024
+    };
+
+    // NUL-terminated textual id of the owning thread, written by Register().
+    char m_threadId[MAX_THREAD_ID_LENGTH + 1];
+
+    // Registry of scopes recorded by Register().
+    static ProfileScopeData* s_data[MAX_DATA_COUNT];
+    // Number of Register() calls so far; may exceed MAX_DATA_COUNT because it
+    // is incremented even when the registry is full.
+    static volatile long s_dataCount;
+    // TSC ticks per second as measured by Init(); 0.0 until Init() has run.
+    static double s_tscSpeed;
+
+    // Records `data` in s_data (if there is room) and stores the calling
+    // thread's id in data->m_threadId.
+    static void Register(ProfileScopeData* data);
+    // Measures s_tscSpeed; busy-waits for about one second.
+    static void Init();
+};
+
+static_assert(std::is_trivial<ProfileScopeData>::value, "ProfileScopeData must be a trivial struct");
+static_assert(sizeof(ProfileScopeData) <= 32, "ProfileScopeData struct is too big");
+
+
+// Scope guard that measures how many TSC ticks elapse between its
+// construction and destruction and adds them to a ProfileScopeData.
+// Normally created through the PROFILE_SCOPE macro.
+class ProfileScope
+{
+public:
+    // Starts timing. `data` must outlive this guard. A ProfileScopeData whose
+    // m_totalCycles is still 0 is treated as "not yet seen" and registered
+    // first, so the registration cost is not part of the measured interval.
+    FORCE_INLINE explicit ProfileScope(ProfileScopeData& data)
+        : m_data(data)
+    {
+        if (m_data.m_totalCycles == 0) {
+            ProfileScopeData::Register(&data);
+        }
+
+        m_startCounter = ReadTSC();
+    }
+
+    // A copied guard would add the same interval to the totals twice.
+    ProfileScope(const ProfileScope&) = delete;
+    ProfileScope& operator=(const ProfileScope&) = delete;
+
+    // Stops timing: accumulates the elapsed ticks and counts one sample.
+    FORCE_INLINE ~ProfileScope()
+    {
+        m_data.m_totalCycles += ReadTSC() - m_startCounter;
+        ++m_data.m_totalSamples;
+    }
+
+private:
+    ProfileScopeData& m_data;
+    uint64_t m_startCounter;
+};
+
+
+// Profiles the rest of the enclosing block under the label `x`.
+// Expands to two declarations: a `static thread_local ProfileScopeData`
+// named x_data (m_name set to the text of `x`, remaining members
+// zero-initialized) and a ProfileScope guard named `x` bound to it.
+// `x` must therefore be a valid identifier that is unique in the block.
+#define PROFILE_SCOPE(x) static thread_local ProfileScopeData x##_data{#x}; ProfileScope x(x##_data);
+
+
+#else /* XMRIG_FEATURE_PROFILING */
+#define PROFILE_SCOPE(x)
+#endif /* XMRIG_FEATURE_PROFILING */
+
+
+#include "crypto/randomx/blake2/blake2.h"
+
+
+// Thin wrapper that runs rx_blake2b() inside a PROFILE_SCOPE named
+// RandomX_Blake2b, so Blake2b time shows up in the profiler report.
+// When profiling is disabled PROFILE_SCOPE expands to nothing and this is a
+// plain forwarding call.
+struct rx_blake2b_wrapper
+{
+    // Forwards all arguments to rx_blake2b(); its int result is discarded.
+    FORCE_INLINE static void run(void* out, size_t outlen, const void* in, size_t inlen)
+    {
+        PROFILE_SCOPE(RandomX_Blake2b);
+        rx_blake2b(out, outlen, in, inlen);
+    }
+};
+
+
+#endif /* XMRIG_PROFILER_H */
--- a/src/core/Miner.cpp
+++ b/src/core/Miner.cpp
@ -38,6 +38,7 @@
 #include "base/kernel/Platform.h"
 #include "base/net/stratum/Job.h"
 #include "base/tools/Object.h"
+#include "base/tools/Profiler.h"
 #include "base/tools/Timer.h"
 #include "core/config/Config.h"
 #include "core/Controller.h"
@ -267,6 +268,44 @@ public:
            h = "MH/s";
        }

+#       ifdef XMRIG_FEATURE_PROFILING
+        ProfileScopeData* data[ProfileScopeData::MAX_DATA_COUNT];
+
+        const uint32_t n = std::min<uint32_t>(ProfileScopeData::s_dataCount, ProfileScopeData::MAX_DATA_COUNT);
+        memcpy(data, ProfileScopeData::s_data, n * sizeof(ProfileScopeData*));
+
+        std::sort(data, data + n, [](ProfileScopeData* a, ProfileScopeData* b) {
+            return strcmp(a->m_threadId, b->m_threadId) < 0;
+        });
+
+        for (uint32_t i = 0; i < n;)
+        {
+            uint32_t n1 = i;
+            while ((n1 < n) && (strcmp(data[i]->m_threadId, data[n1]->m_threadId) == 0)) {
+                ++n1;
+            }
+
+            std::sort(data + i, data + n1, [](ProfileScopeData* a, ProfileScopeData* b) {
+                return a->m_totalCycles > b->m_totalCycles;
+            });
+
+            for (uint32_t j = i; j < n1; ++j) {
+                ProfileScopeData* p = data[j];
+                LOG_INFO("%s Thread %6s | %-30s | %7.3f%% | %9.0f ns",
+                    Tags::profiler(),
+                    p->m_threadId,
+                    p->m_name,
+                    p->m_totalCycles * 100.0 / data[i]->m_totalCycles,
+                    p->m_totalCycles / p->m_totalSamples * 1e9 / ProfileScopeData::s_tscSpeed
+                );
+            }
+
+            LOG_INFO("%s --------------|--------------------------------|----------|-------------", Tags::profiler());
+
+            i = n1;
+        }
+#       endif
+
        LOG_INFO("%s " WHITE_BOLD("speed") " 10s/60s/15m " CYAN_BOLD("%s") CYAN(" %s %s ") CYAN_BOLD("%s") " max " CYAN_BOLD("%s %s"),
                 Tags::miner(),
                 Hashrate::format(speed[0] * scale,                 num,          sizeof(num) / 4),
@ -311,6 +350,10 @@ xmrig::Miner::Miner(Controller *controller)
        Platform::setThreadPriority(std::min(priority + 1, 5));
    }

+#   ifdef XMRIG_FEATURE_PROFILING
+    ProfileScopeData::Init();
+#   endif
+
 #   ifdef XMRIG_ALGO_RANDOMX
    Rx::init(this);
 #   endif
--- a/src/crypto/randomx/aes_hash.cpp
+++ b/src/crypto/randomx/aes_hash.cpp
@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "crypto/randomx/soft_aes.h"
 #include "crypto/randomx/randomx.h"
+#include "base/tools/Profiler.h"

 #define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
 #define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
@ -215,6 +216,8 @@ template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);

 template<bool softAes>
 void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
+	PROFILE_SCOPE(RandomX_AES);
+
 	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
 	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

--- a/src/crypto/randomx/blake2/blake2.h
+++ b/src/crypto/randomx/blake2/blake2.h
@ -92,7 +92,7 @@ extern "C" {
    int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen);

 	/* Simple API */
-    int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen);
+    int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen);

 	/* Argon2 Team - Begin Code */
 	int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
--- a/src/crypto/randomx/blake2/blake2b-load-sse41.h
+++ b/src/crypto/randomx/blake2/blake2b-load-sse41.h
@ -1,402 +0,0 @@
-/*
-   BLAKE2 reference source code package - optimized C implementations
-
-   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
-   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
-   your option.  The terms of these licenses can be found at:
-
-   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
-   - OpenSSL license   : https://www.openssl.org/source/license.html
-   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
-
-   More information about the BLAKE2 hash function can be found at
-   https://blake2.net.
-*/
-#ifndef BLAKE2B_LOAD_SSE41_H
-#define BLAKE2B_LOAD_SSE41_H
-
-#define LOAD_MSG_0_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m0, m1); \
-b1 = _mm_unpacklo_epi64(m2, m3); \
-} while(0)
-
-
-#define LOAD_MSG_0_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m0, m1); \
-b1 = _mm_unpackhi_epi64(m2, m3); \
-} while(0)
-
-
-#define LOAD_MSG_0_3(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m4, m5); \
-b1 = _mm_unpacklo_epi64(m6, m7); \
-} while(0)
-
-
-#define LOAD_MSG_0_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m4, m5); \
-b1 = _mm_unpackhi_epi64(m6, m7); \
-} while(0)
-
-
-#define LOAD_MSG_1_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m7, m2); \
-b1 = _mm_unpackhi_epi64(m4, m6); \
-} while(0)
-
-
-#define LOAD_MSG_1_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m5, m4); \
-b1 = _mm_alignr_epi8(m3, m7, 8); \
-} while(0)
-
-
-#define LOAD_MSG_1_3(b0, b1) \
-do \
-{ \
-b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
-b1 = _mm_unpackhi_epi64(m5, m2); \
-} while(0)
-
-
-#define LOAD_MSG_1_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m6, m1); \
-b1 = _mm_unpackhi_epi64(m3, m1); \
-} while(0)
-
-
-#define LOAD_MSG_2_1(b0, b1) \
-do \
-{ \
-b0 = _mm_alignr_epi8(m6, m5, 8); \
-b1 = _mm_unpackhi_epi64(m2, m7); \
-} while(0)
-
-
-#define LOAD_MSG_2_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m4, m0); \
-b1 = _mm_blend_epi16(m1, m6, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_2_3(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m5, m1, 0xF0); \
-b1 = _mm_unpackhi_epi64(m3, m4); \
-} while(0)
-
-
-#define LOAD_MSG_2_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m7, m3); \
-b1 = _mm_alignr_epi8(m2, m0, 8); \
-} while(0)
-
-
-#define LOAD_MSG_3_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m3, m1); \
-b1 = _mm_unpackhi_epi64(m6, m5); \
-} while(0)
-
-
-#define LOAD_MSG_3_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m4, m0); \
-b1 = _mm_unpacklo_epi64(m6, m7); \
-} while(0)
-
-
-#define LOAD_MSG_3_3(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m1, m2, 0xF0); \
-b1 = _mm_blend_epi16(m2, m7, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_3_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m3, m5); \
-b1 = _mm_unpacklo_epi64(m0, m4); \
-} while(0)
-
-
-#define LOAD_MSG_4_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m4, m2); \
-b1 = _mm_unpacklo_epi64(m1, m5); \
-} while(0)
-
-
-#define LOAD_MSG_4_2(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m0, m3, 0xF0); \
-b1 = _mm_blend_epi16(m2, m7, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_4_3(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m7, m5, 0xF0); \
-b1 = _mm_blend_epi16(m3, m1, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_4_4(b0, b1) \
-do \
-{ \
-b0 = _mm_alignr_epi8(m6, m0, 8); \
-b1 = _mm_blend_epi16(m4, m6, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_5_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m1, m3); \
-b1 = _mm_unpacklo_epi64(m0, m4); \
-} while(0)
-
-
-#define LOAD_MSG_5_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m6, m5); \
-b1 = _mm_unpackhi_epi64(m5, m1); \
-} while(0)
-
-
-#define LOAD_MSG_5_3(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m2, m3, 0xF0); \
-b1 = _mm_unpackhi_epi64(m7, m0); \
-} while(0)
-
-
-#define LOAD_MSG_5_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m6, m2); \
-b1 = _mm_blend_epi16(m7, m4, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_6_1(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m6, m0, 0xF0); \
-b1 = _mm_unpacklo_epi64(m7, m2); \
-} while(0)
-
-
-#define LOAD_MSG_6_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m2, m7); \
-b1 = _mm_alignr_epi8(m5, m6, 8); \
-} while(0)
-
-
-#define LOAD_MSG_6_3(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m0, m3); \
-b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
-} while(0)
-
-
-#define LOAD_MSG_6_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m3, m1); \
-b1 = _mm_blend_epi16(m1, m5, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_7_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m6, m3); \
-b1 = _mm_blend_epi16(m6, m1, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_7_2(b0, b1) \
-do \
-{ \
-b0 = _mm_alignr_epi8(m7, m5, 8); \
-b1 = _mm_unpackhi_epi64(m0, m4); \
-} while(0)
-
-
-#define LOAD_MSG_7_3(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m2, m7); \
-b1 = _mm_unpacklo_epi64(m4, m1); \
-} while(0)
-
-
-#define LOAD_MSG_7_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m0, m2); \
-b1 = _mm_unpacklo_epi64(m3, m5); \
-} while(0)
-
-
-#define LOAD_MSG_8_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m3, m7); \
-b1 = _mm_alignr_epi8(m0, m5, 8); \
-} while(0)
-
-
-#define LOAD_MSG_8_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m7, m4); \
-b1 = _mm_alignr_epi8(m4, m1, 8); \
-} while(0)
-
-
-#define LOAD_MSG_8_3(b0, b1) \
-do \
-{ \
-b0 = m6; \
-b1 = _mm_alignr_epi8(m5, m0, 8); \
-} while(0)
-
-
-#define LOAD_MSG_8_4(b0, b1) \
-do \
-{ \
-b0 = _mm_blend_epi16(m1, m3, 0xF0); \
-b1 = m2; \
-} while(0)
-
-
-#define LOAD_MSG_9_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m5, m4); \
-b1 = _mm_unpackhi_epi64(m3, m0); \
-} while(0)
-
-
-#define LOAD_MSG_9_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m1, m2); \
-b1 = _mm_blend_epi16(m3, m2, 0xF0); \
-} while(0)
-
-
-#define LOAD_MSG_9_3(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m7, m4); \
-b1 = _mm_unpackhi_epi64(m1, m6); \
-} while(0)
-
-
-#define LOAD_MSG_9_4(b0, b1) \
-do \
-{ \
-b0 = _mm_alignr_epi8(m7, m5, 8); \
-b1 = _mm_unpacklo_epi64(m6, m0); \
-} while(0)
-
-
-#define LOAD_MSG_10_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m0, m1); \
-b1 = _mm_unpacklo_epi64(m2, m3); \
-} while(0)
-
-
-#define LOAD_MSG_10_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m0, m1); \
-b1 = _mm_unpackhi_epi64(m2, m3); \
-} while(0)
-
-
-#define LOAD_MSG_10_3(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m4, m5); \
-b1 = _mm_unpacklo_epi64(m6, m7); \
-} while(0)
-
-
-#define LOAD_MSG_10_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpackhi_epi64(m4, m5); \
-b1 = _mm_unpackhi_epi64(m6, m7); \
-} while(0)
-
-
-#define LOAD_MSG_11_1(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m7, m2); \
-b1 = _mm_unpackhi_epi64(m4, m6); \
-} while(0)
-
-
-#define LOAD_MSG_11_2(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m5, m4); \
-b1 = _mm_alignr_epi8(m3, m7, 8); \
-} while(0)
-
-
-#define LOAD_MSG_11_3(b0, b1) \
-do \
-{ \
-b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
-b1 = _mm_unpackhi_epi64(m5, m2); \
-} while(0)
-
-
-#define LOAD_MSG_11_4(b0, b1) \
-do \
-{ \
-b0 = _mm_unpacklo_epi64(m6, m1); \
-b1 = _mm_unpackhi_epi64(m3, m1); \
-} while(0)
-
-
-#endif
--- a/src/crypto/randomx/blake2/blake2b-round.h
+++ b/src/crypto/randomx/blake2/blake2b-round.h
@ -102,17 +102,21 @@
  row4l = t1; \
  row4h = t0;

-#include "blake2b-load-sse41.h"
+/* Loads the four 64-bit message words for step `i` (0..3) of round `r` into
+   b0 and b1, using the index table blake2b_sigma_sse41. _mm_set_epi64x takes
+   the high element first, so table entries 4*i+0 / 4*i+2 end up in the low
+   halves of b0 / b1. Expects `m` (message block as 64-bit words) and
+   blake2b_sigma_sse41 to be visible at the expansion site. `r` and `i` are
+   parenthesized so that expression arguments index correctly. */
+#define LOAD_MSG(r, i, b0, b1) \
+do { \
+  b0 = _mm_set_epi64x(m[blake2b_sigma_sse41[(r)][(i) * 4 + 1]], m[blake2b_sigma_sse41[(r)][(i) * 4 + 0]]); \
+  b1 = _mm_set_epi64x(m[blake2b_sigma_sse41[(r)][(i) * 4 + 3]], m[blake2b_sigma_sse41[(r)][(i) * 4 + 2]]); \
+} while(0)

 #define ROUND(r) \
-  LOAD_MSG_ ##r ##_1(b0, b1); \
+  LOAD_MSG(r, 0, b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
-  LOAD_MSG_ ##r ##_2(b0, b1); \
+  LOAD_MSG(r, 1, b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
-  LOAD_MSG_ ##r ##_3(b0, b1); \
+  LOAD_MSG(r, 2, b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
-  LOAD_MSG_ ##r ##_4(b0, b1); \
+  LOAD_MSG(r, 3, b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

--- a/src/crypto/randomx/blake2/blake2b.c
+++ b/src/crypto/randomx/blake2/blake2b.c
@ -56,6 +56,23 @@ static const uint64_t blake2b_IV[8] = {
 	UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
 	UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };

+#if defined(_M_X64) || defined(__x86_64__)
+/* Message-word index table for the SSE4.1 compress function, read by the
+   LOAD_MSG macro: for round r and step i (0..3), entries [r][4*i+0..3] are
+   the indices of the 64-bit message words placed in b0 (low, high) and
+   b1 (low, high). Rows 10 and 11 repeat rows 0 and 1. */
+static const uint8_t blake2b_sigma_sse41[12][16] = {
+	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
+	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
+	{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
+	{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
+	{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
+	{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
+	{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
+	{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
+	{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
+	{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
+	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
+	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
+};
+#endif
+
 static const uint8_t blake2b_sigma[12][16] = {
 	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
 	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
@ -203,15 +220,6 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
 	const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 	const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);

-	const __m128i m0 = LOADU(block + 00);
-	const __m128i m1 = LOADU(block + 16);
-	const __m128i m2 = LOADU(block + 32);
-	const __m128i m3 = LOADU(block + 48);
-	const __m128i m4 = LOADU(block + 64);
-	const __m128i m5 = LOADU(block + 80);
-	const __m128i m6 = LOADU(block + 96);
-	const __m128i m7 = LOADU(block + 112);
-
 	row1l = LOADU(&S->h[0]);
 	row1h = LOADU(&S->h[2]);
 	row2l = LOADU(&S->h[4]);
@ -221,18 +229,11 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
 	row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
 	row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));

-	ROUND(0);
-	ROUND(1);
-	ROUND(2);
-	ROUND(3);
-	ROUND(4);
-	ROUND(5);
-	ROUND(6);
-	ROUND(7);
-	ROUND(8);
-	ROUND(9);
-	ROUND(10);
-	ROUND(11);
+	const uint64_t* m = (const uint64_t*)(block);
+
+	for (uint32_t r = 0; r < 12; ++r) {
+		ROUND(r);
+	}

 	row1l = _mm_xor_si128(row3l, row1l);
 	row1h = _mm_xor_si128(row3h, row1h);
@ -388,8 +389,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) {
 	return 0;
 }

-int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen,
-	const void *key, size_t keylen) {
+int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen) {
 	blake2b_state S;
 	int ret = -1;

@ -402,25 +402,14 @@ int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen,
 		goto fail;
 	}

-	if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
+	if (rx_blake2b_init(&S, outlen) < 0) {
 		goto fail;
 	}

-	if (keylen > 0) {
-        if (rx_blake2b_init_key(&S, outlen, key, keylen) < 0) {
-			goto fail;
-		}
-	}
-	else {
-        if (rx_blake2b_init(&S, outlen) < 0) {
-			goto fail;
-		}
-	}
-
-    if (rx_blake2b_update(&S, in, inlen) < 0) {
+	if (rx_blake2b_update(&S, in, inlen) < 0) {
 		goto fail;
 	}
-    ret = rx_blake2b_final(&S, out, outlen);
+	ret = rx_blake2b_final(&S, out, outlen);

 fail:
 	//clear_internal_memory(&S, sizeof(S));
@ -442,43 +431,42 @@ int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
 	store32(outlen_bytes, (uint32_t)outlen);

 #define TRY(statement)                                                         \
-    do {                                                                       \
-        ret = statement;                                                       \
-        if (ret < 0) {                                                         \
-            goto fail;                                                         \
-        }                                                                      \
-    } while ((void)0, 0)
+	do {                                                                       \
+		ret = statement;                                                       \
+		if (ret < 0) {                                                         \
+			goto fail;                                                         \
+		}                                                                      \
+	} while ((void)0, 0)

 	if (outlen <= BLAKE2B_OUTBYTES) {
-        TRY(rx_blake2b_init(&blake_state, outlen));
-        TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
-        TRY(rx_blake2b_update(&blake_state, in, inlen));
-        TRY(rx_blake2b_final(&blake_state, out, outlen));
+		TRY(rx_blake2b_init(&blake_state, outlen));
+		TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+		TRY(rx_blake2b_update(&blake_state, in, inlen));
+		TRY(rx_blake2b_final(&blake_state, out, outlen));
 	}
 	else {
 		uint32_t toproduce;
 		uint8_t out_buffer[BLAKE2B_OUTBYTES];
 		uint8_t in_buffer[BLAKE2B_OUTBYTES];
-        TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
-        TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
-        TRY(rx_blake2b_update(&blake_state, in, inlen));
-        TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
+		TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
+		TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+		TRY(rx_blake2b_update(&blake_state, in, inlen));
+		TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
 		memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
 		out += BLAKE2B_OUTBYTES / 2;
 		toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;

 		while (toproduce > BLAKE2B_OUTBYTES) {
 			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
-            TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
-				BLAKE2B_OUTBYTES, NULL, 0));
+			TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
+				BLAKE2B_OUTBYTES));
 			memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
 			out += BLAKE2B_OUTBYTES / 2;
 			toproduce -= BLAKE2B_OUTBYTES / 2;
 		}

 		memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
-        TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
-			0));
+		TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES));
 		memcpy(out, out_buffer, toproduce);
 	}
 fail:
--- a/src/crypto/randomx/blake2_generator.cpp
+++ b/src/crypto/randomx/blake2_generator.cpp
@ -55,7 +55,7 @@ namespace randomx {

 	void Blake2Generator::checkData(const size_t bytesNeeded) {
 		if (dataIndex + bytesNeeded > sizeof(data)) {
-            rx_blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
+			rx_blake2b(data, sizeof(data), data, sizeof(data));
 			dataIndex = 0;
 		}
 	}
--- a/src/crypto/randomx/jit_compiler_x86.cpp
+++ b/src/crypto/randomx/jit_compiler_x86.cpp
@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "crypto/randomx/program.hpp"
 #include "crypto/randomx/reciprocal.h"
 #include "crypto/randomx/virtual_memory.hpp"
+#include "base/tools/Profiler.h"

 #ifdef XMRIG_FIX_RYZEN
 #   include "crypto/rx/Rx.h"
@ -255,6 +256,8 @@ namespace randomx {
 	}

 	void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
+		PROFILE_SCOPE(RandomX_JIT_compile);
+
 		vm_flags = flags;

 		generateProgramPrologue(prog, pcfg);
--- a/src/crypto/randomx/randomx.cpp
+++ b/src/crypto/randomx/randomx.cpp
@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <cassert>

+#include "base/tools/Profiler.h"
+
 RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
 {
 	ArgonSalt = "RandomWOW\x01";
@ -574,33 +576,35 @@ extern "C" {
 		assert(inputSize == 0 || input != nullptr);
 		assert(output != nullptr);
 		alignas(16) uint64_t tempHash[8];
-		rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
+		rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize);
 		machine->initScratchpad(&tempHash);
 		machine->resetRoundingMode();
 		for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
 			machine->run(&tempHash);
-			rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
+			rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile));
 		}
 		machine->run(&tempHash);
-		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
+		machine->getFinalResult(output);
 	}

 	void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) {
-		rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
+		rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize);
 		machine->initScratchpad(tempHash);
 	}

 	void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) {
+		PROFILE_SCOPE(RandomX_hash);
+
 		machine->resetRoundingMode();
 		for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
 			machine->run(&tempHash);
-			rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
+			rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile));
 		}
 		machine->run(&tempHash);

 		// Finish current hash and fill the scratchpad for the next hash at the same time
-		rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0);
-		machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash);
+		rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), nextInput, nextInputSize);
+		machine->hashAndFill(output, tempHash);
 	}

 }
--- a/src/crypto/randomx/virtual_machine.cpp
+++ b/src/crypto/randomx/virtual_machine.cpp
@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "crypto/randomx/blake2/blake2.h"
 #include "crypto/randomx/intrin_portable.h"
 #include "crypto/randomx/allocator.hpp"
+#include "base/tools/Profiler.h"

 randomx_vm::~randomx_vm() {

@ -109,15 +110,15 @@ namespace randomx {
 	}

 	template<bool softAes>
-	void VmBase<softAes>::getFinalResult(void* out, size_t outSize) {
+	void VmBase<softAes>::getFinalResult(void* out) {
 		hashAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a);
-        rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
+		rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));
 	}

 	template<bool softAes>
-	void VmBase<softAes>::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) {
+	void VmBase<softAes>::hashAndFill(void* out, uint64_t (&fill_state)[8]) {
 		hashAndFillAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a, fill_state);
-        rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
+		rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));
 	}

 	template<bool softAes>
@ -127,6 +128,7 @@ namespace randomx {

 	template<bool softAes>
 	void VmBase<softAes>::generateProgram(void* seed) {
+		PROFILE_SCOPE(RandomX_generate_program);
 		fillAes4Rx4<softAes>(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program);
 	}

--- a/src/crypto/randomx/virtual_machine.hpp
+++ b/src/crypto/randomx/virtual_machine.hpp
@ -38,8 +38,8 @@ class randomx_vm
 public:
 	virtual ~randomx_vm() = 0;
 	virtual void setScratchpad(uint8_t *scratchpad) = 0;
-	virtual void getFinalResult(void* out, size_t outSize) = 0;
-	virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0;
+	virtual void getFinalResult(void* out) = 0;
+	virtual void hashAndFill(void* out, uint64_t (&fill_state)[8]) = 0;
 	virtual void setDataset(randomx_dataset* dataset) { }
 	virtual void setCache(randomx_cache* cache) { }
 	virtual void initScratchpad(void* seed) = 0;
@ -86,8 +86,8 @@ namespace randomx {
 		~VmBase() override;
 		void setScratchpad(uint8_t *scratchpad) override;
 		void initScratchpad(void* seed) override;
-		void getFinalResult(void* out, size_t outSize) override;
-		void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override;
+		void getFinalResult(void* out) override;
+		void hashAndFill(void* out, uint64_t (&fill_state)[8]) override;

 	protected:
 		void generateProgram(void* seed);
--- a/src/crypto/randomx/vm_compiled.cpp
+++ b/src/crypto/randomx/vm_compiled.cpp
@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "crypto/randomx/vm_compiled.hpp"
 #include "crypto/randomx/common.hpp"
+#include "base/tools/Profiler.h"

 namespace randomx {

@ -41,6 +42,8 @@ namespace randomx {

 	template<bool softAes>
 	void CompiledVm<softAes>::run(void* seed) {
+		PROFILE_SCOPE(RandomX_run);
+
 		compiler.prepare();
 		VmBase<softAes>::generateProgram(seed);
 		randomx_vm::initialize();
@ -51,6 +54,8 @@ namespace randomx {

 	template<bool softAes>
 	void CompiledVm<softAes>::execute() {
+		PROFILE_SCOPE(RandomX_JIT_execute);
+
 #ifdef XMRIG_ARM
 		memcpy(reg.f, config.eMask, sizeof(config.eMask));
 #endif
--- a/src/crypto/rx/RxConfig.cpp
+++ b/src/crypto/rx/RxConfig.cpp
@ -120,8 +120,8 @@ bool xmrig::RxConfig::read(const rapidjson::Value &value)
        }
 #       endif

-        const int mode = Json::getInt(value, kScratchpadPrefetchMode, static_cast<int>(m_scratchpadPrefetchMode));
-        if ((mode >= ScratchpadPrefetchOff) && (mode < ScratchpadPrefetchMax)) {
+        const uint32_t mode = static_cast<uint32_t>(Json::getInt(value, kScratchpadPrefetchMode, static_cast<int>(m_scratchpadPrefetchMode)));
+        if (mode < ScratchpadPrefetchMax) {
            m_scratchpadPrefetchMode = static_cast<ScratchpadPrefetchMode>(mode);
        }