From a05393727c980c333f60fc7a532533a1bdf5867c Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sat, 12 Sep 2020 23:07:52 +0200 Subject: [PATCH] RandomX: added performance profiler (for developers) Also optimized Blake2b SSE4.1 code size to avoid code cache pollution. --- CMakeLists.txt | 1 + src/base/base.cmake | 12 + src/base/io/log/Tags.cpp | 10 + src/base/io/log/Tags.h | 4 + src/base/tools/Profiler.cpp | 100 +++++ src/base/tools/Profiler.h | 132 ++++++ src/core/Miner.cpp | 43 ++ src/crypto/randomx/aes_hash.cpp | 3 + src/crypto/randomx/blake2/blake2.h | 2 +- .../randomx/blake2/blake2b-load-sse41.h | 402 ------------------ src/crypto/randomx/blake2/blake2b-round.h | 14 +- src/crypto/randomx/blake2/blake2b.c | 98 ++--- src/crypto/randomx/blake2_generator.cpp | 2 +- src/crypto/randomx/jit_compiler_x86.cpp | 3 + src/crypto/randomx/randomx.cpp | 18 +- src/crypto/randomx/virtual_machine.cpp | 10 +- src/crypto/randomx/virtual_machine.hpp | 8 +- src/crypto/randomx/vm_compiled.cpp | 5 + src/crypto/rx/RxConfig.cpp | 4 +- 19 files changed, 390 insertions(+), 481 deletions(-) create mode 100644 src/base/tools/Profiler.cpp create mode 100644 src/base/tools/Profiler.h delete mode 100644 src/crypto/randomx/blake2/blake2b-load-sse41.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dff77ee58..5bacc9697 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(WITH_NVML "Enable NVML (NVIDIA Management Library) support (on option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (only if OpenCL backend enabled)" ON) option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON) option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF) +option(WITH_PROFILING "Enable profiling for developers" OFF) option(BUILD_STATIC "Build static binary" OFF) option(ARM_TARGET "Force use specific ARM target 8 or 7" 0) diff --git a/src/base/base.cmake b/src/base/base.cmake index 0c82201f2..da53d5ea8 100644 --- a/src/base/base.cmake +++ b/src/base/base.cmake @@ -222,3 +222,15 @@ if (WITH_KAWPOW) src/base/net/stratum/EthStratumClient.cpp ) endif() + +if (WITH_PROFILING) + add_definitions(/DXMRIG_FEATURE_PROFILING) + + list(APPEND HEADERS_BASE + src/base/tools/Profiler.h + ) + + list(APPEND SOURCES_BASE + src/base/tools/Profiler.cpp + ) +endif() diff --git a/src/base/io/log/Tags.cpp b/src/base/io/log/Tags.cpp index 0b4f7a0a9..af36b0baa 100644 --- a/src/base/io/log/Tags.cpp +++ b/src/base/io/log/Tags.cpp @@ -101,3 +101,13 @@ const char *xmrig::Tags::opencl() return tag; } #endif + + +#ifdef XMRIG_FEATURE_PROFILING +const char* xmrig::Tags::profiler() +{ + static const char* tag = CYAN_BG_BOLD(WHITE_BOLD_S " profile "); + + return tag; +} +#endif diff --git a/src/base/io/log/Tags.h b/src/base/io/log/Tags.h index e6d470be2..072d7d414 100644 --- a/src/base/io/log/Tags.h +++ b/src/base/io/log/Tags.h @@ -53,6 +53,10 @@ public: # ifdef XMRIG_FEATURE_OPENCL static const char *opencl(); # endif + +# ifdef XMRIG_FEATURE_PROFILING + static const char* profiler(); +# endif }; diff --git a/src/base/tools/Profiler.cpp b/src/base/tools/Profiler.cpp new file mode 100644 index 000000000..f6f066f37 --- /dev/null +++ b/src/base/tools/Profiler.cpp @@ -0,0 +1,100 @@ +/* XMRig + * Copyright 2018-2020 SChernykh + * Copyright 2016-2020 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#include "base/tools/Profiler.h" +#include "base/io/log/Log.h" +#include "base/io/log/Tags.h" +#include +#include +#include +#include + + +#ifdef XMRIG_FEATURE_PROFILING + + +ProfileScopeData* ProfileScopeData::s_data[MAX_DATA_COUNT] = {}; +volatile long ProfileScopeData::s_dataCount = 0; +double ProfileScopeData::s_tscSpeed = 0.0; + + +#ifndef NOINLINE +#ifdef __GNUC__ +#define NOINLINE __attribute__ ((noinline)) +#elif _MSC_VER +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif +#endif + + +static std::string get_thread_id() +{ + std::stringstream ss; + ss << std::this_thread::get_id(); + + std::string s = ss.str(); + if (s.length() > ProfileScopeData::MAX_THREAD_ID_LENGTH) { + s.resize(ProfileScopeData::MAX_THREAD_ID_LENGTH); + } + + return s; +} + + +NOINLINE void ProfileScopeData::Register(ProfileScopeData* data) +{ +#ifdef _MSC_VER + const long id = _InterlockedIncrement(&s_dataCount) - 1; +#else + const long id = __sync_fetch_and_add(&s_dataCount, 1); +#endif + + if (static_cast(id) < MAX_DATA_COUNT) { + s_data[id] = data; + + const std::string s = get_thread_id(); + memcpy(data->m_threadId, s.c_str(), s.length() + 1); + } +} + + +NOINLINE void ProfileScopeData::Init() +{ + using namespace std::chrono; + + const uint64_t t1 = static_cast(time_point_cast(high_resolution_clock::now()).time_since_epoch().count()); + const uint64_t count1 = ReadTSC(); + + for (;;) + { + const uint64_t t2 = static_cast(time_point_cast(high_resolution_clock::now()).time_since_epoch().count()); + const uint64_t count2 = ReadTSC(); + + if (t2 - t1 > 1000000000) { + s_tscSpeed = (count2 - count1) * 1e9 / (t2 - t1); + LOG_INFO("%s TSC speed = %.3f GHz", xmrig::Tags::profiler(), s_tscSpeed / 1e9); + return; + } + } +} + + +#endif /* XMRIG_FEATURE_PROFILING */ diff --git a/src/base/tools/Profiler.h b/src/base/tools/Profiler.h new file mode 100644 index 000000000..c74277151 --- /dev/null +++ b/src/base/tools/Profiler.h @@ -0,0 +1,132 @@ +/* XMRig + * Copyright 2018-2020 SChernykh + * Copyright 2016-2020 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef XMRIG_PROFILER_H +#define XMRIG_PROFILER_H + + +#ifndef FORCE_INLINE +#if defined(_MSC_VER) +#define FORCE_INLINE __forceinline +#elif defined(__GNUC__) +#define FORCE_INLINE __attribute__((always_inline)) inline +#elif defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif +#endif + + +#ifdef XMRIG_FEATURE_PROFILING + + +#include +#include + +#if defined(_MSC_VER) +#include +#endif + + +static FORCE_INLINE uint64_t ReadTSC() +{ +#ifdef _MSC_VER + return __rdtsc(); +#else + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return (((uint64_t)hi) << 32) | lo; +#endif +} + + +struct ProfileScopeData +{ + const char* m_name; + uint64_t m_totalCycles; + uint32_t m_totalSamples; + + enum + { + MAX_THREAD_ID_LENGTH = 11, + MAX_SAMPLE_COUNT = 128, + MAX_DATA_COUNT = 1024 + }; + + char m_threadId[MAX_THREAD_ID_LENGTH + 1]; + + static ProfileScopeData* s_data[MAX_DATA_COUNT]; + static volatile long s_dataCount; + static double s_tscSpeed; + + static void Register(ProfileScopeData* data); + static void Init(); +}; + +static_assert(std::is_trivial::value, "ProfileScopeData must be a trivial struct"); +static_assert(sizeof(ProfileScopeData) <= 32, "ProfileScopeData struct is too big"); + + +class ProfileScope +{ +public: + FORCE_INLINE ProfileScope(ProfileScopeData& data) + : m_data(data) + { + if (m_data.m_totalCycles == 0) { + ProfileScopeData::Register(&data); + } + + m_startCounter = ReadTSC(); + } + + FORCE_INLINE ~ProfileScope() + { + m_data.m_totalCycles += ReadTSC() - m_startCounter; + ++m_data.m_totalSamples; + } + +private: + ProfileScopeData& m_data; + uint64_t m_startCounter; +}; + + +#define PROFILE_SCOPE(x) static thread_local ProfileScopeData x##_data{#x}; ProfileScope x(x##_data); + + +#else /* XMRIG_FEATURE_PROFILING */ +#define PROFILE_SCOPE(x) +#endif /* XMRIG_FEATURE_PROFILING */ + + +#include "crypto/randomx/blake2/blake2.h" + + +struct rx_blake2b_wrapper +{ + FORCE_INLINE static void run(void* out, size_t outlen, const void* in, size_t inlen) + { + PROFILE_SCOPE(RandomX_Blake2b); + rx_blake2b(out, outlen, in, inlen); + } +}; + + +#endif /* XMRIG_PROFILER_H */ diff --git a/src/core/Miner.cpp b/src/core/Miner.cpp index 1a8c29f99..12be05ecc 100644 --- a/src/core/Miner.cpp +++ b/src/core/Miner.cpp @@ -38,6 +38,7 @@ #include "base/kernel/Platform.h" #include "base/net/stratum/Job.h" #include "base/tools/Object.h" +#include "base/tools/Profiler.h" #include "base/tools/Timer.h" #include "core/config/Config.h" #include "core/Controller.h" @@ -267,6 +268,44 @@ public: h = "MH/s"; } +# ifdef XMRIG_FEATURE_PROFILING + ProfileScopeData* data[ProfileScopeData::MAX_DATA_COUNT]; + + const uint32_t n = std::min(ProfileScopeData::s_dataCount, ProfileScopeData::MAX_DATA_COUNT); + memcpy(data, ProfileScopeData::s_data, n * sizeof(ProfileScopeData*)); + + std::sort(data, data + n, [](ProfileScopeData* a, ProfileScopeData* b) { + return strcmp(a->m_threadId, b->m_threadId) < 0; + }); + + for (uint32_t i = 0; i < n;) + { + uint32_t n1 = i; + while ((n1 < n) && (strcmp(data[i]->m_threadId, data[n1]->m_threadId) == 0)) { + ++n1; + } + + std::sort(data + i, data + n1, [](ProfileScopeData* a, ProfileScopeData* b) { + return a->m_totalCycles > b->m_totalCycles; + }); + + for (uint32_t j = i; j < n1; ++j) { + ProfileScopeData* p = data[j]; + LOG_INFO("%s Thread %6s | %-30s | %7.3f%% | %9.0f ns", + Tags::profiler(), + p->m_threadId, + p->m_name, + p->m_totalCycles * 100.0 / data[i]->m_totalCycles, + p->m_totalCycles / p->m_totalSamples * 1e9 / ProfileScopeData::s_tscSpeed + ); + } + + LOG_INFO("%s --------------|--------------------------------|----------|-------------", Tags::profiler()); + + i = n1; + } +# endif + LOG_INFO("%s " WHITE_BOLD("speed") " 10s/60s/15m " CYAN_BOLD("%s") CYAN(" %s %s ") CYAN_BOLD("%s") " max " CYAN_BOLD("%s %s"), Tags::miner(), Hashrate::format(speed[0] * scale, num, sizeof(num) / 4), @@ -311,6 +350,10 @@ xmrig::Miner::Miner(Controller *controller) Platform::setThreadPriority(std::min(priority + 1, 5)); } +# ifdef XMRIG_FEATURE_PROFILING + ProfileScopeData::Init(); +# endif + # ifdef XMRIG_ALGO_RANDOMX Rx::init(this); # endif diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 1898a2c55..571b4ca73 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/soft_aes.h" #include "crypto/randomx/randomx.h" +#include "base/tools/Profiler.h" #define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d #define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e @@ -215,6 +216,8 @@ template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + PROFILE_SCOPE(RandomX_AES); + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; diff --git a/src/crypto/randomx/blake2/blake2.h b/src/crypto/randomx/blake2/blake2.h index 4d364c36c..52f05b396 100644 --- a/src/crypto/randomx/blake2/blake2.h +++ b/src/crypto/randomx/blake2/blake2.h @@ -92,7 +92,7 @@ extern "C" { int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen); /* Simple API */ - int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen); + int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen); /* Argon2 Team - Begin Code */ int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); diff --git a/src/crypto/randomx/blake2/blake2b-load-sse41.h b/src/crypto/randomx/blake2/blake2b-load-sse41.h deleted file mode 100644 index 0eca86599..000000000 --- a/src/crypto/randomx/blake2/blake2b-load-sse41.h +++ /dev/null @@ -1,402 +0,0 @@ -/* - BLAKE2 reference source code package - optimized C implementations - - Copyright 2012, Samuel Neves . You may use this under the - terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at - your option. The terms of these licenses can be found at: - - - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - - OpenSSL license : https://www.openssl.org/source/license.html - - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - - More information about the BLAKE2 hash function can be found at - https://blake2.net. -*/ -#ifndef BLAKE2B_LOAD_SSE41_H -#define BLAKE2B_LOAD_SSE41_H - -#define LOAD_MSG_0_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m1); \ -b1 = _mm_unpacklo_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_0_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m0, m1); \ -b1 = _mm_unpackhi_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_0_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m4, m5); \ -b1 = _mm_unpacklo_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_0_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m5); \ -b1 = _mm_unpackhi_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_1_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m7, m2); \ -b1 = _mm_unpackhi_epi64(m4, m6); \ -} while(0) - - -#define LOAD_MSG_1_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m5, m4); \ -b1 = _mm_alignr_epi8(m3, m7, 8); \ -} while(0) - - -#define LOAD_MSG_1_3(b0, b1) \ -do \ -{ \ -b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -b1 = _mm_unpackhi_epi64(m5, m2); \ -} while(0) - - -#define LOAD_MSG_1_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m6, m1); \ -b1 = _mm_unpackhi_epi64(m3, m1); \ -} while(0) - - -#define LOAD_MSG_2_1(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m6, m5, 8); \ -b1 = _mm_unpackhi_epi64(m2, m7); \ -} while(0) - - -#define LOAD_MSG_2_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m4, m0); \ -b1 = _mm_blend_epi16(m1, m6, 0xF0); \ -} while(0) - - -#define LOAD_MSG_2_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m5, m1, 0xF0); \ -b1 = _mm_unpackhi_epi64(m3, m4); \ -} while(0) - - -#define LOAD_MSG_2_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m7, m3); \ -b1 = _mm_alignr_epi8(m2, m0, 8); \ -} while(0) - - -#define LOAD_MSG_3_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m3, m1); \ -b1 = _mm_unpackhi_epi64(m6, m5); \ -} while(0) - - -#define LOAD_MSG_3_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m0); \ -b1 = _mm_unpacklo_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_3_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m1, m2, 0xF0); \ -b1 = _mm_blend_epi16(m2, m7, 0xF0); \ -} while(0) - - -#define LOAD_MSG_3_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m3, m5); \ -b1 = _mm_unpacklo_epi64(m0, m4); \ -} while(0) - - -#define LOAD_MSG_4_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m2); \ -b1 = _mm_unpacklo_epi64(m1, m5); \ -} while(0) - - -#define LOAD_MSG_4_2(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m0, m3, 0xF0); \ -b1 = _mm_blend_epi16(m2, m7, 0xF0); \ -} while(0) - - -#define LOAD_MSG_4_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m7, m5, 0xF0); \ -b1 = _mm_blend_epi16(m3, m1, 0xF0); \ -} while(0) - - -#define LOAD_MSG_4_4(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m6, m0, 8); \ -b1 = _mm_blend_epi16(m4, m6, 0xF0); \ -} while(0) - - -#define LOAD_MSG_5_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m1, m3); \ -b1 = _mm_unpacklo_epi64(m0, m4); \ -} while(0) - - -#define LOAD_MSG_5_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m6, m5); \ -b1 = _mm_unpackhi_epi64(m5, m1); \ -} while(0) - - -#define LOAD_MSG_5_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m2, m3, 0xF0); \ -b1 = _mm_unpackhi_epi64(m7, m0); \ -} while(0) - - -#define LOAD_MSG_5_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m6, m2); \ -b1 = _mm_blend_epi16(m7, m4, 0xF0); \ -} while(0) - - -#define LOAD_MSG_6_1(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m6, m0, 0xF0); \ -b1 = _mm_unpacklo_epi64(m7, m2); \ -} while(0) - - -#define LOAD_MSG_6_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m2, m7); \ -b1 = _mm_alignr_epi8(m5, m6, 8); \ -} while(0) - - -#define LOAD_MSG_6_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m3); \ -b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ -} while(0) - - -#define LOAD_MSG_6_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m3, m1); \ -b1 = _mm_blend_epi16(m1, m5, 0xF0); \ -} while(0) - - -#define LOAD_MSG_7_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m6, m3); \ -b1 = _mm_blend_epi16(m6, m1, 0xF0); \ -} while(0) - - -#define LOAD_MSG_7_2(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m7, m5, 8); \ -b1 = _mm_unpackhi_epi64(m0, m4); \ -} while(0) - - -#define LOAD_MSG_7_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m2, m7); \ -b1 = _mm_unpacklo_epi64(m4, m1); \ -} while(0) - - -#define LOAD_MSG_7_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m2); \ -b1 = _mm_unpacklo_epi64(m3, m5); \ -} while(0) - - -#define LOAD_MSG_8_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m3, m7); \ -b1 = _mm_alignr_epi8(m0, m5, 8); \ -} while(0) - - -#define LOAD_MSG_8_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m7, m4); \ -b1 = _mm_alignr_epi8(m4, m1, 8); \ -} while(0) - - -#define LOAD_MSG_8_3(b0, b1) \ -do \ -{ \ -b0 = m6; \ -b1 = _mm_alignr_epi8(m5, m0, 8); \ -} while(0) - - -#define LOAD_MSG_8_4(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m1, m3, 0xF0); \ -b1 = m2; \ -} while(0) - - -#define LOAD_MSG_9_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m5, m4); \ -b1 = _mm_unpackhi_epi64(m3, m0); \ -} while(0) - - -#define LOAD_MSG_9_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m1, m2); \ -b1 = _mm_blend_epi16(m3, m2, 0xF0); \ -} while(0) - - -#define LOAD_MSG_9_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m7, m4); \ -b1 = _mm_unpackhi_epi64(m1, m6); \ -} while(0) - - -#define LOAD_MSG_9_4(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m7, m5, 8); \ -b1 = _mm_unpacklo_epi64(m6, m0); \ -} while(0) - - -#define LOAD_MSG_10_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m1); \ -b1 = _mm_unpacklo_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_10_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m0, m1); \ -b1 = _mm_unpackhi_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_10_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m4, m5); \ -b1 = _mm_unpacklo_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_10_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m5); \ -b1 = _mm_unpackhi_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_11_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m7, m2); \ -b1 = _mm_unpackhi_epi64(m4, m6); \ -} while(0) - - -#define LOAD_MSG_11_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m5, m4); \ -b1 = _mm_alignr_epi8(m3, m7, 8); \ -} while(0) - - -#define LOAD_MSG_11_3(b0, b1) \ -do \ -{ \ -b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -b1 = _mm_unpackhi_epi64(m5, m2); \ -} while(0) - - -#define LOAD_MSG_11_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m6, m1); \ -b1 = _mm_unpackhi_epi64(m3, m1); \ -} while(0) - - -#endif diff --git a/src/crypto/randomx/blake2/blake2b-round.h b/src/crypto/randomx/blake2/blake2b-round.h index 1edc2cc4c..bf4f1ffed 100644 --- a/src/crypto/randomx/blake2/blake2b-round.h +++ b/src/crypto/randomx/blake2/blake2b-round.h @@ -102,17 +102,21 @@ row4l = t1; \ row4h = t0; -#include "blake2b-load-sse41.h" +#define LOAD_MSG(r, i, b0, b1) \ +do { \ + b0 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 1]], m[blake2b_sigma_sse41[r][i * 4 + 0]]); \ + b1 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 3]], m[blake2b_sigma_sse41[r][i * 4 + 2]]); \ +} while(0) #define ROUND(r) \ - LOAD_MSG_ ##r ##_1(b0, b1); \ + LOAD_MSG(r, 0, b0, b1); \ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_2(b0, b1); \ + LOAD_MSG(r, 1, b0, b1); \ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - LOAD_MSG_ ##r ##_3(b0, b1); \ + LOAD_MSG(r, 2, b0, b1); \ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_4(b0, b1); \ + LOAD_MSG(r, 3, b0, b1); \ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 6a0889cbb..7a1b9daeb 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -56,6 +56,23 @@ static const uint64_t blake2b_IV[8] = { UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; +#if defined(_M_X64) || defined(__x86_64__) +static const uint8_t blake2b_sigma_sse41[12][16] = { + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, + {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, + {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, + {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, + {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, + {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, + {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, + {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, + {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}, + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, +}; +#endif + static const uint8_t blake2b_sigma[12][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -203,15 +220,6 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - const __m128i m0 = LOADU(block + 00); - const __m128i m1 = LOADU(block + 16); - const __m128i m2 = LOADU(block + 32); - const __m128i m3 = LOADU(block + 48); - const __m128i m4 = LOADU(block + 64); - const __m128i m5 = LOADU(block + 80); - const __m128i m6 = LOADU(block + 96); - const __m128i m7 = LOADU(block + 112); - row1l = LOADU(&S->h[0]); row1h = LOADU(&S->h[2]); row2l = LOADU(&S->h[4]); @@ -221,18 +229,11 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); + const uint64_t* m = (const uint64_t*)(block); + + for (uint32_t r = 0; r < 12; ++r) { + ROUND(r); + } row1l = _mm_xor_si128(row3l, row1l); row1h = _mm_xor_si128(row3h, row1h); @@ -388,8 +389,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) { return 0; } -int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, - const void *key, size_t keylen) { +int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen) { blake2b_state S; int ret = -1; @@ -402,25 +402,14 @@ int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, goto fail; } - if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) { + if (rx_blake2b_init(&S, outlen) < 0) { goto fail; } - if (keylen > 0) { - if (rx_blake2b_init_key(&S, outlen, key, keylen) < 0) { - goto fail; - } - } - else { - if (rx_blake2b_init(&S, outlen) < 0) { - goto fail; - } - } - - if (rx_blake2b_update(&S, in, inlen) < 0) { + if (rx_blake2b_update(&S, in, inlen) < 0) { goto fail; } - ret = rx_blake2b_final(&S, out, outlen); + ret = rx_blake2b_final(&S, out, outlen); fail: //clear_internal_memory(&S, sizeof(S)); @@ -442,43 +431,42 @@ int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { store32(outlen_bytes, (uint32_t)outlen); #define TRY(statement) \ - do { \ - ret = statement; \ - if (ret < 0) { \ - goto fail; \ - } \ - } while ((void)0, 0) + do { \ + ret = statement; \ + if (ret < 0) { \ + goto fail; \ + } \ + } while ((void)0, 0) if (outlen <= BLAKE2B_OUTBYTES) { - TRY(rx_blake2b_init(&blake_state, outlen)); - TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); - TRY(rx_blake2b_update(&blake_state, in, inlen)); - TRY(rx_blake2b_final(&blake_state, out, outlen)); + TRY(rx_blake2b_init(&blake_state, outlen)); + TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(rx_blake2b_update(&blake_state, in, inlen)); + TRY(rx_blake2b_final(&blake_state, out, outlen)); } else { uint32_t toproduce; uint8_t out_buffer[BLAKE2B_OUTBYTES]; uint8_t in_buffer[BLAKE2B_OUTBYTES]; - TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); - TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); - TRY(rx_blake2b_update(&blake_state, in, inlen)); - TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); + TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); + TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(rx_blake2b_update(&blake_state, in, inlen)); + TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); out += BLAKE2B_OUTBYTES / 2; toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; while (toproduce > BLAKE2B_OUTBYTES) { memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); - TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, - BLAKE2B_OUTBYTES, NULL, 0)); + TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, + BLAKE2B_OUTBYTES)); memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); out += BLAKE2B_OUTBYTES / 2; toproduce -= BLAKE2B_OUTBYTES / 2; } memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); - TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL, - 0)); + TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES)); memcpy(out, out_buffer, toproduce); } fail: diff --git a/src/crypto/randomx/blake2_generator.cpp b/src/crypto/randomx/blake2_generator.cpp index edfe2e34c..ef3894d88 100644 --- a/src/crypto/randomx/blake2_generator.cpp +++ b/src/crypto/randomx/blake2_generator.cpp @@ -55,7 +55,7 @@ namespace randomx { void Blake2Generator::checkData(const size_t bytesNeeded) { if (dataIndex + bytesNeeded > sizeof(data)) { - rx_blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + rx_blake2b(data, sizeof(data), data, sizeof(data)); dataIndex = 0; } } diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 7aae54fd4..09746b901 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/program.hpp" #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/virtual_memory.hpp" +#include "base/tools/Profiler.h" #ifdef XMRIG_FIX_RYZEN # include "crypto/rx/Rx.h" @@ -255,6 +256,8 @@ namespace randomx { } void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) { + PROFILE_SCOPE(RandomX_JIT_compile); + vm_flags = flags; generateProgramPrologue(prog, pcfg); diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 8a6053638..89d319de2 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "base/tools/Profiler.h" + RandomX_ConfigurationWownero::RandomX_ConfigurationWownero() { ArgonSalt = "RandomWOW\x01"; @@ -574,33 +576,35 @@ extern "C" { assert(inputSize == 0 || input != nullptr); assert(output != nullptr); alignas(16) uint64_t tempHash[8]; - rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize); machine->initScratchpad(&tempHash); machine->resetRoundingMode(); for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { machine->run(&tempHash); - rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile)); } machine->run(&tempHash); - machine->getFinalResult(output, RANDOMX_HASH_SIZE); + machine->getFinalResult(output); } void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) { - rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize); machine->initScratchpad(tempHash); } void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) { + PROFILE_SCOPE(RandomX_hash); + machine->resetRoundingMode(); for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { machine->run(&tempHash); - rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile)); } machine->run(&tempHash); // Finish current hash and fill the scratchpad for the next hash at the same time - rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0); - machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), nextInput, nextInputSize); + machine->hashAndFill(output, tempHash); } } diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp index ecd187e2f..f00213a30 100644 --- a/src/crypto/randomx/virtual_machine.cpp +++ b/src/crypto/randomx/virtual_machine.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/blake2/blake2.h" #include "crypto/randomx/intrin_portable.h" #include "crypto/randomx/allocator.hpp" +#include "base/tools/Profiler.h" randomx_vm::~randomx_vm() { @@ -109,15 +110,15 @@ namespace randomx { } template - void VmBase::getFinalResult(void* out, size_t outSize) { + void VmBase::getFinalResult(void* out) { hashAes1Rx4(scratchpad, ScratchpadSize, ®.a); - rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); } template - void VmBase::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) { + void VmBase::hashAndFill(void* out, uint64_t (&fill_state)[8]) { hashAndFillAes1Rx4(scratchpad, ScratchpadSize, ®.a, fill_state); - rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); } template @@ -127,6 +128,7 @@ namespace randomx { template void VmBase::generateProgram(void* seed) { + PROFILE_SCOPE(RandomX_generate_program); fillAes4Rx4(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program); } diff --git a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp index 3fdd86df4..a60e693ae 100644 --- a/src/crypto/randomx/virtual_machine.hpp +++ b/src/crypto/randomx/virtual_machine.hpp @@ -38,8 +38,8 @@ class randomx_vm public: virtual ~randomx_vm() = 0; virtual void setScratchpad(uint8_t *scratchpad) = 0; - virtual void getFinalResult(void* out, size_t outSize) = 0; - virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0; + virtual void getFinalResult(void* out) = 0; + virtual void hashAndFill(void* out, uint64_t (&fill_state)[8]) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -86,8 +86,8 @@ namespace randomx { ~VmBase() override; void setScratchpad(uint8_t *scratchpad) override; void initScratchpad(void* seed) override; - void getFinalResult(void* out, size_t outSize) override; - void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override; + void getFinalResult(void* out) override; + void hashAndFill(void* out, uint64_t (&fill_state)[8]) override; protected: void generateProgram(void* seed); diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp index 501bb8c70..a61797e85 100644 --- a/src/crypto/randomx/vm_compiled.cpp +++ b/src/crypto/randomx/vm_compiled.cpp @@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/vm_compiled.hpp" #include "crypto/randomx/common.hpp" +#include "base/tools/Profiler.h" namespace randomx { @@ -41,6 +42,8 @@ namespace randomx { template void CompiledVm::run(void* seed) { + PROFILE_SCOPE(RandomX_run); + compiler.prepare(); VmBase::generateProgram(seed); randomx_vm::initialize(); @@ -51,6 +54,8 @@ namespace randomx { template void CompiledVm::execute() { + PROFILE_SCOPE(RandomX_JIT_execute); + #ifdef XMRIG_ARM memcpy(reg.f, config.eMask, sizeof(config.eMask)); #endif diff --git a/src/crypto/rx/RxConfig.cpp b/src/crypto/rx/RxConfig.cpp index 55dae35d6..d480d17b9 100644 --- a/src/crypto/rx/RxConfig.cpp +++ b/src/crypto/rx/RxConfig.cpp @@ -120,8 +120,8 @@ bool xmrig::RxConfig::read(const rapidjson::Value &value) } # endif - const int mode = Json::getInt(value, kScratchpadPrefetchMode, static_cast(m_scratchpadPrefetchMode)); - if ((mode >= ScratchpadPrefetchOff) && (mode < ScratchpadPrefetchMax)) { + const uint32_t mode = static_cast(Json::getInt(value, kScratchpadPrefetchMode, static_cast(m_scratchpadPrefetchMode))); + if (mode < ScratchpadPrefetchMax) { m_scratchpadPrefetchMode = static_cast(mode); }