From a5b6383f7baa02dcec55d9807bf5ee1b124f92fc Mon Sep 17 00:00:00 2001 From: XMRig Date: Fri, 28 Aug 2020 23:50:16 +0700 Subject: [PATCH 01/13] v6.3.4 --- src/version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.h b/src/version.h index 87b5e7225..243377299 100644 --- a/src/version.h +++ b/src/version.h @@ -28,7 +28,7 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "XMRig miner" -#define APP_VERSION "6.3.3" +#define APP_VERSION "6.3.4-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com" @@ -36,7 +36,7 @@ #define APP_VER_MAJOR 6 #define APP_VER_MINOR 3 -#define APP_VER_PATCH 3 +#define APP_VER_PATCH 4 #ifdef _MSC_VER # if (_MSC_VER >= 1920) From a84b45b1bb46b829e2caeeb14ae2897e5cd85fec Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 4 Sep 2020 16:16:07 +0200 Subject: [PATCH 02/13] RandomX: added parameter for scratchpad prefetch mode `scratchpad_prefetch_mode` can have 4 values: 0: off 1: use `prefetcht0` instruction (default, same as previous XMRig versions) 2: use `prefetchnta` instruction (faster on Coffee Lake and a few other CPUs) 3: use `mov` instruction --- src/config.json | 3 ++- src/core/config/Config_default.h | 7 +++++- src/crypto/randomx/randomx.cpp | 37 ++++++++++++++++++++++++++++++++ src/crypto/randomx/randomx.h | 2 ++ src/crypto/rx/Rx.cpp | 3 +++ src/crypto/rx/RxConfig.cpp | 9 ++++++++ src/crypto/rx/RxConfig.h | 12 +++++++++++ 7 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/config.json b/src/config.json index f3f282679..c2bac8614 100644 --- a/src/config.json +++ b/src/config.json @@ -21,7 +21,8 @@ "rdmsr": true, "wrmsr": true, "cache_qos": false, - "numa": true + "numa": true, + "scratchpad_prefetch_mode": 1 }, "cpu": { "enabled": true, diff --git a/src/core/config/Config_default.h b/src/core/config/Config_default.h index dd01fa12b..7e4ed2707 100644 --- a/src/core/config/Config_default.h +++ b/src/core/config/Config_default.h @@ -51,7 +51,12 @@ R"===( "randomx": { "init": -1, "mode": "auto", - "numa": true + "1gb-pages": false, + "rdmsr": true, + "wrmsr": true, + "cache_qos": false, + "numa": true, + "scratchpad_prefetch_mode": 1 }, "cpu": { "enabled": true, diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index fdd7bd067..8a6053638 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -211,6 +211,13 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; } #endif +static int scratchpadPrefetchMode = 1; + +void randomx_set_scratchpad_prefetch_mode(int mode) +{ + scratchpadPrefetchMode = mode; +} + void RandomX_ConfigurationBase::Apply() { const uint32_t ScratchpadL1Mask_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) - 1) * 8; @@ -240,6 +247,36 @@ void RandomX_ConfigurationBase::Apply() *(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated; *(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated; + // Apply scratchpad prefetch mode + { + uint32_t* a = (uint32_t*)(codePrefetchScratchpadTweaked + 8); + uint32_t* b = (uint32_t*)(codePrefetchScratchpadTweaked + 22); + + switch (scratchpadPrefetchMode) + { + case 0: + *a = 0x00401F0FUL; // 4-byte nop + *b = 0x00401F0FUL; // 4-byte nop + break; + + case 1: + default: + *a = 0x060C180FUL; // prefetcht0 [rsi+rax] + *b = 0x160C180FUL; // prefetcht0 [rsi+rdx] + break; + + case 2: + *a = 0x0604180FUL; // prefetchnta [rsi+rax] + *b = 0x1604180FUL; // prefetchnta [rsi+rdx] + break; + + case 3: + *a = 0x060C8B48UL; // mov rcx, [rsi+rax] + *b = 0x160C8B48UL; // mov rcx, [rsi+rdx] + break; + } + } + #define JIT_HANDLE(x, prev) randomx::JitCompilerX86::engine[k] = &randomx::JitCompilerX86::h_##x #elif defined(XMRIG_ARMv8) diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index ce3f6ebb3..3379e2242 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -200,6 +200,8 @@ void randomx_apply_config(const T& config) RandomX_CurrentConfig.Apply(); } +void randomx_set_scratchpad_prefetch_mode(int mode); + #if defined(__cplusplus) extern "C" { #endif diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp index 4c98d85ad..a6a1f5c95 100644 --- a/src/crypto/rx/Rx.cpp +++ b/src/crypto/rx/Rx.cpp @@ -32,6 +32,7 @@ #include "base/io/log/Log.h" #include "crypto/rx/RxConfig.h" #include "crypto/rx/RxQueue.h" +#include "crypto/randomx/randomx.h" namespace xmrig { @@ -99,6 +100,8 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu return true; } + randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode()); + if (isReady(seed)) { return true; } diff --git a/src/crypto/rx/RxConfig.cpp b/src/crypto/rx/RxConfig.cpp index f1cd09e97..55dae35d6 100644 --- a/src/crypto/rx/RxConfig.cpp +++ b/src/crypto/rx/RxConfig.cpp @@ -57,6 +57,8 @@ static const char *kCacheQoS = "cache_qos"; static const char *kNUMA = "numa"; #endif +static const char *kScratchpadPrefetchMode = "scratchpad_prefetch_mode"; + static const std::array modeNames = { "auto", "fast", "light" }; @@ -118,6 +120,11 @@ bool xmrig::RxConfig::read(const rapidjson::Value &value) } # endif + const int mode = Json::getInt(value, kScratchpadPrefetchMode, static_cast(m_scratchpadPrefetchMode)); + if ((mode >= ScratchpadPrefetchOff) && (mode < ScratchpadPrefetchMax)) { + m_scratchpadPrefetchMode = static_cast(mode); + } + return true; } @@ -171,6 +178,8 @@ rapidjson::Value xmrig::RxConfig::toJSON(rapidjson::Document &doc) const } # endif + obj.AddMember(StringRef(kScratchpadPrefetchMode), static_cast(m_scratchpadPrefetchMode), allocator); + return obj; } diff --git a/src/crypto/rx/RxConfig.h b/src/crypto/rx/RxConfig.h index e3e06326e..c8d715d07 100644 --- a/src/crypto/rx/RxConfig.h +++ b/src/crypto/rx/RxConfig.h @@ -50,6 +50,14 @@ public: ModeMax }; + enum ScratchpadPrefetchMode : uint32_t { + ScratchpadPrefetchOff, + ScratchpadPrefetchT0, + ScratchpadPrefetchNTA, + ScratchpadPrefetchMov, + ScratchpadPrefetchMax, + }; + bool read(const rapidjson::Value &value); rapidjson::Value toJSON(rapidjson::Document &doc) const; @@ -68,6 +76,8 @@ public: inline bool cacheQoS() const { return m_cacheQoS; } inline Mode mode() const { return m_mode; } + inline ScratchpadPrefetchMode scratchpadPrefetchMode() const { return m_scratchpadPrefetchMode; } + # ifdef XMRIG_FEATURE_MSR const char *msrPresetName() const; const MsrItems &msrPreset() const; @@ -94,6 +104,8 @@ private: int m_threads = -1; Mode m_mode = AutoMode; + ScratchpadPrefetchMode m_scratchpadPrefetchMode = ScratchpadPrefetchT0; + # ifdef XMRIG_FEATURE_HWLOC std::vector m_nodeset; # endif From b826985d05cbffd252c38c137298c8d7bc7e135d Mon Sep 17 00:00:00 2001 From: cohcho Date: Sat, 5 Sep 2020 08:46:56 +0000 Subject: [PATCH 03/13] nonce iteration optimization efficient and correct nonce iteration without duplicates --- src/backend/common/WorkerJob.h | 44 +++++++------------------- src/crypto/common/Nonce.cpp | 57 ++++++++++++++-------------------- src/crypto/common/Nonce.h | 7 +++-- 3 files changed, 39 insertions(+), 69 deletions(-) diff --git a/src/backend/common/WorkerJob.h b/src/backend/common/WorkerJob.h index 2f01ea3d9..62b8adee8 100644 --- a/src/backend/common/WorkerJob.h +++ b/src/backend/common/WorkerJob.h @@ -66,14 +66,12 @@ public: inline bool nextRound(uint32_t rounds, uint32_t roundSize) { - bool ok = true; m_rounds[index()]++; if ((m_rounds[index()] % rounds) == 0) { for (size_t i = 0; i < N; ++i) { - *nonce(i) = Nonce::next(index(), *nonce(i), rounds * roundSize, currentJob().isNicehash(), &ok); - if (!ok) { - break; + if (!Nonce::next(index(), nonce(i), rounds * roundSize, currentJob().isNicehash(), nonceSize())) { + return false; } } } @@ -83,7 +81,7 @@ public: } } - return ok; + return true; } @@ -102,7 +100,7 @@ private: for (size_t i = 0; i < N; ++i) { memcpy(m_blobs[index()] + (i * size), job.blob(), size); - *nonce(i) = Nonce::next(index(), *nonce(i), reserveCount, job.isNicehash()); + Nonce::next(index(), nonce(i), reserveCount, job.isNicehash(), nonceSize()); } } @@ -125,41 +123,23 @@ inline uint32_t *xmrig::WorkerJob<1>::nonce(size_t) template<> inline bool xmrig::WorkerJob<1>::nextRound(uint32_t rounds, uint32_t roundSize) { - bool ok = true; m_rounds[index()]++; uint32_t* n = nonce(); - const uint32_t prev_nonce = *n; if ((m_rounds[index()] % rounds) == 0) { - *n = Nonce::next(index(), *n, rounds * roundSize, currentJob().isNicehash(), &ok); + if (!Nonce::next(index(), n, rounds * roundSize, currentJob().isNicehash(), nonceSize())) { + return false; + } + if (nonceSize() == sizeof(uint64_t)) { + m_jobs[index()].nonce()[1] = n[1]; + } } else { *n += roundSize; } - // Increment higher 32 bits of a 64-bit nonce when lower 32 bits overflow - if (!currentJob().isNicehash() && (nonceSize() == sizeof(uint64_t))) { - const bool wrapped = (*n < prev_nonce); - const bool wraps_this_round = (static_cast(*n) + roundSize > (1ULL << 32)); - - // Account for the case when starting nonce hasn't wrapped yet, but some nonces in the current round will wrap - if (wrapped || wraps_this_round) { - // Set lower 32 bits to 0 when higher 32 bits change - Nonce::reset(index()); - - // Sets *n to 0 and Nonce::m_nonce[index] to the correct next value - *n = 0; - Nonce::next(index(), *n, rounds * roundSize, currentJob().isNicehash(), &ok); - - ++n[1]; - - Job& job = m_jobs[index()]; - memcpy(job.blob(), blob(), job.size()); - } - } - - return ok; + return true; } @@ -173,7 +153,7 @@ inline void xmrig::WorkerJob<1>::save(const Job &job, uint32_t reserveCount, Non m_jobs[index()].setBackend(backend); memcpy(blob(), job.blob(), job.size()); - *nonce() = Nonce::next(index(), *nonce(), reserveCount, currentJob().isNicehash()); + Nonce::next(index(), nonce(), reserveCount, currentJob().isNicehash(), nonceSize()); } diff --git a/src/crypto/common/Nonce.cpp b/src/crypto/common/Nonce.cpp index f896c5612..133cef97d 100644 --- a/src/crypto/common/Nonce.cpp +++ b/src/crypto/common/Nonce.cpp @@ -26,18 +26,14 @@ #include "crypto/common/Nonce.h" -#include - - namespace xmrig { std::atomic Nonce::m_paused; std::atomic Nonce::m_sequence[Nonce::MAX]; -uint32_t Nonce::m_nonces[2] = { 0, 0 }; +std::atomic Nonce::m_nonces[2] = { {0}, {0} }; -static std::mutex mutex; static Nonce nonce; @@ -54,40 +50,33 @@ xmrig::Nonce::Nonce() } -uint32_t xmrig::Nonce::next(uint8_t index, uint32_t nonce, uint32_t reserveCount, bool nicehash, bool *ok) +bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, bool nicehash, size_t nonceSize) { - uint32_t next; - - std::lock_guard lock(mutex); - - if (nicehash) { - if ((m_nonces[index] + reserveCount) > 0x1000000) { - if (ok) { - *ok = false; - } + const uint64_t mask = nicehash ? 0xFFFFFFULL : (nonceSize == sizeof(uint64_t) ? 0x7FFFFFFFFFFFFFFFULL : 0xFFFFFFFFULL); + if (reserveCount == 0 || mask < reserveCount - 1) { + return false; + } + uint64_t counter = m_nonces[index].fetch_add(reserveCount, std::memory_order_relaxed); + while (true) { + if (mask < counter) { + return false; + } else if (mask - counter <= reserveCount - 1) { pause(true); - - return 0; + if (mask - counter < reserveCount - 1) { + return false; + } } - - next = (nonce & 0xFF000000) | m_nonces[index]; + else if (0xFFFFFFFFUL - (uint32_t)counter < reserveCount - 1) { + counter = m_nonces[index].fetch_add(reserveCount, std::memory_order_relaxed); + continue; + } + *nonce = (nonce[0] & ~mask) | counter; + if (mask > 0xFFFFFFFFULL) { + nonce[1] = (counter >> 32); + } + return true; } - else { - next = m_nonces[index]; - } - - m_nonces[index] += reserveCount; - - return next; -} - - -void xmrig::Nonce::reset(uint8_t index) -{ - std::lock_guard lock(mutex); - - m_nonces[index] = 0; } diff --git a/src/crypto/common/Nonce.h b/src/crypto/common/Nonce.h index 4fa47b873..c4d7927f9 100644 --- a/src/crypto/common/Nonce.h +++ b/src/crypto/common/Nonce.h @@ -27,6 +27,7 @@ #include +#include namespace xmrig { @@ -49,18 +50,18 @@ public: static inline bool isPaused() { return m_paused.load(std::memory_order_relaxed); } static inline uint64_t sequence(Backend backend) { return m_sequence[backend].load(std::memory_order_relaxed); } static inline void pause(bool paused) { m_paused = paused; } + static inline void reset(uint8_t index) { m_nonces[index] = 0; } static inline void stop(Backend backend) { m_sequence[backend] = 0; } static inline void touch(Backend backend) { m_sequence[backend]++; } - static uint32_t next(uint8_t index, uint32_t nonce, uint32_t reserveCount, bool nicehash, bool *ok = nullptr); - static void reset(uint8_t index); + static bool next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, bool nicehash, size_t nonceSize); static void stop(); static void touch(); private: static std::atomic m_paused; static std::atomic m_sequence[MAX]; - static uint32_t m_nonces[2]; + static std::atomic m_nonces[2]; }; From 060c1af4c48a98b4c292b559c6d21669eafd9a8f Mon Sep 17 00:00:00 2001 From: cohcho Date: Wed, 9 Sep 2020 19:34:43 +0000 Subject: [PATCH 04/13] fix nonce mask --- src/backend/common/WorkerJob.h | 12 ++++++++---- src/crypto/common/Nonce.cpp | 5 ++--- src/crypto/common/Nonce.h | 3 +-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/backend/common/WorkerJob.h b/src/backend/common/WorkerJob.h index 62b8adee8..51b849ac2 100644 --- a/src/backend/common/WorkerJob.h +++ b/src/backend/common/WorkerJob.h @@ -70,7 +70,7 @@ public: if ((m_rounds[index()] % rounds) == 0) { for (size_t i = 0; i < N; ++i) { - if (!Nonce::next(index(), nonce(i), rounds * roundSize, currentJob().isNicehash(), nonceSize())) { + if (!Nonce::next(index(), nonce(i), rounds * roundSize, nonceMask())) { return false; } } @@ -88,6 +88,7 @@ public: private: inline int32_t nonceOffset() const { return currentJob().nonceOffset(); } inline size_t nonceSize() const { return currentJob().nonceSize(); } + inline uint64_t nonceMask() const { return m_nonce_mask[index()]; } inline void save(const Job &job, uint32_t reserveCount, Nonce::Backend backend) { @@ -95,12 +96,13 @@ private: const size_t size = job.size(); m_jobs[index()] = job; m_rounds[index()] = 0; + m_nonce_mask[index()] = job.isNicehash() ? 0xFFFFFFULL : (nonceSize() == sizeof(uint64_t) ? (-1ull >> (job.extraNonce().size() * 4 + 1)): 0xFFFFFFFFULL); m_jobs[index()].setBackend(backend); for (size_t i = 0; i < N; ++i) { memcpy(m_blobs[index()] + (i * size), job.blob(), size); - Nonce::next(index(), nonce(i), reserveCount, job.isNicehash(), nonceSize()); + Nonce::next(index(), nonce(i), reserveCount, nonceMask()); } } @@ -108,6 +110,7 @@ private: alignas(16) uint8_t m_blobs[2][Job::kMaxBlobSize * N]{}; Job m_jobs[2]; uint32_t m_rounds[2] = { 0, 0 }; + uint64_t m_nonce_mask[2]; uint64_t m_sequence = 0; uint8_t m_index = 0; }; @@ -128,7 +131,7 @@ inline bool xmrig::WorkerJob<1>::nextRound(uint32_t rounds, uint32_t roundSize) uint32_t* n = nonce(); if ((m_rounds[index()] % rounds) == 0) { - if (!Nonce::next(index(), n, rounds * roundSize, currentJob().isNicehash(), nonceSize())) { + if (!Nonce::next(index(), n, rounds * roundSize, nonceMask())) { return false; } if (nonceSize() == sizeof(uint64_t)) { @@ -149,11 +152,12 @@ inline void xmrig::WorkerJob<1>::save(const Job &job, uint32_t reserveCount, Non m_index = job.index(); m_jobs[index()] = job; m_rounds[index()] = 0; + m_nonce_mask[index()] = job.isNicehash() ? 0xFFFFFFULL : (nonceSize() == sizeof(uint64_t) ? (-1ull >> (job.extraNonce().size() * 4 + 1)): 0xFFFFFFFFULL); m_jobs[index()].setBackend(backend); memcpy(blob(), job.blob(), job.size()); - Nonce::next(index(), nonce(), reserveCount, currentJob().isNicehash(), nonceSize()); + Nonce::next(index(), nonce(), reserveCount, nonceMask()); } diff --git a/src/crypto/common/Nonce.cpp b/src/crypto/common/Nonce.cpp index 133cef97d..2c5a47a9a 100644 --- a/src/crypto/common/Nonce.cpp +++ b/src/crypto/common/Nonce.cpp @@ -50,9 +50,8 @@ xmrig::Nonce::Nonce() } -bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, bool nicehash, size_t nonceSize) +bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, uint64_t mask) { - const uint64_t mask = nicehash ? 0xFFFFFFULL : (nonceSize == sizeof(uint64_t) ? 0x7FFFFFFFFFFFFFFFULL : 0xFFFFFFFFULL); if (reserveCount == 0 || mask < reserveCount - 1) { return false; } @@ -73,7 +72,7 @@ bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, b } *nonce = (nonce[0] & ~mask) | counter; if (mask > 0xFFFFFFFFULL) { - nonce[1] = (counter >> 32); + nonce[1] = (nonce[1] & (~mask >> 32)) | (counter >> 32); } return true; } diff --git a/src/crypto/common/Nonce.h b/src/crypto/common/Nonce.h index c4d7927f9..05c842987 100644 --- a/src/crypto/common/Nonce.h +++ b/src/crypto/common/Nonce.h @@ -27,7 +27,6 @@ #include -#include namespace xmrig { @@ -54,7 +53,7 @@ public: static inline void stop(Backend backend) { m_sequence[backend] = 0; } static inline void touch(Backend backend) { m_sequence[backend]++; } - static bool next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, bool nicehash, size_t nonceSize); + static bool next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, uint64_t mask); static void stop(); static void touch(); From 4a9db89527dfac4e0c66892190fd3d642408d459 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Thu, 10 Sep 2020 14:28:40 +0200 Subject: [PATCH 05/13] RandomX: added SSE4.1-optimized Blake2b +0.15% on `rx/0` +0.3% on `rx/wow` --- cmake/flags.cmake | 8 +- src/backend/cpu/interfaces/ICpuInfo.h | 1 + src/backend/cpu/platform/BasicCpuInfo.cpp | 4 +- .../randomx/blake2/blake2b-load-sse41.h | 402 ++++++++++++++++++ src/crypto/randomx/blake2/blake2b-round.h | 119 ++++++ src/crypto/randomx/blake2/blake2b.c | 89 +++- src/crypto/rx/RxVm.cpp | 9 + 7 files changed, 623 insertions(+), 9 deletions(-) create mode 100644 src/crypto/randomx/blake2/blake2b-load-sse41.h create mode 100644 src/crypto/randomx/blake2/blake2b-round.h diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 5edad3392..4ff316e5d 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1") add_definitions(/DHAVE_ROTR) endif() @@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1") check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR) if (HAVE_ROTR) diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index dd740a180..20fb62958 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h +++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -63,6 +63,7 @@ public: FLAG_PDPE1GB, FLAG_SSE2, FLAG_SSSE3, + FLAG_SSE41, FLAG_XOP, FLAG_POPCNT, FLAG_CAT_L3, diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index ca022b2ea..cc03646b5 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -57,7 +57,7 @@ namespace xmrig { -static const std::array flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "xop", "popcnt", "cat_l3" }; +static const std::array flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3" }; static const std::array msrNames = { "none", "ryzen", "intel", "custom" }; @@ -141,6 +141,7 @@ static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); } static inline bool has_sse2() { return has_feature(PROCESSOR_INFO, EDX_Reg, 1 << 26); } static inline bool has_ssse3() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 9); } +static inline bool has_sse41() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 19); } static inline bool has_xop() { return has_feature(0x80000001, ECX_Reg, 1 << 11); } static inline bool has_popcnt() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 23); } static inline bool has_cat_l3() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 15) && has_feature(0x10, EBX_Reg, 1 << 1); } @@ -177,6 +178,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : m_flags.set(FLAG_PDPE1GB, has_pdpe1gb()); m_flags.set(FLAG_SSE2, has_sse2()); m_flags.set(FLAG_SSSE3, has_ssse3()); + m_flags.set(FLAG_SSE41, has_sse41()); m_flags.set(FLAG_XOP, has_xop()); m_flags.set(FLAG_POPCNT, has_popcnt()); m_flags.set(FLAG_CAT_L3, has_cat_l3()); diff --git a/src/crypto/randomx/blake2/blake2b-load-sse41.h b/src/crypto/randomx/blake2/blake2b-load-sse41.h new file mode 100644 index 000000000..0eca86599 --- /dev/null +++ b/src/crypto/randomx/blake2/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2B_LOAD_SSE41_H +#define BLAKE2B_LOAD_SSE41_H + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif diff --git a/src/crypto/randomx/blake2/blake2b-round.h b/src/crypto/randomx/blake2/blake2b-round.h new file mode 100644 index 000000000..1edc2cc4c --- /dev/null +++ b/src/crypto/randomx/blake2/blake2b-round.h @@ -0,0 +1,119 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2B_ROUND_H +#define BLAKE2B_ROUND_H + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#include "blake2b-load-sse41.h" + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + +#endif diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 29a402d8a..6a0889cbb 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -39,6 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/blake2/blake2.h" #include "crypto/randomx/blake2/blake2-impl.h" +#if defined(_M_X64) || defined(__x86_64__) + +#ifdef _MSC_VER +#include +#endif + +#include +#include "blake2b-round.h" + +#endif + static const uint64_t blake2b_IV[8] = { UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), @@ -179,7 +190,63 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t return 0; } -static void rx_blake2b_compress(blake2b_state *S, const uint8_t *block) { +#if defined(_M_X64) || defined(__x86_64__) +static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; + + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); + + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); +} +#undef ROUND +#endif + +static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) { uint64_t m[16]; uint64_t v[16]; unsigned int i, r; @@ -237,6 +304,20 @@ static void rx_blake2b_compress(blake2b_state *S, const uint8_t *block) { #undef ROUND } +#if defined(_M_X64) || defined(__x86_64__) + +uint32_t rx_blake2b_use_sse41 = 0; + +#define rx_blake2b_compress(S, block) \ + if (rx_blake2b_use_sse41) \ + rx_blake2b_compress_sse41(S, block); \ + else \ + rx_blake2b_compress_integer(S, block); + +#else +#define rx_blake2b_compress(S, block) rx_blake2b_compress_integer(S, block); +#endif + int rx_blake2b_update(blake2b_state *S, const void *in, size_t inlen) { const uint8_t *pin = (const uint8_t *)in; @@ -260,14 +341,14 @@ int rx_blake2b_update(blake2b_state *S, const void *in, size_t inlen) { size_t fill = BLAKE2B_BLOCKBYTES - left; memcpy(&S->buf[left], pin, fill); blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); - rx_blake2b_compress(S, S->buf); + rx_blake2b_compress(S, S->buf); S->buflen = 0; inlen -= fill; pin += fill; /* Avoid buffer copies when possible */ while (inlen > BLAKE2B_BLOCKBYTES) { blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); - rx_blake2b_compress(S, pin); + rx_blake2b_compress(S, pin); inlen -= BLAKE2B_BLOCKBYTES; pin += BLAKE2B_BLOCKBYTES; } @@ -294,7 +375,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) { blake2b_increment_counter(S, S->buflen); blake2b_set_lastblock(S); memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ - rx_blake2b_compress(S, S->buf); + rx_blake2b_compress(S, S->buf); for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ store64(buffer + sizeof(S->h[i]) * i, S->h[i]); diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp index c8a5ca8dc..8879eef33 100644 --- a/src/crypto/rx/RxVm.cpp +++ b/src/crypto/rx/RxVm.cpp @@ -31,6 +31,11 @@ #include "crypto/rx/RxVm.h" +#if defined(_M_X64) || defined(__x86_64__) +extern "C" uint32_t rx_blake2b_use_sse41; +#endif + + randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly, uint32_t node) { int flags = 0; @@ -55,6 +60,10 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so flags |= RANDOMX_FLAG_AMD; } +# if defined(_M_X64) || defined(__x86_64__) + rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0; +# endif + return randomx_create_vm(static_cast(flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad, node); } From a05393727c980c333f60fc7a532533a1bdf5867c Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sat, 12 Sep 2020 23:07:52 +0200 Subject: [PATCH 06/13] RandomX: added performance profiler (for developers) Also optimized Blake2b SSE4.1 code size to avoid code cache pollution. --- CMakeLists.txt | 1 + src/base/base.cmake | 12 + src/base/io/log/Tags.cpp | 10 + src/base/io/log/Tags.h | 4 + src/base/tools/Profiler.cpp | 100 +++++ src/base/tools/Profiler.h | 132 ++++++ src/core/Miner.cpp | 43 ++ src/crypto/randomx/aes_hash.cpp | 3 + src/crypto/randomx/blake2/blake2.h | 2 +- .../randomx/blake2/blake2b-load-sse41.h | 402 ------------------ src/crypto/randomx/blake2/blake2b-round.h | 14 +- src/crypto/randomx/blake2/blake2b.c | 98 ++--- src/crypto/randomx/blake2_generator.cpp | 2 +- src/crypto/randomx/jit_compiler_x86.cpp | 3 + src/crypto/randomx/randomx.cpp | 18 +- src/crypto/randomx/virtual_machine.cpp | 10 +- src/crypto/randomx/virtual_machine.hpp | 8 +- src/crypto/randomx/vm_compiled.cpp | 5 + src/crypto/rx/RxConfig.cpp | 4 +- 19 files changed, 390 insertions(+), 481 deletions(-) create mode 100644 src/base/tools/Profiler.cpp create mode 100644 src/base/tools/Profiler.h delete mode 100644 src/crypto/randomx/blake2/blake2b-load-sse41.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dff77ee58..5bacc9697 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(WITH_NVML "Enable NVML (NVIDIA Management Library) support (on option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (only if OpenCL backend enabled)" ON) option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON) option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF) +option(WITH_PROFILING "Enable profiling for developers" OFF) option(BUILD_STATIC "Build static binary" OFF) option(ARM_TARGET "Force use specific ARM target 8 or 7" 0) diff --git a/src/base/base.cmake b/src/base/base.cmake index 0c82201f2..da53d5ea8 100644 --- a/src/base/base.cmake +++ b/src/base/base.cmake @@ -222,3 +222,15 @@ if (WITH_KAWPOW) src/base/net/stratum/EthStratumClient.cpp ) endif() + +if (WITH_PROFILING) + add_definitions(/DXMRIG_FEATURE_PROFILING) + + list(APPEND HEADERS_BASE + src/base/tools/Profiler.h + ) + + list(APPEND SOURCES_BASE + src/base/tools/Profiler.cpp + ) +endif() diff --git a/src/base/io/log/Tags.cpp b/src/base/io/log/Tags.cpp index 0b4f7a0a9..af36b0baa 100644 --- a/src/base/io/log/Tags.cpp +++ b/src/base/io/log/Tags.cpp @@ -101,3 +101,13 @@ const char *xmrig::Tags::opencl() return tag; } #endif + + +#ifdef XMRIG_FEATURE_PROFILING +const char* xmrig::Tags::profiler() +{ + static const char* tag = CYAN_BG_BOLD(WHITE_BOLD_S " profile "); + + return tag; +} +#endif diff --git a/src/base/io/log/Tags.h b/src/base/io/log/Tags.h index e6d470be2..072d7d414 100644 --- a/src/base/io/log/Tags.h +++ b/src/base/io/log/Tags.h @@ -53,6 +53,10 @@ public: # ifdef XMRIG_FEATURE_OPENCL static const char *opencl(); # endif + +# ifdef XMRIG_FEATURE_PROFILING + static const char* profiler(); +# endif }; diff --git a/src/base/tools/Profiler.cpp b/src/base/tools/Profiler.cpp new file mode 100644 index 000000000..f6f066f37 --- /dev/null +++ b/src/base/tools/Profiler.cpp @@ -0,0 +1,100 @@ +/* XMRig + * Copyright 2018-2020 SChernykh + * Copyright 2016-2020 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#include "base/tools/Profiler.h" +#include "base/io/log/Log.h" +#include "base/io/log/Tags.h" +#include +#include +#include +#include + + +#ifdef XMRIG_FEATURE_PROFILING + + +ProfileScopeData* ProfileScopeData::s_data[MAX_DATA_COUNT] = {}; +volatile long ProfileScopeData::s_dataCount = 0; +double ProfileScopeData::s_tscSpeed = 0.0; + + +#ifndef NOINLINE +#ifdef __GNUC__ +#define NOINLINE __attribute__ ((noinline)) +#elif _MSC_VER +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif +#endif + + +static std::string get_thread_id() +{ + std::stringstream ss; + ss << std::this_thread::get_id(); + + std::string s = ss.str(); + if (s.length() > ProfileScopeData::MAX_THREAD_ID_LENGTH) { + s.resize(ProfileScopeData::MAX_THREAD_ID_LENGTH); + } + + return s; +} + + +NOINLINE void ProfileScopeData::Register(ProfileScopeData* data) +{ +#ifdef _MSC_VER + const long id = _InterlockedIncrement(&s_dataCount) - 1; +#else + const long id = __sync_fetch_and_add(&s_dataCount, 1); +#endif + + if (static_cast(id) < MAX_DATA_COUNT) { + s_data[id] = data; + + const std::string s = get_thread_id(); + memcpy(data->m_threadId, s.c_str(), s.length() + 1); + } +} + + +NOINLINE void ProfileScopeData::Init() +{ + using namespace std::chrono; + + const uint64_t t1 = static_cast(time_point_cast(high_resolution_clock::now()).time_since_epoch().count()); + const uint64_t count1 = ReadTSC(); + + for (;;) + { + const uint64_t t2 = static_cast(time_point_cast(high_resolution_clock::now()).time_since_epoch().count()); + const uint64_t count2 = ReadTSC(); + + if (t2 - t1 > 1000000000) { + s_tscSpeed = (count2 - count1) * 1e9 / (t2 - t1); + LOG_INFO("%s TSC speed = %.3f GHz", xmrig::Tags::profiler(), s_tscSpeed / 1e9); + return; + } + } +} + + +#endif /* XMRIG_FEATURE_PROFILING */ diff --git a/src/base/tools/Profiler.h b/src/base/tools/Profiler.h new file mode 100644 index 000000000..c74277151 --- /dev/null +++ b/src/base/tools/Profiler.h @@ -0,0 +1,132 @@ +/* XMRig + * Copyright 2018-2020 SChernykh + * Copyright 2016-2020 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef XMRIG_PROFILER_H +#define XMRIG_PROFILER_H + + +#ifndef FORCE_INLINE +#if defined(_MSC_VER) +#define FORCE_INLINE __forceinline +#elif defined(__GNUC__) +#define FORCE_INLINE __attribute__((always_inline)) inline +#elif defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif +#endif + + +#ifdef XMRIG_FEATURE_PROFILING + + +#include +#include + +#if defined(_MSC_VER) +#include +#endif + + +static FORCE_INLINE uint64_t ReadTSC() +{ +#ifdef _MSC_VER + return __rdtsc(); +#else + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return (((uint64_t)hi) << 32) | lo; +#endif +} + + +struct ProfileScopeData +{ + const char* m_name; + uint64_t m_totalCycles; + uint32_t m_totalSamples; + + enum + { + MAX_THREAD_ID_LENGTH = 11, + MAX_SAMPLE_COUNT = 128, + MAX_DATA_COUNT = 1024 + }; + + char m_threadId[MAX_THREAD_ID_LENGTH + 1]; + + static ProfileScopeData* s_data[MAX_DATA_COUNT]; + static volatile long s_dataCount; + static double s_tscSpeed; + + static void Register(ProfileScopeData* data); + static void Init(); +}; + +static_assert(std::is_trivial::value, "ProfileScopeData must be a trivial struct"); +static_assert(sizeof(ProfileScopeData) <= 32, "ProfileScopeData struct is too big"); + + +class ProfileScope +{ +public: + FORCE_INLINE ProfileScope(ProfileScopeData& data) + : m_data(data) + { + if (m_data.m_totalCycles == 0) { + ProfileScopeData::Register(&data); + } + + m_startCounter = ReadTSC(); + } + + FORCE_INLINE ~ProfileScope() + { + m_data.m_totalCycles += ReadTSC() - m_startCounter; + ++m_data.m_totalSamples; + } + +private: + ProfileScopeData& m_data; + uint64_t m_startCounter; +}; + + +#define PROFILE_SCOPE(x) static thread_local ProfileScopeData x##_data{#x}; ProfileScope x(x##_data); + + +#else /* XMRIG_FEATURE_PROFILING */ +#define PROFILE_SCOPE(x) +#endif /* XMRIG_FEATURE_PROFILING */ + + +#include "crypto/randomx/blake2/blake2.h" + + +struct rx_blake2b_wrapper +{ + FORCE_INLINE static void run(void* out, size_t outlen, const void* in, size_t inlen) + { + PROFILE_SCOPE(RandomX_Blake2b); + rx_blake2b(out, outlen, in, inlen); + } +}; + + +#endif /* XMRIG_PROFILER_H */ diff --git a/src/core/Miner.cpp b/src/core/Miner.cpp index 1a8c29f99..12be05ecc 100644 --- a/src/core/Miner.cpp +++ b/src/core/Miner.cpp @@ -38,6 +38,7 @@ #include "base/kernel/Platform.h" #include "base/net/stratum/Job.h" #include "base/tools/Object.h" +#include "base/tools/Profiler.h" #include "base/tools/Timer.h" #include "core/config/Config.h" #include "core/Controller.h" @@ -267,6 +268,44 @@ public: h = "MH/s"; } +# ifdef XMRIG_FEATURE_PROFILING + ProfileScopeData* data[ProfileScopeData::MAX_DATA_COUNT]; + + const uint32_t n = std::min(ProfileScopeData::s_dataCount, ProfileScopeData::MAX_DATA_COUNT); + memcpy(data, ProfileScopeData::s_data, n * sizeof(ProfileScopeData*)); + + std::sort(data, data + n, [](ProfileScopeData* a, ProfileScopeData* b) { + return strcmp(a->m_threadId, b->m_threadId) < 0; + }); + + for (uint32_t i = 0; i < n;) + { + uint32_t n1 = i; + while ((n1 < n) && (strcmp(data[i]->m_threadId, data[n1]->m_threadId) == 0)) { + ++n1; + } + + std::sort(data + i, data + n1, [](ProfileScopeData* a, ProfileScopeData* b) { + return a->m_totalCycles > b->m_totalCycles; + }); + + for (uint32_t j = i; j < n1; ++j) { + ProfileScopeData* p = data[j]; + LOG_INFO("%s Thread %6s | %-30s | %7.3f%% | %9.0f ns", + Tags::profiler(), + p->m_threadId, + p->m_name, + p->m_totalCycles * 100.0 / data[i]->m_totalCycles, + p->m_totalCycles / p->m_totalSamples * 1e9 / ProfileScopeData::s_tscSpeed + ); + } + + LOG_INFO("%s --------------|--------------------------------|----------|-------------", Tags::profiler()); + + i = n1; + } +# endif + LOG_INFO("%s " WHITE_BOLD("speed") " 10s/60s/15m " CYAN_BOLD("%s") CYAN(" %s %s ") CYAN_BOLD("%s") " max " CYAN_BOLD("%s %s"), Tags::miner(), Hashrate::format(speed[0] * scale, num, sizeof(num) / 4), @@ -311,6 +350,10 @@ xmrig::Miner::Miner(Controller *controller) Platform::setThreadPriority(std::min(priority + 1, 5)); } +# ifdef XMRIG_FEATURE_PROFILING + ProfileScopeData::Init(); +# endif + # ifdef XMRIG_ALGO_RANDOMX Rx::init(this); # endif diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 1898a2c55..571b4ca73 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/soft_aes.h" #include "crypto/randomx/randomx.h" +#include "base/tools/Profiler.h" #define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d #define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e @@ -215,6 +216,8 @@ template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + PROFILE_SCOPE(RandomX_AES); + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; diff --git a/src/crypto/randomx/blake2/blake2.h b/src/crypto/randomx/blake2/blake2.h index 4d364c36c..52f05b396 100644 --- a/src/crypto/randomx/blake2/blake2.h +++ b/src/crypto/randomx/blake2/blake2.h @@ -92,7 +92,7 @@ extern "C" { int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen); /* Simple API */ - int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen); + int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen); /* Argon2 Team - Begin Code */ int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); diff --git a/src/crypto/randomx/blake2/blake2b-load-sse41.h b/src/crypto/randomx/blake2/blake2b-load-sse41.h deleted file mode 100644 index 0eca86599..000000000 --- a/src/crypto/randomx/blake2/blake2b-load-sse41.h +++ /dev/null @@ -1,402 +0,0 @@ -/* - BLAKE2 reference source code package - optimized C implementations - - Copyright 2012, Samuel Neves . You may use this under the - terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at - your option. The terms of these licenses can be found at: - - - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - - OpenSSL license : https://www.openssl.org/source/license.html - - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - - More information about the BLAKE2 hash function can be found at - https://blake2.net. -*/ -#ifndef BLAKE2B_LOAD_SSE41_H -#define BLAKE2B_LOAD_SSE41_H - -#define LOAD_MSG_0_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m1); \ -b1 = _mm_unpacklo_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_0_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m0, m1); \ -b1 = _mm_unpackhi_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_0_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m4, m5); \ -b1 = _mm_unpacklo_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_0_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m5); \ -b1 = _mm_unpackhi_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_1_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m7, m2); \ -b1 = _mm_unpackhi_epi64(m4, m6); \ -} while(0) - - -#define LOAD_MSG_1_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m5, m4); \ -b1 = _mm_alignr_epi8(m3, m7, 8); \ -} while(0) - - -#define LOAD_MSG_1_3(b0, b1) \ -do \ -{ \ -b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -b1 = _mm_unpackhi_epi64(m5, m2); \ -} while(0) - - -#define LOAD_MSG_1_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m6, m1); \ -b1 = _mm_unpackhi_epi64(m3, m1); \ -} while(0) - - -#define LOAD_MSG_2_1(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m6, m5, 8); \ -b1 = _mm_unpackhi_epi64(m2, m7); \ -} while(0) - - -#define LOAD_MSG_2_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m4, m0); \ -b1 = _mm_blend_epi16(m1, m6, 0xF0); \ -} while(0) - - -#define LOAD_MSG_2_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m5, m1, 0xF0); \ -b1 = _mm_unpackhi_epi64(m3, m4); \ -} while(0) - - -#define LOAD_MSG_2_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m7, m3); \ -b1 = _mm_alignr_epi8(m2, m0, 8); \ -} while(0) - - -#define LOAD_MSG_3_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m3, m1); \ -b1 = _mm_unpackhi_epi64(m6, m5); \ -} while(0) - - -#define LOAD_MSG_3_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m0); \ -b1 = _mm_unpacklo_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_3_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m1, m2, 0xF0); \ -b1 = _mm_blend_epi16(m2, m7, 0xF0); \ -} while(0) - - -#define LOAD_MSG_3_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m3, m5); \ -b1 = _mm_unpacklo_epi64(m0, m4); \ -} while(0) - - -#define LOAD_MSG_4_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m2); \ -b1 = _mm_unpacklo_epi64(m1, m5); \ -} while(0) - - -#define LOAD_MSG_4_2(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m0, m3, 0xF0); \ -b1 = _mm_blend_epi16(m2, m7, 0xF0); \ -} while(0) - - -#define LOAD_MSG_4_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m7, m5, 0xF0); \ -b1 = _mm_blend_epi16(m3, m1, 0xF0); \ -} while(0) - - -#define LOAD_MSG_4_4(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m6, m0, 8); \ -b1 = _mm_blend_epi16(m4, m6, 0xF0); \ -} while(0) - - -#define LOAD_MSG_5_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m1, m3); \ -b1 = _mm_unpacklo_epi64(m0, m4); \ -} while(0) - - -#define LOAD_MSG_5_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m6, m5); \ -b1 = _mm_unpackhi_epi64(m5, m1); \ -} while(0) - - -#define LOAD_MSG_5_3(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m2, m3, 0xF0); \ -b1 = _mm_unpackhi_epi64(m7, m0); \ -} while(0) - - -#define LOAD_MSG_5_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m6, m2); \ -b1 = _mm_blend_epi16(m7, m4, 0xF0); \ -} while(0) - - -#define LOAD_MSG_6_1(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m6, m0, 0xF0); \ -b1 = _mm_unpacklo_epi64(m7, m2); \ -} while(0) - - -#define LOAD_MSG_6_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m2, m7); \ -b1 = _mm_alignr_epi8(m5, m6, 8); \ -} while(0) - - -#define LOAD_MSG_6_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m3); \ -b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ -} while(0) - - -#define LOAD_MSG_6_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m3, m1); \ -b1 = _mm_blend_epi16(m1, m5, 0xF0); \ -} while(0) - - -#define LOAD_MSG_7_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m6, m3); \ -b1 = _mm_blend_epi16(m6, m1, 0xF0); \ -} while(0) - - -#define LOAD_MSG_7_2(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m7, m5, 8); \ -b1 = _mm_unpackhi_epi64(m0, m4); \ -} while(0) - - -#define LOAD_MSG_7_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m2, m7); \ -b1 = _mm_unpacklo_epi64(m4, m1); \ -} while(0) - - -#define LOAD_MSG_7_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m2); \ -b1 = _mm_unpacklo_epi64(m3, m5); \ -} while(0) - - -#define LOAD_MSG_8_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m3, m7); \ -b1 = _mm_alignr_epi8(m0, m5, 8); \ -} while(0) - - -#define LOAD_MSG_8_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m7, m4); \ -b1 = _mm_alignr_epi8(m4, m1, 8); \ -} while(0) - - -#define LOAD_MSG_8_3(b0, b1) \ -do \ -{ \ -b0 = m6; \ -b1 = _mm_alignr_epi8(m5, m0, 8); \ -} while(0) - - -#define LOAD_MSG_8_4(b0, b1) \ -do \ -{ \ -b0 = _mm_blend_epi16(m1, m3, 0xF0); \ -b1 = m2; \ -} while(0) - - -#define LOAD_MSG_9_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m5, m4); \ -b1 = _mm_unpackhi_epi64(m3, m0); \ -} while(0) - - -#define LOAD_MSG_9_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m1, m2); \ -b1 = _mm_blend_epi16(m3, m2, 0xF0); \ -} while(0) - - -#define LOAD_MSG_9_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m7, m4); \ -b1 = _mm_unpackhi_epi64(m1, m6); \ -} while(0) - - -#define LOAD_MSG_9_4(b0, b1) \ -do \ -{ \ -b0 = _mm_alignr_epi8(m7, m5, 8); \ -b1 = _mm_unpacklo_epi64(m6, m0); \ -} while(0) - - -#define LOAD_MSG_10_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m0, m1); \ -b1 = _mm_unpacklo_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_10_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m0, m1); \ -b1 = _mm_unpackhi_epi64(m2, m3); \ -} while(0) - - -#define LOAD_MSG_10_3(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m4, m5); \ -b1 = _mm_unpacklo_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_10_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpackhi_epi64(m4, m5); \ -b1 = _mm_unpackhi_epi64(m6, m7); \ -} while(0) - - -#define LOAD_MSG_11_1(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m7, m2); \ -b1 = _mm_unpackhi_epi64(m4, m6); \ -} while(0) - - -#define LOAD_MSG_11_2(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m5, m4); \ -b1 = _mm_alignr_epi8(m3, m7, 8); \ -} while(0) - - -#define LOAD_MSG_11_3(b0, b1) \ -do \ -{ \ -b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -b1 = _mm_unpackhi_epi64(m5, m2); \ -} while(0) - - -#define LOAD_MSG_11_4(b0, b1) \ -do \ -{ \ -b0 = _mm_unpacklo_epi64(m6, m1); \ -b1 = _mm_unpackhi_epi64(m3, m1); \ -} while(0) - - -#endif diff --git a/src/crypto/randomx/blake2/blake2b-round.h b/src/crypto/randomx/blake2/blake2b-round.h index 1edc2cc4c..bf4f1ffed 100644 --- a/src/crypto/randomx/blake2/blake2b-round.h +++ b/src/crypto/randomx/blake2/blake2b-round.h @@ -102,17 +102,21 @@ row4l = t1; \ row4h = t0; -#include "blake2b-load-sse41.h" +#define LOAD_MSG(r, i, b0, b1) \ +do { \ + b0 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 1]], m[blake2b_sigma_sse41[r][i * 4 + 0]]); \ + b1 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 3]], m[blake2b_sigma_sse41[r][i * 4 + 2]]); \ +} while(0) #define ROUND(r) \ - LOAD_MSG_ ##r ##_1(b0, b1); \ + LOAD_MSG(r, 0, b0, b1); \ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_2(b0, b1); \ + LOAD_MSG(r, 1, b0, b1); \ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - LOAD_MSG_ ##r ##_3(b0, b1); \ + LOAD_MSG(r, 2, b0, b1); \ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_4(b0, b1); \ + LOAD_MSG(r, 3, b0, b1); \ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 6a0889cbb..7a1b9daeb 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -56,6 +56,23 @@ static const uint64_t blake2b_IV[8] = { UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; +#if defined(_M_X64) || defined(__x86_64__) +static const uint8_t blake2b_sigma_sse41[12][16] = { + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, + {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, + {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, + {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, + {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, + {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, + {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, + {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, + {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}, + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, +}; +#endif + static const uint8_t blake2b_sigma[12][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -203,15 +220,6 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - const __m128i m0 = LOADU(block + 00); - const __m128i m1 = LOADU(block + 16); - const __m128i m2 = LOADU(block + 32); - const __m128i m3 = LOADU(block + 48); - const __m128i m4 = LOADU(block + 64); - const __m128i m5 = LOADU(block + 80); - const __m128i m6 = LOADU(block + 96); - const __m128i m7 = LOADU(block + 112); - row1l = LOADU(&S->h[0]); row1h = LOADU(&S->h[2]); row2l = LOADU(&S->h[4]); @@ -221,18 +229,11 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); + const uint64_t* m = (const uint64_t*)(block); + + for (uint32_t r = 0; r < 12; ++r) { + ROUND(r); + } row1l = _mm_xor_si128(row3l, row1l); row1h = _mm_xor_si128(row3h, row1h); @@ -388,8 +389,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) { return 0; } -int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, - const void *key, size_t keylen) { +int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen) { blake2b_state S; int ret = -1; @@ -402,25 +402,14 @@ int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, goto fail; } - if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) { + if (rx_blake2b_init(&S, outlen) < 0) { goto fail; } - if (keylen > 0) { - if (rx_blake2b_init_key(&S, outlen, key, keylen) < 0) { - goto fail; - } - } - else { - if (rx_blake2b_init(&S, outlen) < 0) { - goto fail; - } - } - - if (rx_blake2b_update(&S, in, inlen) < 0) { + if (rx_blake2b_update(&S, in, inlen) < 0) { goto fail; } - ret = rx_blake2b_final(&S, out, outlen); + ret = rx_blake2b_final(&S, out, outlen); fail: //clear_internal_memory(&S, sizeof(S)); @@ -442,43 +431,42 @@ int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { store32(outlen_bytes, (uint32_t)outlen); #define TRY(statement) \ - do { \ - ret = statement; \ - if (ret < 0) { \ - goto fail; \ - } \ - } while ((void)0, 0) + do { \ + ret = statement; \ + if (ret < 0) { \ + goto fail; \ + } \ + } while ((void)0, 0) if (outlen <= BLAKE2B_OUTBYTES) { - TRY(rx_blake2b_init(&blake_state, outlen)); - TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); - TRY(rx_blake2b_update(&blake_state, in, inlen)); - TRY(rx_blake2b_final(&blake_state, out, outlen)); + TRY(rx_blake2b_init(&blake_state, outlen)); + TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(rx_blake2b_update(&blake_state, in, inlen)); + TRY(rx_blake2b_final(&blake_state, out, outlen)); } else { uint32_t toproduce; uint8_t out_buffer[BLAKE2B_OUTBYTES]; uint8_t in_buffer[BLAKE2B_OUTBYTES]; - TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); - TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); - TRY(rx_blake2b_update(&blake_state, in, inlen)); - TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); + TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); + TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(rx_blake2b_update(&blake_state, in, inlen)); + TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); out += BLAKE2B_OUTBYTES / 2; toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; while (toproduce > BLAKE2B_OUTBYTES) { memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); - TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, - BLAKE2B_OUTBYTES, NULL, 0)); + TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, + BLAKE2B_OUTBYTES)); memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); out += BLAKE2B_OUTBYTES / 2; toproduce -= BLAKE2B_OUTBYTES / 2; } memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); - TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL, - 0)); + TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES)); memcpy(out, out_buffer, toproduce); } fail: diff --git a/src/crypto/randomx/blake2_generator.cpp b/src/crypto/randomx/blake2_generator.cpp index edfe2e34c..ef3894d88 100644 --- a/src/crypto/randomx/blake2_generator.cpp +++ b/src/crypto/randomx/blake2_generator.cpp @@ -55,7 +55,7 @@ namespace randomx { void Blake2Generator::checkData(const size_t bytesNeeded) { if (dataIndex + bytesNeeded > sizeof(data)) { - rx_blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + rx_blake2b(data, sizeof(data), data, sizeof(data)); dataIndex = 0; } } diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 7aae54fd4..09746b901 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/program.hpp" #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/virtual_memory.hpp" +#include "base/tools/Profiler.h" #ifdef XMRIG_FIX_RYZEN # include "crypto/rx/Rx.h" @@ -255,6 +256,8 @@ namespace randomx { } void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) { + PROFILE_SCOPE(RandomX_JIT_compile); + vm_flags = flags; generateProgramPrologue(prog, pcfg); diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 8a6053638..89d319de2 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "base/tools/Profiler.h" + RandomX_ConfigurationWownero::RandomX_ConfigurationWownero() { ArgonSalt = "RandomWOW\x01"; @@ -574,33 +576,35 @@ extern "C" { assert(inputSize == 0 || input != nullptr); assert(output != nullptr); alignas(16) uint64_t tempHash[8]; - rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize); machine->initScratchpad(&tempHash); machine->resetRoundingMode(); for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { machine->run(&tempHash); - rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile)); } machine->run(&tempHash); - machine->getFinalResult(output, RANDOMX_HASH_SIZE); + machine->getFinalResult(output); } void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) { - rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize); machine->initScratchpad(tempHash); } void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) { + PROFILE_SCOPE(RandomX_hash); + machine->resetRoundingMode(); for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { machine->run(&tempHash); - rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile)); } machine->run(&tempHash); // Finish current hash and fill the scratchpad for the next hash at the same time - rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0); - machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash); + rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), nextInput, nextInputSize); + machine->hashAndFill(output, tempHash); } } diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp index ecd187e2f..f00213a30 100644 --- a/src/crypto/randomx/virtual_machine.cpp +++ b/src/crypto/randomx/virtual_machine.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/blake2/blake2.h" #include "crypto/randomx/intrin_portable.h" #include "crypto/randomx/allocator.hpp" +#include "base/tools/Profiler.h" randomx_vm::~randomx_vm() { @@ -109,15 +110,15 @@ namespace randomx { } template - void VmBase::getFinalResult(void* out, size_t outSize) { + void VmBase::getFinalResult(void* out) { hashAes1Rx4(scratchpad, ScratchpadSize, ®.a); - rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); } template - void VmBase::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) { + void VmBase::hashAndFill(void* out, uint64_t (&fill_state)[8]) { hashAndFillAes1Rx4(scratchpad, ScratchpadSize, ®.a, fill_state); - rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); } template @@ -127,6 +128,7 @@ namespace randomx { template void VmBase::generateProgram(void* seed) { + PROFILE_SCOPE(RandomX_generate_program); fillAes4Rx4(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program); } diff --git a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp index 3fdd86df4..a60e693ae 100644 --- a/src/crypto/randomx/virtual_machine.hpp +++ b/src/crypto/randomx/virtual_machine.hpp @@ -38,8 +38,8 @@ class randomx_vm public: virtual ~randomx_vm() = 0; virtual void setScratchpad(uint8_t *scratchpad) = 0; - virtual void getFinalResult(void* out, size_t outSize) = 0; - virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0; + virtual void getFinalResult(void* out) = 0; + virtual void hashAndFill(void* out, uint64_t (&fill_state)[8]) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -86,8 +86,8 @@ namespace randomx { ~VmBase() override; void setScratchpad(uint8_t *scratchpad) override; void initScratchpad(void* seed) override; - void getFinalResult(void* out, size_t outSize) override; - void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override; + void getFinalResult(void* out) override; + void hashAndFill(void* out, uint64_t (&fill_state)[8]) override; protected: void generateProgram(void* seed); diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp index 501bb8c70..a61797e85 100644 --- a/src/crypto/randomx/vm_compiled.cpp +++ b/src/crypto/randomx/vm_compiled.cpp @@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/vm_compiled.hpp" #include "crypto/randomx/common.hpp" +#include "base/tools/Profiler.h" namespace randomx { @@ -41,6 +42,8 @@ namespace randomx { template void CompiledVm::run(void* seed) { + PROFILE_SCOPE(RandomX_run); + compiler.prepare(); VmBase::generateProgram(seed); randomx_vm::initialize(); @@ -51,6 +54,8 @@ namespace randomx { template void CompiledVm::execute() { + PROFILE_SCOPE(RandomX_JIT_execute); + #ifdef XMRIG_ARM memcpy(reg.f, config.eMask, sizeof(config.eMask)); #endif diff --git a/src/crypto/rx/RxConfig.cpp b/src/crypto/rx/RxConfig.cpp index 55dae35d6..d480d17b9 100644 --- a/src/crypto/rx/RxConfig.cpp +++ b/src/crypto/rx/RxConfig.cpp @@ -120,8 +120,8 @@ bool xmrig::RxConfig::read(const rapidjson::Value &value) } # endif - const int mode = Json::getInt(value, kScratchpadPrefetchMode, static_cast(m_scratchpadPrefetchMode)); - if ((mode >= ScratchpadPrefetchOff) && (mode < ScratchpadPrefetchMax)) { + const uint32_t mode = static_cast(Json::getInt(value, kScratchpadPrefetchMode, static_cast(m_scratchpadPrefetchMode))); + if (mode < ScratchpadPrefetchMax) { m_scratchpadPrefetchMode = static_cast(mode); } From 30be1cd102cbe6f3a929be81d0546238b006714c Mon Sep 17 00:00:00 2001 From: cohcho Date: Thu, 10 Sep 2020 12:10:35 +0000 Subject: [PATCH 07/13] reserve at most 1 bit for wrapping detection --- src/backend/common/WorkerJob.h | 4 ++-- src/base/net/stratum/Job.h | 1 + src/crypto/common/Nonce.cpp | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/backend/common/WorkerJob.h b/src/backend/common/WorkerJob.h index 51b849ac2..e835ff1de 100644 --- a/src/backend/common/WorkerJob.h +++ b/src/backend/common/WorkerJob.h @@ -96,7 +96,7 @@ private: const size_t size = job.size(); m_jobs[index()] = job; m_rounds[index()] = 0; - m_nonce_mask[index()] = job.isNicehash() ? 0xFFFFFFULL : (nonceSize() == sizeof(uint64_t) ? (-1ull >> (job.extraNonce().size() * 4 + 1)): 0xFFFFFFFFULL); + m_nonce_mask[index()] = job.nonceMask(); m_jobs[index()].setBackend(backend); @@ -152,7 +152,7 @@ inline void xmrig::WorkerJob<1>::save(const Job &job, uint32_t reserveCount, Non m_index = job.index(); m_jobs[index()] = job; m_rounds[index()] = 0; - m_nonce_mask[index()] = job.isNicehash() ? 0xFFFFFFULL : (nonceSize() == sizeof(uint64_t) ? (-1ull >> (job.extraNonce().size() * 4 + 1)): 0xFFFFFFFFULL); + m_nonce_mask[index()] = job.nonceMask(); m_jobs[index()].setBackend(backend); diff --git a/src/base/net/stratum/Job.h b/src/base/net/stratum/Job.h index 8341439ac..ba5a0aa2e 100644 --- a/src/base/net/stratum/Job.h +++ b/src/base/net/stratum/Job.h @@ -82,6 +82,7 @@ public: inline uint32_t backend() const { return m_backend; } inline uint64_t diff() const { return m_diff; } inline uint64_t height() const { return m_height; } + inline uint64_t nonceMask() const { return isNicehash() ? 0xFFFFFFULL : (nonceSize() == sizeof(uint64_t) ? (-1ull >> (extraNonce().size() * 4)): 0xFFFFFFFFULL); } inline uint64_t target() const { return m_target; } inline uint8_t *blob() { return m_blob; } inline uint8_t fixedByte() const { return *(m_blob + 42); } diff --git a/src/crypto/common/Nonce.cpp b/src/crypto/common/Nonce.cpp index 2c5a47a9a..81cc80e43 100644 --- a/src/crypto/common/Nonce.cpp +++ b/src/crypto/common/Nonce.cpp @@ -52,6 +52,7 @@ xmrig::Nonce::Nonce() bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, uint64_t mask) { + mask &= 0x7FFFFFFFFFFFFFFFULL; if (reserveCount == 0 || mask < reserveCount - 1) { return false; } @@ -60,7 +61,8 @@ bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, u while (true) { if (mask < counter) { return false; - } else if (mask - counter <= reserveCount - 1) { + } + else if (mask - counter <= reserveCount - 1) { pause(true); if (mask - counter < reserveCount - 1) { return false; @@ -72,7 +74,7 @@ bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, u } *nonce = (nonce[0] & ~mask) | counter; if (mask > 0xFFFFFFFFULL) { - nonce[1] = (nonce[1] & (~mask >> 32)) | (counter >> 32); + nonce[1] = (nonce[1] & (~mask >> 32)) | (counter >> 32); } return true; } From 8d1168385a512ace7128942826cb203633bcc202 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Tue, 15 Sep 2020 20:48:27 +0200 Subject: [PATCH 08/13] RandomX: returned old soft AES impl and auto-select between the two --- src/crypto/randomx/aes_hash.cpp | 13 +-- src/crypto/randomx/aes_hash.hpp | 8 +- src/crypto/randomx/jit_compiler_x86.cpp | 98 ++++++++++----------- src/crypto/randomx/soft_aes.cpp | 43 +++++++++ src/crypto/randomx/soft_aes.h | 53 +++++++++-- src/crypto/randomx/virtual_machine.cpp | 26 ++++-- src/crypto/randomx/virtual_machine.hpp | 2 +- src/crypto/randomx/vm_compiled.cpp | 6 +- src/crypto/randomx/vm_compiled.hpp | 6 +- src/crypto/randomx/vm_compiled_light.cpp | 4 +- src/crypto/randomx/vm_compiled_light.hpp | 6 +- src/crypto/randomx/vm_interpreted.cpp | 10 +-- src/crypto/randomx/vm_interpreted.hpp | 6 +- src/crypto/randomx/vm_interpreted_light.cpp | 4 +- src/crypto/randomx/vm_interpreted_light.hpp | 6 +- src/crypto/rx/Rx.cpp | 4 + 16 files changed, 194 insertions(+), 101 deletions(-) diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 571b4ca73..7c4b0c818 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Hashing throughput: >20 GiB/s per CPU core with hardware AES */ -template +template void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { const uint8_t* inptr = (uint8_t*)input; const uint8_t* inputEnd = inptr + inputSize; @@ -118,7 +118,7 @@ template void hashAes1Rx4(const void *input, size_t inputSize, void *hash) The modified state is written back to 'state' to allow multiple calls to this function. */ -template +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { const uint8_t* outptr = (uint8_t*)buffer; const uint8_t* outputEnd = outptr + outputSize; @@ -159,7 +159,7 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); -template +template void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { const uint8_t* outptr = (uint8_t*)buffer; const uint8_t* outputEnd = outptr + outputSize; @@ -214,7 +214,7 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); -template +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { PROFILE_SCOPE(RandomX_AES); @@ -311,5 +311,6 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); } -template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); -template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4<0>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4<1>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4<2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); diff --git a/src/crypto/randomx/aes_hash.hpp b/src/crypto/randomx/aes_hash.hpp index 9f75f73ae..345ec8d99 100644 --- a/src/crypto/randomx/aes_hash.hpp +++ b/src/crypto/randomx/aes_hash.hpp @@ -30,14 +30,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -template +template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); -template +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); -template +template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); -template +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 09746b901..2eff9ab90 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -343,7 +343,6 @@ namespace randomx { r[j] = k; } - constexpr uint64_t instr_mask = (uint64_t(-1) - (0xFFFF << 8)) | ((RegistersCount - 1) << 8) | ((RegistersCount - 1) << 16); for (int i = 0, n = static_cast(RandomX_CurrentConfig.ProgramSize); i < n; i += 4) { Instruction& instr1 = prog(i); Instruction& instr2 = prog(i + 1); @@ -355,16 +354,9 @@ namespace randomx { InstructionGeneratorX86 gen3 = engine[instr3.opcode]; InstructionGeneratorX86 gen4 = engine[instr4.opcode]; - *((uint64_t*)&instr1) &= instr_mask; (this->*gen1)(instr1); - - *((uint64_t*)&instr2) &= instr_mask; (this->*gen2)(instr2); - - *((uint64_t*)&instr3) &= instr_mask; (this->*gen3)(instr3); - - *((uint64_t*)&instr4) &= instr_mask; (this->*gen4)(instr4); } @@ -518,7 +510,7 @@ namespace randomx { template void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos); FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, uint32_t& codePos) { - const uint32_t dst = static_cast(instr.dst) << 16; + const uint32_t dst = static_cast(instr.dst % RegistersCount) << 16; *(uint32_t*)(code + codePos) = 0x24808d41 + dst; codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3; @@ -540,8 +532,8 @@ namespace randomx { uint32_t pos = codePos; uint8_t* const p = code + pos; - const uint32_t dst = instr.dst; - const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | dst; + const uint32_t dst = instr.dst % RegistersCount; + const uint32_t sib = (instr.getModShift() << 6) | ((instr.src % RegistersCount) << 3) | dst; uint32_t k = 0x048d4f + (dst << 19); if (dst == RegisterNeedsDisplacement) @@ -560,8 +552,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -585,8 +577,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; if (src != dst) { *(uint32_t*)(p + pos) = 0xc02b4d + (dst << 19) + (src << 16); @@ -606,8 +598,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -627,8 +619,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; if (src != dst) { emit32(0xc0af0f4d + ((dst * 8 + src) << 24), p, pos); @@ -647,8 +639,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -668,8 +660,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; *(uint32_t*)(p + pos) = 0xc08b49 + (dst << 16); *(uint32_t*)(p + pos + 3) = 0xe0f749 + (src << 16); @@ -684,8 +676,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; *(uint32_t*)(p + pos) = 0xC4D08B49 + (dst << 16); *(uint32_t*)(p + pos + 4) = 0xC0F6FB42 + (dst << 27) + (src << 24); @@ -699,8 +691,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -723,8 +715,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -746,8 +738,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; *(uint64_t*)(p + pos) = 0x8b4ce8f749c08b49ull + (dst << 16) + (src << 40); pos += 8; @@ -761,8 +753,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -792,7 +784,7 @@ namespace randomx { emit64(randomx_reciprocal_fast(divisor), p, pos); - const uint32_t dst = instr.dst; + const uint32_t dst = instr.dst % RegistersCount; emit32(0xc0af0f4c + (dst << 27), p, pos); registerUsage[dst] = pos; @@ -805,7 +797,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t dst = instr.dst; + const uint32_t dst = instr.dst % RegistersCount; *(uint32_t*)(p + pos) = 0xd8f749 + (dst << 16); pos += 3; @@ -817,8 +809,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { *(uint32_t*)(p + pos) = 0xc0334d + (((dst << 3) + src) << 16); @@ -838,8 +830,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { genAddressReg(instr, src, p, pos); @@ -859,8 +851,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { *(uint64_t*)(p + pos) = 0xc8d349c88b41ull + (src << 16) + (dst << 40); @@ -880,8 +872,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; - const uint64_t dst = instr.dst; + const uint64_t src = instr.src % RegistersCount; + const uint64_t dst = instr.dst % RegistersCount; if (src != dst) { *(uint64_t*)(p + pos) = 0xc0d349c88b41ull + (src << 16) + (dst << 40); @@ -901,8 +893,8 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; - const uint32_t dst = instr.dst; + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; if (src != dst) { *(uint32_t*)(p + pos) = 0xc0874d + (((dst << 3) + src) << 16); @@ -918,7 +910,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t dst = instr.dst; + const uint64_t dst = instr.dst % RegistersCount; *(uint64_t*)(p + pos) = 0x01c0c60f66ull + (((dst << 3) + dst) << 24); pos += 5; @@ -943,7 +935,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; + const uint32_t src = instr.src % RegistersCount; const uint32_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, src, p, pos); @@ -971,7 +963,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; + const uint32_t src = instr.src % RegistersCount; const uint32_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, src, p, pos); @@ -1010,7 +1002,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; + const uint32_t src = instr.src % RegistersCount; const uint64_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, src, p, pos); @@ -1046,7 +1038,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint32_t src = instr.src; + const uint32_t src = instr.src % RegistersCount; *(uint32_t*)(p + pos) = 0x00C08B49 + (src << 16); const int rotate = (static_cast(instr.getImm32() & 63) - 2) & 63; @@ -1070,7 +1062,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const uint64_t src = instr.src; + const uint64_t src = instr.src % RegistersCount; const uint64_t rotate = (static_cast(instr.getImm32() & 63) - 2) & 63; *(uint64_t*)(p + pos) = 0xC0F0FBC3C4ULL | (src << 32) | (rotate << 40); @@ -1093,7 +1085,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - const int reg = instr.dst; + const int reg = instr.dst % RegistersCount; int32_t jmp_offset = registerUsage[reg] - (pos + 16); if (BranchesWithin32B) { @@ -1140,7 +1132,7 @@ namespace randomx { uint32_t pos = codePos; genAddressRegDst(instr, p, pos); - emit32(0x0604894c + (static_cast(instr.src) << 19), p, pos); + emit32(0x0604894c + (static_cast(instr.src % RegistersCount) << 19), p, pos); codePos = pos; } diff --git a/src/crypto/randomx/soft_aes.cpp b/src/crypto/randomx/soft_aes.cpp index 04fb7ac0e..a205398c8 100644 --- a/src/crypto/randomx/soft_aes.cpp +++ b/src/crypto/randomx/soft_aes.cpp @@ -28,6 +28,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "crypto/randomx/soft_aes.h" +#include "crypto/randomx/aes_hash.hpp" +#include "base/tools/Chrono.h" +#include alignas(64) uint32_t lutEnc0[256]; alignas(64) uint32_t lutEnc1[256]; @@ -117,3 +120,43 @@ static struct SAESInitializer } } } aes_initializer; + +static uint32_t softAESImpl = 1; + +uint32_t GetSoftAESImpl() +{ + return softAESImpl; +} + +void SelectSoftAESImpl() +{ + constexpr int test_length_ms = 100; + double speed[2]; + + for (int i = 0; i < 2; ++i) + { + std::vector scratchpad(10 * 1024); + uint8_t hash[64] = {}; + uint8_t state[64] = {}; + + uint64_t t1, t2; + + uint32_t count = 0; + t1 = xmrig::Chrono::highResolutionMSecs(); + do { + if (i == 0) { + hashAndFillAes1Rx4<1>(scratchpad.data(), scratchpad.size(), hash, state); + } + else { + hashAndFillAes1Rx4<2>(scratchpad.data(), scratchpad.size(), hash, state); + } + ++count; + + t2 = xmrig::Chrono::highResolutionMSecs(); + } while (t2 - t1 < test_length_ms); + + speed[i] = count * 1e3 / (t2 - t1); + } + + softAESImpl = (speed[0] > speed[1]) ? 1 : 2; +} diff --git a/src/crypto/randomx/soft_aes.h b/src/crypto/randomx/soft_aes.h index f4142aae2..d03a1a279 100644 --- a/src/crypto/randomx/soft_aes.h +++ b/src/crypto/randomx/soft_aes.h @@ -41,11 +41,14 @@ extern uint32_t lutDec1[256]; extern uint32_t lutDec2[256]; extern uint32_t lutDec3[256]; -template rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key); -template rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key); +uint32_t GetSoftAESImpl(); +void SelectSoftAESImpl(); + +template rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key); +template rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key); template<> -FORCE_INLINE rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { +FORCE_INLINE rx_vec_i128 aesenc<1>(rx_vec_i128 in, rx_vec_i128 key) { volatile uint8_t s[16]; memcpy((void*) s, &in, 16); @@ -73,7 +76,7 @@ FORCE_INLINE rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { } template<> -FORCE_INLINE rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { +FORCE_INLINE rx_vec_i128 aesdec<1>(rx_vec_i128 in, rx_vec_i128 key) { volatile uint8_t s[16]; memcpy((void*) s, &in, 16); @@ -101,11 +104,49 @@ FORCE_INLINE rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { } template<> -FORCE_INLINE rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { +FORCE_INLINE rx_vec_i128 aesenc<2>(rx_vec_i128 in, rx_vec_i128 key) { + uint32_t s0, s1, s2, s3; + + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); + + rx_vec_i128 out = rx_set_int_vec_i128( + (lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]), + (lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]), + (lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]), + (lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24]) + ); + + return rx_xor_vec_i128(out, key); +} + +template<> +FORCE_INLINE rx_vec_i128 aesdec<2>(rx_vec_i128 in, rx_vec_i128 key) { + uint32_t s0, s1, s2, s3; + + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); + + rx_vec_i128 out = rx_set_int_vec_i128( + (lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]), + (lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]), + (lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]), + (lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24]) + ); + + return rx_xor_vec_i128(out, key); +} + +template<> +FORCE_INLINE rx_vec_i128 aesenc<0>(rx_vec_i128 in, rx_vec_i128 key) { return rx_aesenc_vec_i128(in, key); } template<> -FORCE_INLINE rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { +FORCE_INLINE rx_vec_i128 aesdec<0>(rx_vec_i128 in, rx_vec_i128 key) { return rx_aesdec_vec_i128(in, key); } diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp index f00213a30..3a2d675c4 100644 --- a/src/crypto/randomx/virtual_machine.cpp +++ b/src/crypto/randomx/virtual_machine.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/blake2/blake2.h" #include "crypto/randomx/intrin_portable.h" #include "crypto/randomx/allocator.hpp" +#include "crypto/randomx/soft_aes.h" #include "base/tools/Profiler.h" randomx_vm::~randomx_vm() { @@ -96,11 +97,11 @@ void randomx_vm::initialize() { namespace randomx { - template + template VmBase::~VmBase() { } - template + template void VmBase::setScratchpad(uint8_t *scratchpad) { if (datasetPtr == nullptr) { throw std::invalid_argument("Cache/Dataset not set"); @@ -109,24 +110,35 @@ namespace randomx { this->scratchpad = scratchpad; } - template + template void VmBase::getFinalResult(void* out) { hashAes1Rx4(scratchpad, ScratchpadSize, ®.a); rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); } - template + template void VmBase::hashAndFill(void* out, uint64_t (&fill_state)[8]) { - hashAndFillAes1Rx4(scratchpad, ScratchpadSize, ®.a, fill_state); + if (!softAes) { + hashAndFillAes1Rx4<0>(scratchpad, ScratchpadSize, ®.a, fill_state); + } + else { + if (GetSoftAESImpl() == 1) { + hashAndFillAes1Rx4<1>(scratchpad, ScratchpadSize, ®.a, fill_state); + } + else { + hashAndFillAes1Rx4<2>(scratchpad, ScratchpadSize, ®.a, fill_state); + } + } + rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); } - template + template void VmBase::initScratchpad(void* seed) { fillAes1Rx4(seed, ScratchpadSize, scratchpad); } - template + template void VmBase::generateProgram(void* seed) { PROFILE_SCOPE(RandomX_generate_program); fillAes4Rx4(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program); diff --git a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp index a60e693ae..8d44a7f52 100644 --- a/src/crypto/randomx/virtual_machine.hpp +++ b/src/crypto/randomx/virtual_machine.hpp @@ -79,7 +79,7 @@ protected: namespace randomx { - template + template class VmBase : public randomx_vm { public: diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp index a61797e85..c32034b38 100644 --- a/src/crypto/randomx/vm_compiled.cpp +++ b/src/crypto/randomx/vm_compiled.cpp @@ -35,12 +35,12 @@ namespace randomx { static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct randomx::MemoryRegisters"); static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct randomx::RegisterFile"); - template + template void CompiledVm::setDataset(randomx_dataset* dataset) { datasetPtr = dataset; } - template + template void CompiledVm::run(void* seed) { PROFILE_SCOPE(RandomX_run); @@ -52,7 +52,7 @@ namespace randomx { execute(); } - template + template void CompiledVm::execute() { PROFILE_SCOPE(RandomX_JIT_execute); diff --git a/src/crypto/randomx/vm_compiled.hpp b/src/crypto/randomx/vm_compiled.hpp index 22c269068..0e9c4eb8e 100644 --- a/src/crypto/randomx/vm_compiled.hpp +++ b/src/crypto/randomx/vm_compiled.hpp @@ -37,7 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template class CompiledVm : public VmBase { public: @@ -61,6 +61,6 @@ namespace randomx { JitCompiler compiler; }; - using CompiledVmDefault = CompiledVm; - using CompiledVmHardAes = CompiledVm; + using CompiledVmDefault = CompiledVm<1>; + using CompiledVmHardAes = CompiledVm<0>; } diff --git a/src/crypto/randomx/vm_compiled_light.cpp b/src/crypto/randomx/vm_compiled_light.cpp index 02115cefd..d4f6fe50b 100644 --- a/src/crypto/randomx/vm_compiled_light.cpp +++ b/src/crypto/randomx/vm_compiled_light.cpp @@ -32,14 +32,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template void CompiledLightVm::setCache(randomx_cache* cache) { cachePtr = cache; mem.memory = cache->memory; compiler.generateSuperscalarHash(cache->programs, cache->reciprocalCache); } - template + template void CompiledLightVm::run(void* seed) { VmBase::generateProgram(seed); randomx_vm::initialize(); diff --git a/src/crypto/randomx/vm_compiled_light.hpp b/src/crypto/randomx/vm_compiled_light.hpp index 6d11d60ad..8139a5442 100644 --- a/src/crypto/randomx/vm_compiled_light.hpp +++ b/src/crypto/randomx/vm_compiled_light.hpp @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template class CompiledLightVm : public CompiledVm { public: @@ -52,6 +52,6 @@ namespace randomx { using CompiledVm::datasetOffset; }; - using CompiledLightVmDefault = CompiledLightVm; - using CompiledLightVmHardAes = CompiledLightVm; + using CompiledLightVmDefault = CompiledLightVm<1>; + using CompiledLightVmHardAes = CompiledLightVm<0>; } diff --git a/src/crypto/randomx/vm_interpreted.cpp b/src/crypto/randomx/vm_interpreted.cpp index e21ecfe69..840ea7688 100644 --- a/src/crypto/randomx/vm_interpreted.cpp +++ b/src/crypto/randomx/vm_interpreted.cpp @@ -33,20 +33,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template void InterpretedVm::setDataset(randomx_dataset* dataset) { datasetPtr = dataset; mem.memory = dataset->memory; } - template + template void InterpretedVm::run(void* seed) { VmBase::generateProgram(seed); randomx_vm::initialize(); execute(); } - template + template void InterpretedVm::execute() { NativeRegisterFile nreg; @@ -106,14 +106,14 @@ namespace randomx { rx_store_vec_f128(®.e[i].lo, nreg.e[i]); } - template + template void InterpretedVm::datasetRead(uint64_t address, int_reg_t(&r)[RegistersCount]) { uint64_t* datasetLine = (uint64_t*)(mem.memory + address); for (int i = 0; i < RegistersCount; ++i) r[i] ^= datasetLine[i]; } - template + template void InterpretedVm::datasetPrefetch(uint64_t address) { rx_prefetch_nta(mem.memory + address); } diff --git a/src/crypto/randomx/vm_interpreted.hpp b/src/crypto/randomx/vm_interpreted.hpp index d928de749..452ef17f5 100644 --- a/src/crypto/randomx/vm_interpreted.hpp +++ b/src/crypto/randomx/vm_interpreted.hpp @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template class InterpretedVm : public VmBase, public BytecodeMachine { public: using VmBase::mem; @@ -65,6 +65,6 @@ namespace randomx { InstructionByteCode bytecode[RANDOMX_PROGRAM_MAX_SIZE]; }; - using InterpretedVmDefault = InterpretedVm; - using InterpretedVmHardAes = InterpretedVm; + using InterpretedVmDefault = InterpretedVm<1>; + using InterpretedVmHardAes = InterpretedVm<0>; } diff --git a/src/crypto/randomx/vm_interpreted_light.cpp b/src/crypto/randomx/vm_interpreted_light.cpp index bed6f35bc..589c1211d 100644 --- a/src/crypto/randomx/vm_interpreted_light.cpp +++ b/src/crypto/randomx/vm_interpreted_light.cpp @@ -31,13 +31,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template void InterpretedLightVm::setCache(randomx_cache* cache) { cachePtr = cache; mem.memory = cache->memory; } - template + template void InterpretedLightVm::datasetRead(uint64_t address, int_reg_t(&r)[8]) { uint32_t itemNumber = address / CacheLineSize; int_reg_t rl[8]; diff --git a/src/crypto/randomx/vm_interpreted_light.hpp b/src/crypto/randomx/vm_interpreted_light.hpp index bec7978b1..8a1b5a364 100644 --- a/src/crypto/randomx/vm_interpreted_light.hpp +++ b/src/crypto/randomx/vm_interpreted_light.hpp @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { - template + template class InterpretedLightVm : public InterpretedVm { public: using VmBase::mem; @@ -50,6 +50,6 @@ namespace randomx { void datasetPrefetch(uint64_t address) override { } }; - using InterpretedLightVmDefault = InterpretedLightVm; - using InterpretedLightVmHardAes = InterpretedLightVm; + using InterpretedLightVmDefault = InterpretedLightVm<1>; + using InterpretedLightVmHardAes = InterpretedLightVm<0>; } diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp index a6a1f5c95..79354d7e8 100644 --- a/src/crypto/rx/Rx.cpp +++ b/src/crypto/rx/Rx.cpp @@ -33,6 +33,7 @@ #include "crypto/rx/RxConfig.h" #include "crypto/rx/RxQueue.h" #include "crypto/randomx/randomx.h" +#include "crypto/randomx/soft_aes.h" namespace xmrig { @@ -113,6 +114,9 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu if (!osInitialized) { setupMainLoopExceptionFrame(); + if (!cpu.isHwAES()) { + SelectSoftAESImpl(); + } osInitialized = true; } From c7476e076b0e15e0d025f140168f17ca34229943 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 18 Sep 2020 20:51:25 +0200 Subject: [PATCH 09/13] RandomX refactoring, moved more stuff to compile time Small x86 JIT compiler speedup. --- src/backend/cpu/interfaces/ICpuInfo.h | 1 + src/backend/cpu/platform/BasicCpuInfo.cpp | 31 +++++++ src/backend/cpu/platform/BasicCpuInfo.h | 2 + src/crypto/randomx/bytecode_machine.cpp | 100 ++++++++++++++-------- src/crypto/randomx/bytecode_machine.hpp | 2 +- src/crypto/randomx/common.hpp | 4 +- src/crypto/randomx/jit_compiler_a64.cpp | 8 +- src/crypto/randomx/jit_compiler_x86.cpp | 62 +++----------- src/crypto/randomx/jit_compiler_x86.hpp | 5 +- src/crypto/randomx/randomx.cpp | 34 +++----- src/crypto/randomx/randomx.h | 60 ++++--------- 11 files changed, 149 insertions(+), 160 deletions(-) diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index 20fb62958..ffab2d7d0 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h +++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -98,6 +98,7 @@ public: virtual size_t packages() const = 0; virtual size_t threads() const = 0; virtual Vendor vendor() const = 0; + virtual bool jccErratum() const = 0; }; diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index cc03646b5..dac1a4fad 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -212,6 +212,37 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : m_vendor = VENDOR_INTEL; m_assembly = Assembly::INTEL; m_msrMod = MSR_MOD_INTEL; + + struct + { + unsigned int stepping : 4; + unsigned int model : 4; + unsigned int family : 4; + unsigned int processor_type : 2; + unsigned int reserved1 : 2; + unsigned int ext_model : 4; + unsigned int ext_family : 8; + unsigned int reserved2 : 4; + } processor_info; + + cpuid(1, data); + memcpy(&processor_info, data, sizeof(processor_info)); + + // Intel JCC erratum mitigation + if (processor_info.family == 6) { + const uint32_t model = processor_info.model | (processor_info.ext_model << 4); + const uint32_t stepping = processor_info.stepping; + + // Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + m_jccErratum = + ((model == 0x4E) && (stepping == 0x3)) || + ((model == 0x55) && (stepping == 0x4)) || + ((model == 0x5E) && (stepping == 0x3)) || + ((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) || + ((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) || + ((model == 0xA6) && (stepping == 0x0)) || + ((model == 0xAE) && (stepping == 0xA)); + } } } # endif diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h index 05e5f442a..e3e184bb8 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.h +++ b/src/backend/cpu/platform/BasicCpuInfo.h @@ -61,11 +61,13 @@ protected: inline size_t packages() const override { return 1; } inline size_t threads() const override { return m_threads; } inline Vendor vendor() const override { return m_vendor; } + inline bool jccErratum() const override { return m_jccErratum; } protected: char m_brand[64 + 6]{}; size_t m_threads; Vendor m_vendor = VENDOR_UNKNOWN; + bool m_jccErratum = false; private: Assembly m_assembly = Assembly::NONE; diff --git a/src/crypto/randomx/bytecode_machine.cpp b/src/crypto/randomx/bytecode_machine.cpp index f0b95c30e..c1ef3a0e5 100644 --- a/src/crypto/randomx/bytecode_machine.cpp +++ b/src/crypto/randomx/bytecode_machine.cpp @@ -79,9 +79,9 @@ namespace randomx { } void BytecodeMachine::compileInstruction(RANDOMX_GEN_ARGS) { - int opcode = instr.opcode; + uint32_t opcode = instr.opcode; - if (opcode < RandomX_CurrentConfig.CEIL_IADD_RS) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IADD_RS) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IADD_RS; @@ -99,8 +99,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IADD_RS; - if (opcode < RandomX_CurrentConfig.CEIL_IADD_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IADD_M) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IADD_M; @@ -117,8 +118,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IADD_M; - if (opcode < RandomX_CurrentConfig.CEIL_ISUB_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_ISUB_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::ISUB_R; @@ -133,8 +135,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_ISUB_R; - if (opcode < RandomX_CurrentConfig.CEIL_ISUB_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_ISUB_M) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::ISUB_M; @@ -151,8 +154,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_ISUB_M; - if (opcode < RandomX_CurrentConfig.CEIL_IMUL_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IMUL_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IMUL_R; @@ -167,8 +171,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IMUL_R; - if (opcode < RandomX_CurrentConfig.CEIL_IMUL_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IMUL_M) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IMUL_M; @@ -185,8 +190,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IMUL_M; - if (opcode < RandomX_CurrentConfig.CEIL_IMULH_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IMULH_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IMULH_R; @@ -195,8 +201,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IMULH_R; - if (opcode < RandomX_CurrentConfig.CEIL_IMULH_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IMULH_M) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IMULH_M; @@ -213,8 +220,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IMULH_M; - if (opcode < RandomX_CurrentConfig.CEIL_ISMULH_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_ISMULH_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::ISMULH_R; @@ -223,8 +231,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_ISMULH_R; - if (opcode < RandomX_CurrentConfig.CEIL_ISMULH_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_ISMULH_M) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::ISMULH_M; @@ -241,8 +250,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_ISMULH_M; - if (opcode < RandomX_CurrentConfig.CEIL_IMUL_RCP) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IMUL_RCP) { uint64_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { auto dst = instr.dst % RegistersCount; @@ -257,16 +267,18 @@ namespace randomx { } return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IMUL_RCP; - if (opcode < RandomX_CurrentConfig.CEIL_INEG_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_INEG_R) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::INEG_R; ibc.idst = &nreg->r[dst]; registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_INEG_R; - if (opcode < RandomX_CurrentConfig.CEIL_IXOR_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IXOR_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IXOR_R; @@ -281,8 +293,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IXOR_R; - if (opcode < RandomX_CurrentConfig.CEIL_IXOR_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IXOR_M) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IXOR_M; @@ -299,8 +312,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IXOR_M; - if (opcode < RandomX_CurrentConfig.CEIL_IROR_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IROR_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IROR_R; @@ -315,8 +329,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IROR_R; - if (opcode < RandomX_CurrentConfig.CEIL_IROL_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_IROL_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IROL_R; @@ -331,8 +346,9 @@ namespace randomx { registerUsage[dst] = i; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_IROL_R; - if (opcode < RandomX_CurrentConfig.CEIL_ISWAP_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_ISWAP_R) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; if (src != dst) { @@ -347,8 +363,9 @@ namespace randomx { } return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_ISWAP_R; - if (opcode < RandomX_CurrentConfig.CEIL_FSWAP_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FSWAP_R) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::FSWAP_R; if (dst < RegisterCountFlt) @@ -357,8 +374,9 @@ namespace randomx { ibc.fdst = &nreg->e[dst - RegisterCountFlt]; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FSWAP_R; - if (opcode < RandomX_CurrentConfig.CEIL_FADD_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FADD_R) { auto dst = instr.dst % RegisterCountFlt; auto src = instr.src % RegisterCountFlt; ibc.type = InstructionType::FADD_R; @@ -366,8 +384,9 @@ namespace randomx { ibc.fsrc = &nreg->a[src]; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FADD_R; - if (opcode < RandomX_CurrentConfig.CEIL_FADD_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FADD_M) { auto dst = instr.dst % RegisterCountFlt; auto src = instr.src % RegistersCount; ibc.type = InstructionType::FADD_M; @@ -377,8 +396,9 @@ namespace randomx { ibc.imm = signExtend2sCompl(instr.getImm32()); return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FADD_M; - if (opcode < RandomX_CurrentConfig.CEIL_FSUB_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FSUB_R) { auto dst = instr.dst % RegisterCountFlt; auto src = instr.src % RegisterCountFlt; ibc.type = InstructionType::FSUB_R; @@ -386,8 +406,9 @@ namespace randomx { ibc.fsrc = &nreg->a[src]; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FSUB_R; - if (opcode < RandomX_CurrentConfig.CEIL_FSUB_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FSUB_M) { auto dst = instr.dst % RegisterCountFlt; auto src = instr.src % RegistersCount; ibc.type = InstructionType::FSUB_M; @@ -397,15 +418,17 @@ namespace randomx { ibc.imm = signExtend2sCompl(instr.getImm32()); return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FSUB_M; - if (opcode < RandomX_CurrentConfig.CEIL_FSCAL_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FSCAL_R) { auto dst = instr.dst % RegisterCountFlt; ibc.fdst = &nreg->f[dst]; ibc.type = InstructionType::FSCAL_R; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FSCAL_R; - if (opcode < RandomX_CurrentConfig.CEIL_FMUL_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FMUL_R) { auto dst = instr.dst % RegisterCountFlt; auto src = instr.src % RegisterCountFlt; ibc.type = InstructionType::FMUL_R; @@ -413,8 +436,9 @@ namespace randomx { ibc.fsrc = &nreg->a[src]; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FMUL_R; - if (opcode < RandomX_CurrentConfig.CEIL_FDIV_M) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FDIV_M) { auto dst = instr.dst % RegisterCountFlt; auto src = instr.src % RegistersCount; ibc.type = InstructionType::FDIV_M; @@ -424,41 +448,44 @@ namespace randomx { ibc.imm = signExtend2sCompl(instr.getImm32()); return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FDIV_M; - if (opcode < RandomX_CurrentConfig.CEIL_FSQRT_R) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_FSQRT_R) { auto dst = instr.dst % RegisterCountFlt; ibc.type = InstructionType::FSQRT_R; ibc.fdst = &nreg->e[dst]; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_FSQRT_R; - if (opcode < RandomX_CurrentConfig.CEIL_CBRANCH) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_CBRANCH) { ibc.type = InstructionType::CBRANCH; //jump condition int creg = instr.dst % RegistersCount; ibc.idst = &nreg->r[creg]; ibc.target = registerUsage[creg]; - int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; - ibc.imm = signExtend2sCompl(instr.getImm32()) | (1ULL << shift); - if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0) //clear the bit below the condition mask - this limits the number of successive jumps to 2 - ibc.imm &= ~(1ULL << (shift - 1)); - ibc.memMask = RandomX_CurrentConfig.ConditionMask_Calculated << shift; + const int shift = instr.getModCond(); + ibc.imm = signExtend2sCompl(instr.getImm32()) | ((1ULL << RandomX_ConfigurationBase::JumpOffset) << shift); + ibc.imm &= ~((1ULL << (RandomX_ConfigurationBase::JumpOffset - 1)) << shift); + ibc.memMask = RandomX_ConfigurationBase::ConditionMask_Calculated << shift; //mark all registers as used for (unsigned j = 0; j < RegistersCount; ++j) { registerUsage[j] = i; } return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_CBRANCH; - if (opcode < RandomX_CurrentConfig.CEIL_CFROUND) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_CFROUND) { auto src = instr.src % RegistersCount; ibc.isrc = &nreg->r[src]; ibc.type = InstructionType::CFROUND; ibc.imm = instr.getImm32() & 63; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_CFROUND; - if (opcode < RandomX_CurrentConfig.CEIL_ISTORE) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_ISTORE) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::ISTORE; @@ -471,8 +498,9 @@ namespace randomx { ibc.memMask = ScratchpadL3Mask; return; } + opcode -= RandomX_CurrentConfig.RANDOMX_FREQ_ISTORE; - if (opcode < RandomX_CurrentConfig.CEIL_NOP) { + if (opcode < RandomX_CurrentConfig.RANDOMX_FREQ_NOP) { ibc.type = InstructionType::NOP; return; } diff --git a/src/crypto/randomx/bytecode_machine.hpp b/src/crypto/randomx/bytecode_machine.hpp index 8aee78d89..8852f4d68 100644 --- a/src/crypto/randomx/bytecode_machine.hpp +++ b/src/crypto/randomx/bytecode_machine.hpp @@ -225,7 +225,7 @@ namespace randomx { } static void exe_CFROUND(RANDOMX_EXE_ARGS) { - rx_set_rounding_mode(rotr64(*ibc.isrc, ibc.imm) % 4); + rx_set_rounding_mode(rotr64(*ibc.isrc, static_cast(ibc.imm)) % 4); } static void exe_ISTORE(RANDOMX_EXE_ARGS) { diff --git a/src/crypto/randomx/common.hpp b/src/crypto/randomx/common.hpp index 34c8477c3..aefbad032 100644 --- a/src/crypto/randomx/common.hpp +++ b/src/crypto/randomx/common.hpp @@ -74,8 +74,8 @@ namespace randomx { constexpr int SuperscalarMaxSize = 3 * RANDOMX_SUPERSCALAR_MAX_LATENCY + 2; constexpr size_t CacheLineSize = RANDOMX_DATASET_ITEM_SIZE; #define ScratchpadSize RandomX_CurrentConfig.ScratchpadL3_Size - #define CacheLineAlignMask RandomX_CurrentConfig.CacheLineAlignMask_Calculated - #define DatasetExtraItems RandomX_CurrentConfig.DatasetExtraItems_Calculated + #define CacheLineAlignMask RandomX_ConfigurationBase::CacheLineAlignMask_Calculated + #define DatasetExtraItems RandomX_ConfigurationBase::DatasetExtraItems_Calculated constexpr int StoreL3Condition = 14; //Prevent some unsafe configurations. diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index d291de4d3..7a601c5b3 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -75,11 +75,11 @@ static size_t CalcDatasetItemSize() // Prologue ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch - (uint8_t*)randomx_calc_dataset_item_aarch64) + // Main loop - RandomX_CurrentConfig.CacheAccesses * ( + RandomX_ConfigurationBase::CacheAccesses * ( // Main loop prologue ((uint8_t*)randomx_calc_dataset_item_aarch64_mix - ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch)) + 4 + // Inner main loop (instructions) - ((RandomX_CurrentConfig.SuperscalarLatency * 3) + 2) * 16 + + ((RandomX_ConfigurationBase::SuperscalarLatency * 3) + 2) * 16 + // Main loop epilogue ((uint8_t*)randomx_calc_dataset_item_aarch64_store_result - (uint8_t*)randomx_calc_dataset_item_aarch64_mix) + 4 ) + @@ -235,7 +235,7 @@ void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N], s num32bitLiterals = 64; constexpr uint32_t tmp_reg = 12; - for (size_t i = 0; i < RandomX_CurrentConfig.CacheAccesses; ++i) + for (size_t i = 0; i < RandomX_ConfigurationBase::CacheAccesses; ++i) { // and x11, x10, CacheSize / CacheLineSize - 1 emit32(0x92400000 | 11 | (10 << 5) | ((RandomX_CurrentConfig.Log2_CacheSize - 1) << 10), code, codePos); @@ -946,7 +946,7 @@ void JitCompilerA64::h_CBRANCH(Instruction& instr, uint32_t& codePos) const uint32_t dst = IntRegMap[instr.dst]; const uint32_t modCond = instr.getModCond(); - const uint32_t shift = modCond + RandomX_CurrentConfig.JumpOffset; + const uint32_t shift = modCond + RandomX_ConfigurationBase::JumpOffset; const uint32_t imm = (instr.getImm32() | (1U << shift)) & ~(1U << (shift - 1)); emitAddImmediate(dst, dst, imm, code, k); diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 2eff9ab90..8edf5a720 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/virtual_memory.hpp" #include "base/tools/Profiler.h" +#include "backend/cpu/Cpu.h" #ifdef XMRIG_FIX_RYZEN # include "crypto/rx/Rx.h" @@ -167,55 +168,10 @@ namespace randomx { # endif } - // CPU-specific tweaks - void JitCompilerX86::applyTweaks() { - int32_t info[4]; - cpuid(0, info); - - int32_t manufacturer[4]; - manufacturer[0] = info[1]; - manufacturer[1] = info[3]; - manufacturer[2] = info[2]; - manufacturer[3] = 0; - - if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) { - struct - { - unsigned int stepping : 4; - unsigned int model : 4; - unsigned int family : 4; - unsigned int processor_type : 2; - unsigned int reserved1 : 2; - unsigned int ext_model : 4; - unsigned int ext_family : 8; - unsigned int reserved2 : 4; - } processor_info; - - cpuid(1, info); - memcpy(&processor_info, info, sizeof(processor_info)); - - // Intel JCC erratum mitigation - if (processor_info.family == 6) { - const uint32_t model = processor_info.model | (processor_info.ext_model << 4); - const uint32_t stepping = processor_info.stepping; - - // Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf - BranchesWithin32B = - ((model == 0x4E) && (stepping == 0x3)) || - ((model == 0x55) && (stepping == 0x4)) || - ((model == 0x5E) && (stepping == 0x3)) || - ((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) || - ((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) || - ((model == 0xA6) && (stepping == 0x0)) || - ((model == 0xAE) && (stepping == 0xA)); - } - } - } - static std::atomic codeOffset; JitCompilerX86::JitCompilerX86() { - applyTweaks(); + BranchesWithin32B = xmrig::Cpu::info()->jccErratum(); int32_t info[4]; cpuid(1, info); @@ -1081,6 +1037,7 @@ namespace randomx { codePos = pos; } + template void JitCompilerX86::h_CBRANCH(const Instruction& instr) { uint8_t* const p = code; uint32_t pos = codePos; @@ -1088,7 +1045,7 @@ namespace randomx { const int reg = instr.dst % RegistersCount; int32_t jmp_offset = registerUsage[reg] - (pos + 16); - if (BranchesWithin32B) { + if (jccErratum) { const uint32_t branch_begin = static_cast(pos + 7); const uint32_t branch_end = static_cast(branch_begin + ((jmp_offset >= -128) ? 9 : 13)); @@ -1101,10 +1058,12 @@ namespace randomx { } *(uint32_t*)(p + pos) = 0x00c08149 + (reg << 16); - const int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; - *(uint32_t*)(p + pos + 3) = (instr.getImm32() | (1UL << shift)) & ~(1UL << (shift - 1)); + const int shift = instr.getModCond(); + const uint32_t or_mask = (1UL << RandomX_ConfigurationBase::JumpOffset) << shift; + const uint32_t and_mask = ~((1UL << (RandomX_ConfigurationBase::JumpOffset - 1)) << shift); + *(uint32_t*)(p + pos + 3) = (instr.getImm32() | or_mask) & and_mask; *(uint32_t*)(p + pos + 7) = 0x00c0f749 + (reg << 16); - *(uint32_t*)(p + pos + 10) = RandomX_CurrentConfig.ConditionMask_Calculated << shift; + *(uint32_t*)(p + pos + 10) = RandomX_ConfigurationBase::ConditionMask_Calculated << shift; pos += 14; if (jmp_offset >= -128) { @@ -1127,6 +1086,9 @@ namespace randomx { codePos = pos; } + template void JitCompilerX86::h_CBRANCH(const Instruction&); + template void JitCompilerX86::h_CBRANCH(const Instruction&); + void JitCompilerX86::h_ISTORE(const Instruction& instr) { uint8_t* const p = code; uint32_t pos = codePos; diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index c8a60c1da..3a9163b5e 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -84,7 +84,6 @@ namespace randomx { uint8_t* allocatedCode; - void applyTweaks(); void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramEpilogue(Program&, ProgramConfiguration&); template @@ -148,11 +147,13 @@ namespace randomx { void h_FMUL_R(const Instruction&); void h_FDIV_M(const Instruction&); void h_FSQRT_R(const Instruction&); + + template void h_CBRANCH(const Instruction&); + void h_CFROUND(const Instruction&); void h_CFROUND_BMI2(const Instruction&); void h_ISTORE(const Instruction&); void h_NOP(const Instruction&); }; - } diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 89d319de2..5cfaddca3 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -111,22 +111,15 @@ RandomX_ConfigurationKeva::RandomX_ConfigurationKeva() } RandomX_ConfigurationBase::RandomX_ConfigurationBase() - : ArgonMemory(262144) - , ArgonIterations(3) + : ArgonIterations(3) , ArgonLanes(1) , ArgonSalt("RandomX\x03") - , CacheAccesses(8) - , SuperscalarLatency(170) - , DatasetBaseSize(2147483648) - , DatasetExtraSize(33554368) , ScratchpadL1_Size(16384) , ScratchpadL2_Size(262144) , ScratchpadL3_Size(2097152) , ProgramSize(256) , ProgramIterations(2048) , ProgramCount(8) - , JumpBits(8) - , JumpOffset(8) , RANDOMX_FREQ_IADD_RS(16) , RANDOMX_FREQ_IADD_M(7) , RANDOMX_FREQ_ISUB_R(16) @@ -233,11 +226,6 @@ void RandomX_ConfigurationBase::Apply() ScratchpadL3Mask_Calculated = (((ScratchpadL3_Size / sizeof(uint64_t)) - 1) * 8); ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64; - CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1); - DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE; - - ConditionMask_Calculated = (1 << JumpBits) - 1; - #if defined(_M_X64) || defined(__x86_64__) *(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1; // Not needed right now because all variants use default dataset base size @@ -295,16 +283,16 @@ void RandomX_ConfigurationBase::Apply() #define JIT_HANDLE(x, prev) #endif - constexpr int CEIL_NULL = 0; - int k = 0; + uint32_t k = 0; + uint32_t freq_sum = 0; #define INST_HANDLE(x, prev) \ - CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \ - for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); } + freq_sum += RANDOMX_FREQ_##x; \ + for (; k < freq_sum; ++k) { JIT_HANDLE(x, prev); } #define INST_HANDLE2(x, func_name, prev) \ - CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \ - for (; k < CEIL_##x; ++k) { JIT_HANDLE(func_name, prev); } + freq_sum += RANDOMX_FREQ_##x; \ + for (; k < freq_sum; ++k) { JIT_HANDLE(func_name, prev); } INST_HANDLE(IADD_RS, NULL); INST_HANDLE(IADD_M, IADD_RS); @@ -343,7 +331,13 @@ void RandomX_ConfigurationBase::Apply() INST_HANDLE(FMUL_R, FSCAL_R); INST_HANDLE(FDIV_M, FMUL_R); INST_HANDLE(FSQRT_R, FDIV_M); - INST_HANDLE(CBRANCH, FSQRT_R); + + if (xmrig::Cpu::info()->jccErratum()) { + INST_HANDLE2(CBRANCH, CBRANCH, FSQRT_R); + } + else { + INST_HANDLE2(CBRANCH, CBRANCH, FSQRT_R); + } #if defined(_M_X64) || defined(__x86_64__) if (xmrig::Cpu::info()->hasBMI2()) { diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 3379e2242..4f2804edd 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -64,15 +64,24 @@ struct RandomX_ConfigurationBase void Apply(); - uint32_t ArgonMemory; + // Common parameters for all RandomX variants + enum Params : uint64_t + { + ArgonMemory = 262144, + CacheAccesses = 8, + SuperscalarLatency = 170, + DatasetBaseSize = 2147483648, + DatasetExtraSize = 33554368, + JumpBits = 8, + JumpOffset = 8, + CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1), + DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE, + ConditionMask_Calculated = ((1 << JumpBits) - 1) << JumpOffset, + }; + uint32_t ArgonIterations; uint32_t ArgonLanes; const char* ArgonSalt; - uint32_t CacheAccesses; - uint32_t SuperscalarLatency; - - uint32_t DatasetBaseSize; - uint32_t DatasetExtraSize; uint32_t ScratchpadL1_Size; uint32_t ScratchpadL2_Size; @@ -82,9 +91,6 @@ struct RandomX_ConfigurationBase uint32_t ProgramIterations; uint32_t ProgramCount; - uint32_t JumpBits; - uint32_t JumpOffset; - uint32_t RANDOMX_FREQ_IADD_RS; uint32_t RANDOMX_FREQ_IADD_M; uint32_t RANDOMX_FREQ_ISUB_R; @@ -126,15 +132,10 @@ struct RandomX_ConfigurationBase uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codePrefetchScratchpadTweaked[32]; - uint32_t CacheLineAlignMask_Calculated; - uint32_t DatasetExtraItems_Calculated; - uint32_t AddressMask_Calculated[4]; uint32_t ScratchpadL3Mask_Calculated; uint32_t ScratchpadL3Mask64_Calculated; - uint32_t ConditionMask_Calculated; - #if defined(XMRIG_ARMv8) uint32_t Log2_ScratchpadL1; uint32_t Log2_ScratchpadL2; @@ -142,37 +143,6 @@ struct RandomX_ConfigurationBase uint32_t Log2_DatasetBaseSize; uint32_t Log2_CacheSize; #endif - - int CEIL_IADD_RS; - int CEIL_IADD_M; - int CEIL_ISUB_R; - int CEIL_ISUB_M; - int CEIL_IMUL_R; - int CEIL_IMUL_M; - int CEIL_IMULH_R; - int CEIL_IMULH_M; - int CEIL_ISMULH_R; - int CEIL_ISMULH_M; - int CEIL_IMUL_RCP; - int CEIL_INEG_R; - int CEIL_IXOR_R; - int CEIL_IXOR_M; - int CEIL_IROR_R; - int CEIL_IROL_R; - int CEIL_ISWAP_R; - int CEIL_FSWAP_R; - int CEIL_FADD_R; - int CEIL_FADD_M; - int CEIL_FSUB_R; - int CEIL_FSUB_M; - int CEIL_FSCAL_R; - int CEIL_FMUL_R; - int CEIL_FDIV_M; - int CEIL_FSQRT_R; - int CEIL_CBRANCH; - int CEIL_CFROUND; - int CEIL_ISTORE; - int CEIL_NOP; }; struct RandomX_ConfigurationMonero : public RandomX_ConfigurationBase {}; From 768a4581e09b6e6c898150cd5a8d3f2472651c50 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sat, 19 Sep 2020 23:12:05 +0200 Subject: [PATCH 10/13] Fixed Cryptonight OpenCL for AMD 20.7.2 drivers Vega 64 + Windows 10 + AMD 20.7.2 drivers were broken on Cryptonight algorithms. --- src/backend/opencl/cl/cn/cryptonight.cl | 2 +- src/backend/opencl/cl/cn/cryptonight_cl.h | 173 +++++++++++----------- 2 files changed, 88 insertions(+), 87 deletions(-) diff --git a/src/backend/opencl/cl/cn/cryptonight.cl b/src/backend/opencl/cl/cn/cryptonight.cl index 000737c53..8b69185b7 100644 --- a/src/backend/opencl/cl/cn/cryptonight.cl +++ b/src/backend/opencl/cl/cn/cryptonight.cl @@ -899,7 +899,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u ((uint8 *)h)[0] = vload8(0U, c_IV256); - for (uint i = 0; i < 3; ++i) { + for (volatile uint i = 0; i < 3; ++i) { ((uint16 *)m)[0] = vload16(i, (__global uint *)states); for (uint x = 0; x < 16; ++x) { m[x] = SWAP4(m[x]); diff --git a/src/backend/opencl/cl/cn/cryptonight_cl.h b/src/backend/opencl/cl/cn/cryptonight_cl.h index 52d9139b1..585c8c300 100644 --- a/src/backend/opencl/cl/cn/cryptonight_cl.h +++ b/src/backend/opencl/cl/cn/cryptonight_cl.h @@ -2,7 +2,7 @@ namespace xmrig { -static const char cryptonight_cl[60954] = { +static const char cryptonight_cl[60963] = { 0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x63,0x6c,0x5f,0x63,0x6c,0x61,0x6e,0x67,0x5f,0x73,0x74,0x6f,0x72,0x61,0x67,0x65,0x5f,0x63,0x6c,0x61,0x73,0x73,0x5f,0x73,0x70, 0x65,0x63,0x69,0x66,0x69,0x65,0x72,0x73,0x0a,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f, 0x4e,0x20,0x63,0x6c,0x5f,0x63,0x6c,0x61,0x6e,0x67,0x5f,0x73,0x74,0x6f,0x72,0x61,0x67,0x65,0x5f,0x63,0x6c,0x61,0x73,0x73,0x5f,0x73,0x70,0x65,0x63,0x69,0x66,0x69, @@ -1823,91 +1823,92 @@ static const char cryptonight_cl[60954] = { 0x2a,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78,0x5d,0x3b,0x0a,0x75,0x6e,0x73,0x69,0x67,0x6e,0x65,0x64,0x20,0x69,0x6e,0x74,0x20,0x6d,0x5b, 0x31,0x36,0x5d,0x3b,0x0a,0x75,0x6e,0x73,0x69,0x67,0x6e,0x65,0x64,0x20,0x69,0x6e,0x74,0x20,0x76,0x5b,0x31,0x36,0x5d,0x3b,0x0a,0x75,0x69,0x6e,0x74,0x20,0x68,0x5b, 0x38,0x5d,0x3b,0x0a,0x75,0x69,0x6e,0x74,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3d,0x30,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b, - 0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x55,0x2c,0x63,0x5f,0x49,0x56,0x32,0x35,0x36,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74, - 0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c,0x33,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x6d,0x29,0x5b, - 0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x29,0x73,0x74, - 0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x31,0x36,0x3b,0x20,0x2b,0x2b,0x78,0x29, - 0x20,0x7b,0x0a,0x6d,0x5b,0x78,0x5d,0x3d,0x53,0x57,0x41,0x50,0x34,0x28,0x6d,0x5b,0x78,0x5d,0x29,0x3b,0x0a,0x7d,0x0a,0x62,0x69,0x74,0x6c,0x65,0x6e,0x2b,0x3d,0x35, - 0x31,0x32,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x6c,0x6f,0x3d,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20, - 0x2a,0x29,0x68,0x29,0x5b,0x30,0x5d,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x68,0x69,0x3d,0x76,0x6c,0x6f, - 0x61,0x64,0x38,0x28,0x30,0x55,0x2c,0x63,0x5f,0x75,0x32,0x35,0x36,0x29,0x3b,0x0a,0x76,0x5b,0x31,0x32,0x5d,0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b, - 0x0a,0x76,0x5b,0x31,0x33,0x5d,0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x72,0x3d,0x30,0x3b, - 0x20,0x72,0x3c,0x31,0x34,0x3b,0x20,0x72,0x2b,0x2b,0x29,0x20,0x7b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x34,0x2c,0x30,0x78,0x38,0x2c,0x30,0x78,0x43,0x2c,0x30,0x78,0x30, - 0x29,0x3b,0x0a,0x47,0x53,0x28,0x31,0x2c,0x35,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78,0x32,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c,0x36,0x2c,0x30, - 0x78,0x41,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x34,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c,0x37,0x2c,0x30,0x78,0x42,0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x36,0x29, - 0x3b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x35,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x38,0x29,0x3b,0x0a,0x47,0x53,0x28,0x31,0x2c,0x36,0x2c,0x30,0x78, - 0x42,0x2c,0x30,0x78,0x43,0x2c,0x30,0x78,0x41,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c,0x37,0x2c,0x30,0x78,0x38,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78,0x43,0x29,0x3b, - 0x0a,0x47,0x53,0x28,0x33,0x2c,0x34,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x45,0x29,0x3b,0x0a,0x7d,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20, - 0x2a,0x29,0x68,0x29,0x5b,0x30,0x5d,0x20,0x5e,0x3d,0x20,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x5e,0x28,0x28,0x75,0x69,0x6e, - 0x74,0x38,0x20,0x2a,0x29,0x76,0x29,0x5b,0x31,0x5d,0x3b,0x0a,0x7d,0x0a,0x6d,0x5b,0x30,0x5d,0x3d,0x53,0x57,0x41,0x50,0x34,0x28,0x28,0x28,0x5f,0x5f,0x67,0x6c,0x6f, - 0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x29,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x5b,0x34,0x38,0x5d,0x29,0x3b,0x0a,0x6d,0x5b,0x31,0x5d,0x3d,0x53,0x57, - 0x41,0x50,0x34,0x28,0x28,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x29,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x5b,0x34,0x39, - 0x5d,0x29,0x3b,0x0a,0x6d,0x5b,0x32,0x5d,0x3d,0x30,0x78,0x38,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x33,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55, - 0x3b,0x0a,0x6d,0x5b,0x34,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x35,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x36,0x5d,0x3d,0x30, - 0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x37,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x38,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b, - 0x39,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x30,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x31,0x5d,0x3d,0x30,0x78,0x30, - 0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x32,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x33,0x5d,0x3d,0x31,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x34,0x5d, - 0x3d,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x35,0x5d,0x3d,0x30,0x78,0x36,0x34,0x30,0x3b,0x0a,0x62,0x69,0x74,0x6c,0x65,0x6e,0x2b,0x3d,0x36,0x34,0x3b,0x0a,0x28,0x28, - 0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x6c,0x6f,0x3d,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b,0x30, - 0x5d,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x68,0x69,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x55, - 0x2c,0x63,0x5f,0x75,0x32,0x35,0x36,0x29,0x3b,0x0a,0x76,0x5b,0x31,0x32,0x5d,0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b,0x0a,0x76,0x5b,0x31,0x33,0x5d, - 0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x72,0x3d,0x30,0x3b,0x20,0x72,0x3c,0x31,0x34,0x3b, - 0x20,0x72,0x2b,0x2b,0x29,0x20,0x7b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x34,0x2c,0x30,0x78,0x38,0x2c,0x30,0x78,0x43,0x2c,0x30,0x78,0x30,0x29,0x3b,0x0a,0x47,0x53,0x28, - 0x31,0x2c,0x35,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78,0x32,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c,0x36,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x45, - 0x2c,0x30,0x78,0x34,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c,0x37,0x2c,0x30,0x78,0x42,0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x36,0x29,0x3b,0x0a,0x47,0x53,0x28,0x30, - 0x2c,0x35,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x38,0x29,0x3b,0x0a,0x47,0x53,0x28,0x31,0x2c,0x36,0x2c,0x30,0x78,0x42,0x2c,0x30,0x78,0x43,0x2c, - 0x30,0x78,0x41,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c,0x37,0x2c,0x30,0x78,0x38,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78,0x43,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c, - 0x34,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x45,0x29,0x3b,0x0a,0x7d,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b,0x30, - 0x5d,0x20,0x5e,0x3d,0x20,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x5e,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76, - 0x29,0x5b,0x31,0x5d,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b, - 0x0a,0x68,0x5b,0x69,0x5d,0x3d,0x53,0x57,0x41,0x50,0x34,0x28,0x68,0x5b,0x69,0x5d,0x29,0x3b,0x0a,0x7d,0x0a,0x75,0x69,0x6e,0x74,0x32,0x20,0x74,0x3d,0x28,0x75,0x69, - 0x6e,0x74,0x32,0x29,0x28,0x68,0x5b,0x36,0x5d,0x2c,0x68,0x5b,0x37,0x5d,0x29,0x3b,0x0a,0x69,0x66,0x28,0x61,0x73,0x5f,0x75,0x6c,0x6f,0x6e,0x67,0x28,0x74,0x29,0x3c, - 0x3d,0x54,0x61,0x72,0x67,0x65,0x74,0x29,0x20,0x7b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x6f,0x75,0x74,0x49,0x64,0x78,0x3d,0x61,0x74,0x6f,0x6d,0x69,0x63,0x5f,0x69, - 0x6e,0x63,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2b,0x30,0x78,0x46,0x46,0x29,0x3b,0x0a,0x69,0x66,0x28,0x6f,0x75,0x74,0x49,0x64,0x78,0x3c,0x30,0x78,0x46,0x46,0x29, - 0x20,0x7b,0x0a,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x6f,0x75,0x74,0x49,0x64,0x78,0x5d,0x3d,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78,0x5d, - 0x2b,0x28,0x75,0x69,0x6e,0x74,0x29,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x28,0x30,0x29,0x3b,0x0a,0x7d,0x0a, - 0x7d,0x0a,0x7d,0x0a,0x7d,0x0a,0x23,0x75,0x6e,0x64,0x65,0x66,0x20,0x53,0x57,0x41,0x50,0x34,0x0a,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64, - 0x20,0x47,0x72,0x6f,0x65,0x73,0x74,0x6c,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x2a,0x73,0x74,0x61,0x74,0x65,0x73,0x2c, - 0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x2c,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61, - 0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x54,0x61,0x72,0x67,0x65,0x74,0x2c,0x75,0x69,0x6e,0x74, - 0x20,0x54,0x68,0x72,0x65,0x61,0x64,0x73,0x29,0x0a,0x7b,0x0a,0x63,0x6f,0x6e,0x73,0x74,0x20,0x75,0x69,0x6e,0x74,0x20,0x69,0x64,0x78,0x3d,0x67,0x65,0x74,0x5f,0x67, - 0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2d,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x28,0x30,0x29, - 0x3b,0x0a,0x69,0x66,0x28,0x69,0x64,0x78,0x3c,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x54,0x68,0x72,0x65,0x61,0x64,0x73,0x5d,0x29,0x20,0x7b,0x0a,0x73, - 0x74,0x61,0x74,0x65,0x73,0x2b,0x3d,0x32,0x35,0x2a,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78,0x5d,0x3b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20, - 0x53,0x74,0x61,0x74,0x65,0x5b,0x38,0x5d,0x3d,0x7b,0x20,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30, - 0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x78,0x30,0x30,0x30,0x31,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x55,0x4c,0x20,0x7d,0x3b,0x0a,0x75, - 0x6c,0x6f,0x6e,0x67,0x20,0x48,0x5b,0x38,0x5d,0x2c,0x4d,0x5b,0x38,0x5d,0x3b,0x0a,0x7b,0x0a,0x28,0x28,0x75,0x6c,0x6f,0x6e,0x67,0x38,0x20,0x2a,0x29,0x4d,0x29,0x5b, - 0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x2c,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78, - 0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b, - 0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c, - 0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29, - 0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x78,0x5d,0x5e,0x4d,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7b,0x0a, - 0x28,0x28,0x75,0x6c,0x6f,0x6e,0x67,0x38,0x20,0x2a,0x29,0x4d,0x29,0x5b,0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x31,0x2c,0x73,0x74,0x61,0x74,0x65,0x73, - 0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b, - 0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50, - 0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20, - 0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x78, - 0x5d,0x5e,0x4d,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7b,0x0a,0x28,0x28,0x75,0x6c,0x6f,0x6e,0x67,0x38,0x20,0x2a,0x29,0x4d,0x29,0x5b,0x30,0x5d,0x3d,0x76, - 0x6c,0x6f,0x61,0x64,0x38,0x28,0x32,0x2c,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20, - 0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a, - 0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28, - 0x4d,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x53, - 0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x78,0x5d,0x5e,0x4d,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x4d,0x5b,0x30,0x5d,0x3d,0x73, - 0x74,0x61,0x74,0x65,0x73,0x5b,0x32,0x34,0x5d,0x3b,0x0a,0x4d,0x5b,0x31,0x5d,0x3d,0x30,0x78,0x38,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x32,0x5d,0x3d,0x30,0x55,0x4c, - 0x3b,0x0a,0x4d,0x5b,0x33,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x34,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x35,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a, - 0x4d,0x5b,0x36,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x37,0x5d,0x3d,0x30,0x78,0x30,0x34,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30, - 0x30,0x55,0x4c,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a, - 0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c, - 0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x74,0x6d, - 0x70,0x5b,0x38,0x5d,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b, - 0x0a,0x74,0x6d,0x70,0x5b,0x69,0x5d,0x3d,0x53,0x74,0x61,0x74,0x65,0x5b,0x69,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x69,0x5d,0x5e,0x4d,0x5b,0x69,0x5d,0x3b,0x0a,0x7d, - 0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x53,0x74,0x61,0x74,0x65,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20, - 0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b,0x69,0x5d,0x20,0x5e,0x3d,0x20,0x74,0x6d,0x70, - 0x5b,0x69,0x5d,0x3b,0x0a,0x7d,0x0a,0x69,0x66,0x28,0x53,0x74,0x61,0x74,0x65,0x5b,0x37,0x5d,0x3c,0x3d,0x54,0x61,0x72,0x67,0x65,0x74,0x29,0x20,0x7b,0x0a,0x75,0x6c, - 0x6f,0x6e,0x67,0x20,0x6f,0x75,0x74,0x49,0x64,0x78,0x3d,0x61,0x74,0x6f,0x6d,0x69,0x63,0x5f,0x69,0x6e,0x63,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2b,0x30,0x78,0x46, - 0x46,0x29,0x3b,0x0a,0x69,0x66,0x28,0x6f,0x75,0x74,0x49,0x64,0x78,0x3c,0x30,0x78,0x46,0x46,0x29,0x20,0x7b,0x0a,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x6f,0x75,0x74, - 0x49,0x64,0x78,0x5d,0x3d,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78,0x5d,0x2b,0x28,0x75,0x69,0x6e,0x74,0x29,0x20,0x67,0x65,0x74,0x5f,0x67, - 0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x28,0x30,0x29,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7d,0x0a,0x7d,0x0a,0x00 + 0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x55,0x2c,0x63,0x5f,0x49,0x56,0x32,0x35,0x36,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x76,0x6f,0x6c,0x61, + 0x74,0x69,0x6c,0x65,0x20,0x75,0x69,0x6e,0x74,0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c,0x33,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b,0x0a,0x28,0x28,0x75,0x69,0x6e, + 0x74,0x31,0x36,0x20,0x2a,0x29,0x6d,0x29,0x5b,0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20, + 0x75,0x69,0x6e,0x74,0x20,0x2a,0x29,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78, + 0x3c,0x31,0x36,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x6d,0x5b,0x78,0x5d,0x3d,0x53,0x57,0x41,0x50,0x34,0x28,0x6d,0x5b,0x78,0x5d,0x29,0x3b,0x0a,0x7d,0x0a, + 0x62,0x69,0x74,0x6c,0x65,0x6e,0x2b,0x3d,0x35,0x31,0x32,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x6c,0x6f, + 0x3d,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b,0x30,0x5d,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b, + 0x30,0x5d,0x2e,0x68,0x69,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x55,0x2c,0x63,0x5f,0x75,0x32,0x35,0x36,0x29,0x3b,0x0a,0x76,0x5b,0x31,0x32,0x5d,0x20,0x5e, + 0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b,0x0a,0x76,0x5b,0x31,0x33,0x5d,0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28, + 0x75,0x69,0x6e,0x74,0x20,0x72,0x3d,0x30,0x3b,0x20,0x72,0x3c,0x31,0x34,0x3b,0x20,0x72,0x2b,0x2b,0x29,0x20,0x7b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x34,0x2c,0x30,0x78, + 0x38,0x2c,0x30,0x78,0x43,0x2c,0x30,0x78,0x30,0x29,0x3b,0x0a,0x47,0x53,0x28,0x31,0x2c,0x35,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78,0x32,0x29,0x3b, + 0x0a,0x47,0x53,0x28,0x32,0x2c,0x36,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x34,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c,0x37,0x2c,0x30,0x78,0x42, + 0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x36,0x29,0x3b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x35,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x38,0x29,0x3b,0x0a, + 0x47,0x53,0x28,0x31,0x2c,0x36,0x2c,0x30,0x78,0x42,0x2c,0x30,0x78,0x43,0x2c,0x30,0x78,0x41,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c,0x37,0x2c,0x30,0x78,0x38,0x2c, + 0x30,0x78,0x44,0x2c,0x30,0x78,0x43,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c,0x34,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x45,0x29,0x3b,0x0a,0x7d, + 0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b,0x30,0x5d,0x20,0x5e,0x3d,0x20,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76,0x29, + 0x5b,0x30,0x5d,0x5e,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76,0x29,0x5b,0x31,0x5d,0x3b,0x0a,0x7d,0x0a,0x6d,0x5b,0x30,0x5d,0x3d,0x53,0x57,0x41,0x50, + 0x34,0x28,0x28,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x29,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x5b,0x34,0x38,0x5d,0x29, + 0x3b,0x0a,0x6d,0x5b,0x31,0x5d,0x3d,0x53,0x57,0x41,0x50,0x34,0x28,0x28,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x29,0x73, + 0x74,0x61,0x74,0x65,0x73,0x29,0x5b,0x34,0x39,0x5d,0x29,0x3b,0x0a,0x6d,0x5b,0x32,0x5d,0x3d,0x30,0x78,0x38,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x55,0x3b,0x0a,0x6d, + 0x5b,0x33,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x34,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x35,0x5d,0x3d,0x30,0x78,0x30,0x30, + 0x55,0x3b,0x0a,0x6d,0x5b,0x36,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x37,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x38,0x5d,0x3d, + 0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x39,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x30,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a, + 0x6d,0x5b,0x31,0x31,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x32,0x5d,0x3d,0x30,0x78,0x30,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x33,0x5d,0x3d, + 0x31,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x34,0x5d,0x3d,0x30,0x55,0x3b,0x0a,0x6d,0x5b,0x31,0x35,0x5d,0x3d,0x30,0x78,0x36,0x34,0x30,0x3b,0x0a,0x62,0x69,0x74,0x6c,0x65, + 0x6e,0x2b,0x3d,0x36,0x34,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x6c,0x6f,0x3d,0x28,0x28,0x75,0x69,0x6e, + 0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b,0x30,0x5d,0x3b,0x0a,0x28,0x28,0x75,0x69,0x6e,0x74,0x31,0x36,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x2e,0x68,0x69,0x3d, + 0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x55,0x2c,0x63,0x5f,0x75,0x32,0x35,0x36,0x29,0x3b,0x0a,0x76,0x5b,0x31,0x32,0x5d,0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c, + 0x65,0x6e,0x3b,0x0a,0x76,0x5b,0x31,0x33,0x5d,0x20,0x5e,0x3d,0x20,0x62,0x69,0x74,0x6c,0x65,0x6e,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x72, + 0x3d,0x30,0x3b,0x20,0x72,0x3c,0x31,0x34,0x3b,0x20,0x72,0x2b,0x2b,0x29,0x20,0x7b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x34,0x2c,0x30,0x78,0x38,0x2c,0x30,0x78,0x43,0x2c, + 0x30,0x78,0x30,0x29,0x3b,0x0a,0x47,0x53,0x28,0x31,0x2c,0x35,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78,0x32,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c, + 0x36,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x34,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c,0x37,0x2c,0x30,0x78,0x42,0x2c,0x30,0x78,0x46,0x2c,0x30, + 0x78,0x36,0x29,0x3b,0x0a,0x47,0x53,0x28,0x30,0x2c,0x35,0x2c,0x30,0x78,0x41,0x2c,0x30,0x78,0x46,0x2c,0x30,0x78,0x38,0x29,0x3b,0x0a,0x47,0x53,0x28,0x31,0x2c,0x36, + 0x2c,0x30,0x78,0x42,0x2c,0x30,0x78,0x43,0x2c,0x30,0x78,0x41,0x29,0x3b,0x0a,0x47,0x53,0x28,0x32,0x2c,0x37,0x2c,0x30,0x78,0x38,0x2c,0x30,0x78,0x44,0x2c,0x30,0x78, + 0x43,0x29,0x3b,0x0a,0x47,0x53,0x28,0x33,0x2c,0x34,0x2c,0x30,0x78,0x39,0x2c,0x30,0x78,0x45,0x2c,0x30,0x78,0x45,0x29,0x3b,0x0a,0x7d,0x0a,0x28,0x28,0x75,0x69,0x6e, + 0x74,0x38,0x20,0x2a,0x29,0x68,0x29,0x5b,0x30,0x5d,0x20,0x5e,0x3d,0x20,0x28,0x28,0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76,0x29,0x5b,0x30,0x5d,0x5e,0x28,0x28, + 0x75,0x69,0x6e,0x74,0x38,0x20,0x2a,0x29,0x76,0x29,0x5b,0x31,0x5d,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c, + 0x38,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b,0x0a,0x68,0x5b,0x69,0x5d,0x3d,0x53,0x57,0x41,0x50,0x34,0x28,0x68,0x5b,0x69,0x5d,0x29,0x3b,0x0a,0x7d,0x0a,0x75,0x69, + 0x6e,0x74,0x32,0x20,0x74,0x3d,0x28,0x75,0x69,0x6e,0x74,0x32,0x29,0x28,0x68,0x5b,0x36,0x5d,0x2c,0x68,0x5b,0x37,0x5d,0x29,0x3b,0x0a,0x69,0x66,0x28,0x61,0x73,0x5f, + 0x75,0x6c,0x6f,0x6e,0x67,0x28,0x74,0x29,0x3c,0x3d,0x54,0x61,0x72,0x67,0x65,0x74,0x29,0x20,0x7b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x6f,0x75,0x74,0x49,0x64,0x78, + 0x3d,0x61,0x74,0x6f,0x6d,0x69,0x63,0x5f,0x69,0x6e,0x63,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2b,0x30,0x78,0x46,0x46,0x29,0x3b,0x0a,0x69,0x66,0x28,0x6f,0x75,0x74, + 0x49,0x64,0x78,0x3c,0x30,0x78,0x46,0x46,0x29,0x20,0x7b,0x0a,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x6f,0x75,0x74,0x49,0x64,0x78,0x5d,0x3d,0x42,0x72,0x61,0x6e,0x63, + 0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78,0x5d,0x2b,0x28,0x75,0x69,0x6e,0x74,0x29,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x6f,0x66,0x66,0x73, + 0x65,0x74,0x28,0x30,0x29,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7d,0x0a,0x7d,0x0a,0x23,0x75,0x6e,0x64,0x65,0x66,0x20,0x53,0x57,0x41,0x50,0x34,0x0a,0x5f,0x5f,0x6b,0x65, + 0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x47,0x72,0x6f,0x65,0x73,0x74,0x6c,0x28,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x6c,0x6f,0x6e,0x67, + 0x20,0x2a,0x73,0x74,0x61,0x74,0x65,0x73,0x2c,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75, + 0x66,0x2c,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x75,0x69,0x6e,0x74,0x20,0x2a,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x54,0x61, + 0x72,0x67,0x65,0x74,0x2c,0x75,0x69,0x6e,0x74,0x20,0x54,0x68,0x72,0x65,0x61,0x64,0x73,0x29,0x0a,0x7b,0x0a,0x63,0x6f,0x6e,0x73,0x74,0x20,0x75,0x69,0x6e,0x74,0x20, + 0x69,0x64,0x78,0x3d,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2d,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f, + 0x6f,0x66,0x66,0x73,0x65,0x74,0x28,0x30,0x29,0x3b,0x0a,0x69,0x66,0x28,0x69,0x64,0x78,0x3c,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x54,0x68,0x72,0x65, + 0x61,0x64,0x73,0x5d,0x29,0x20,0x7b,0x0a,0x73,0x74,0x61,0x74,0x65,0x73,0x2b,0x3d,0x32,0x35,0x2a,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78, + 0x5d,0x3b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x53,0x74,0x61,0x74,0x65,0x5b,0x38,0x5d,0x3d,0x7b,0x20,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c, + 0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x55,0x4c,0x2c,0x30,0x78,0x30,0x30,0x30,0x31,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30, + 0x30,0x30,0x55,0x4c,0x20,0x7d,0x3b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x48,0x5b,0x38,0x5d,0x2c,0x4d,0x5b,0x38,0x5d,0x3b,0x0a,0x7b,0x0a,0x28,0x28,0x75,0x6c,0x6f, + 0x6e,0x67,0x38,0x20,0x2a,0x29,0x4d,0x29,0x5b,0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x30,0x2c,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f, + 0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b, + 0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a, + 0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20, + 0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x78,0x5d,0x5e,0x4d,0x5b,0x78, + 0x5d,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7b,0x0a,0x28,0x28,0x75,0x6c,0x6f,0x6e,0x67,0x38,0x20,0x2a,0x29,0x4d,0x29,0x5b,0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38, + 0x28,0x31,0x2c,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20, + 0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52, + 0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b,0x0a,0x66, + 0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b, + 0x78,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x78,0x5d,0x5e,0x4d,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7b,0x0a,0x28,0x28,0x75,0x6c,0x6f,0x6e,0x67,0x38,0x20, + 0x2a,0x29,0x4d,0x29,0x5b,0x30,0x5d,0x3d,0x76,0x6c,0x6f,0x61,0x64,0x38,0x28,0x32,0x2c,0x73,0x74,0x61,0x74,0x65,0x73,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75, + 0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53, + 0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d, + 0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38,0x3b, + 0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x78,0x5d,0x5e,0x4d,0x5b,0x78,0x5d,0x3b,0x0a,0x7d, + 0x0a,0x7d,0x0a,0x4d,0x5b,0x30,0x5d,0x3d,0x73,0x74,0x61,0x74,0x65,0x73,0x5b,0x32,0x34,0x5d,0x3b,0x0a,0x4d,0x5b,0x31,0x5d,0x3d,0x30,0x78,0x38,0x30,0x55,0x4c,0x3b, + 0x0a,0x4d,0x5b,0x32,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x33,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x34,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d, + 0x5b,0x35,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x36,0x5d,0x3d,0x30,0x55,0x4c,0x3b,0x0a,0x4d,0x5b,0x37,0x5d,0x3d,0x30,0x78,0x30,0x34,0x30,0x30,0x30,0x30, + 0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x30,0x55,0x4c,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x78,0x3d,0x30,0x3b,0x20,0x78,0x3c,0x38, + 0x3b,0x20,0x2b,0x2b,0x78,0x29,0x20,0x7b,0x0a,0x48,0x5b,0x78,0x5d,0x3d,0x4d,0x5b,0x78,0x5d,0x5e,0x53,0x74,0x61,0x74,0x65,0x5b,0x78,0x5d,0x3b,0x0a,0x7d,0x0a,0x50, + 0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x48,0x29,0x3b,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x51,0x28,0x4d,0x29,0x3b, + 0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x74,0x6d,0x70,0x5b,0x38,0x5d,0x3b,0x0a,0x66,0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c, + 0x38,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b,0x0a,0x74,0x6d,0x70,0x5b,0x69,0x5d,0x3d,0x53,0x74,0x61,0x74,0x65,0x5b,0x69,0x5d,0x20,0x5e,0x3d,0x20,0x48,0x5b,0x69, + 0x5d,0x5e,0x4d,0x5b,0x69,0x5d,0x3b,0x0a,0x7d,0x0a,0x50,0x45,0x52,0x4d,0x5f,0x53,0x4d,0x41,0x4c,0x4c,0x5f,0x50,0x28,0x53,0x74,0x61,0x74,0x65,0x29,0x3b,0x0a,0x66, + 0x6f,0x72,0x20,0x28,0x75,0x69,0x6e,0x74,0x20,0x69,0x3d,0x30,0x3b,0x20,0x69,0x3c,0x38,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x20,0x7b,0x0a,0x53,0x74,0x61,0x74,0x65,0x5b, + 0x69,0x5d,0x20,0x5e,0x3d,0x20,0x74,0x6d,0x70,0x5b,0x69,0x5d,0x3b,0x0a,0x7d,0x0a,0x69,0x66,0x28,0x53,0x74,0x61,0x74,0x65,0x5b,0x37,0x5d,0x3c,0x3d,0x54,0x61,0x72, + 0x67,0x65,0x74,0x29,0x20,0x7b,0x0a,0x75,0x6c,0x6f,0x6e,0x67,0x20,0x6f,0x75,0x74,0x49,0x64,0x78,0x3d,0x61,0x74,0x6f,0x6d,0x69,0x63,0x5f,0x69,0x6e,0x63,0x28,0x6f, + 0x75,0x74,0x70,0x75,0x74,0x2b,0x30,0x78,0x46,0x46,0x29,0x3b,0x0a,0x69,0x66,0x28,0x6f,0x75,0x74,0x49,0x64,0x78,0x3c,0x30,0x78,0x46,0x46,0x29,0x20,0x7b,0x0a,0x6f, + 0x75,0x74,0x70,0x75,0x74,0x5b,0x6f,0x75,0x74,0x49,0x64,0x78,0x5d,0x3d,0x42,0x72,0x61,0x6e,0x63,0x68,0x42,0x75,0x66,0x5b,0x69,0x64,0x78,0x5d,0x2b,0x28,0x75,0x69, + 0x6e,0x74,0x29,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x28,0x30,0x29,0x3b,0x0a,0x7d,0x0a,0x7d,0x0a,0x7d,0x0a, + 0x7d,0x0a,0x00 }; } // namespace xmrig From 891a46382e1edb4b6b99b34f2c236cc256b24993 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 21 Sep 2020 17:51:08 +0200 Subject: [PATCH 11/13] RandomX: AES improvements - A bit faster hardware AES code when compiled with MSVC - More reliable software AES benchmark --- src/crypto/randomx/aes_hash.cpp | 47 +++++++++++++-------------------- src/crypto/randomx/soft_aes.cpp | 46 +++++++++++++++++--------------- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 7c4b0c818..a15f75ad5 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -244,38 +244,29 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi for (int i = 0; i < 2; ++i) { //process 64 bytes at a time in 4 lanes while (scratchpadPtr < scratchpadEnd) { - hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0)); - hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1)); - hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2)); - hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3)); +#define HASH_STATE(k) \ + hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 0)); \ + hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 1)); \ + hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2)); \ + hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3)); - fill_state0 = aesdec(fill_state0, key0); - fill_state1 = aesenc(fill_state1, key1); - fill_state2 = aesdec(fill_state2, key2); - fill_state3 = aesenc(fill_state3, key3); +#define FILL_STATE(k) \ + fill_state0 = aesdec(fill_state0, key0); \ + fill_state1 = aesenc(fill_state1, key1); \ + fill_state2 = aesdec(fill_state2, key2); \ + fill_state3 = aesenc(fill_state3, key3); \ + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 0, fill_state0); \ + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 1, fill_state1); \ + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \ + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3); + HASH_STATE(0); + HASH_STATE(1); + + FILL_STATE(0); + FILL_STATE(1); rx_prefetch_t0(prefetchPtr); - - hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 4)); - hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 5)); - hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 6)); - hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 7)); - - fill_state0 = aesdec(fill_state0, key0); - fill_state1 = aesenc(fill_state1, key1); - fill_state2 = aesdec(fill_state2, key2); - fill_state3 = aesenc(fill_state3, key3); - - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 4, fill_state0); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 5, fill_state1); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 6, fill_state2); - rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 7, fill_state3); - rx_prefetch_t0(prefetchPtr + 64); scratchpadPtr += 128; diff --git a/src/crypto/randomx/soft_aes.cpp b/src/crypto/randomx/soft_aes.cpp index a205398c8..ad6f9ffe6 100644 --- a/src/crypto/randomx/soft_aes.cpp +++ b/src/crypto/randomx/soft_aes.cpp @@ -131,31 +131,35 @@ uint32_t GetSoftAESImpl() void SelectSoftAESImpl() { constexpr int test_length_ms = 100; - double speed[2]; + double speed[2] = {}; - for (int i = 0; i < 2; ++i) - { - std::vector scratchpad(10 * 1024); - uint8_t hash[64] = {}; - uint8_t state[64] = {}; + for (int run = 0; run < 3; ++run) { + for (int i = 0; i < 2; ++i) { + std::vector scratchpad(10 * 1024); + uint8_t hash[64] = {}; + uint8_t state[64] = {}; - uint64_t t1, t2; + uint64_t t1, t2; - uint32_t count = 0; - t1 = xmrig::Chrono::highResolutionMSecs(); - do { - if (i == 0) { - hashAndFillAes1Rx4<1>(scratchpad.data(), scratchpad.size(), hash, state); + uint32_t count = 0; + t1 = xmrig::Chrono::highResolutionMSecs(); + do { + if (i == 0) { + hashAndFillAes1Rx4<1>(scratchpad.data(), scratchpad.size(), hash, state); + } + else { + hashAndFillAes1Rx4<2>(scratchpad.data(), scratchpad.size(), hash, state); + } + ++count; + + t2 = xmrig::Chrono::highResolutionMSecs(); + } while (t2 - t1 < test_length_ms); + + const double x = count * 1e3 / (t2 - t1); + if (x > speed[i]) { + speed[i] = x; } - else { - hashAndFillAes1Rx4<2>(scratchpad.data(), scratchpad.size(), hash, state); - } - ++count; - - t2 = xmrig::Chrono::highResolutionMSecs(); - } while (t2 - t1 < test_length_ms); - - speed[i] = count * 1e3 / (t2 - t1); + } } softAESImpl = (speed[0] > speed[1]) ? 1 : 2; From 9768bf65d165c9715a6a2ac2a303773741d04e4d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Tue, 22 Sep 2020 13:48:11 +0200 Subject: [PATCH 12/13] RandomX improved performance of GCC compiled binaries JIT compilator was slower compared to MSVC compiled binary. Up to +0.1% speedup on rx/wow in Linux. --- src/base/tools/Profiler.cpp | 1 + src/base/tools/Profiler.h | 1 + src/crypto/randomx/jit_compiler_x86.cpp | 16 +++++++++++----- src/crypto/randomx/jit_compiler_x86.hpp | 2 +- src/crypto/randomx/randomx.cpp | 7 ++++++- 5 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/base/tools/Profiler.cpp b/src/base/tools/Profiler.cpp index f6f066f37..ac2a6d2cb 100644 --- a/src/base/tools/Profiler.cpp +++ b/src/base/tools/Profiler.cpp @@ -20,6 +20,7 @@ #include "base/tools/Profiler.h" #include "base/io/log/Log.h" #include "base/io/log/Tags.h" +#include #include #include #include diff --git a/src/base/tools/Profiler.h b/src/base/tools/Profiler.h index c74277151..ae3470f8f 100644 --- a/src/base/tools/Profiler.h +++ b/src/base/tools/Profiler.h @@ -37,6 +37,7 @@ #include +#include #include #if defined(_MSC_VER) diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 8edf5a720..437f1040d 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -168,6 +168,12 @@ namespace randomx { # endif } +# ifdef _MSC_VER + static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); } +# else + static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return (a << shift) | (a >> (-shift & 31)); } +# endif + static std::atomic codeOffset; JitCompilerX86::JitCompilerX86() { @@ -310,10 +316,10 @@ namespace randomx { InstructionGeneratorX86 gen3 = engine[instr3.opcode]; InstructionGeneratorX86 gen4 = engine[instr4.opcode]; - (this->*gen1)(instr1); - (this->*gen2)(instr2); - (this->*gen3)(instr3); - (this->*gen4)(instr4); + (*gen1)(this, instr1); + (*gen2)(this, instr2); + (*gen3)(this, instr3); + (*gen4)(this, instr4); } *(uint64_t*)(code + codePos) = 0xc03341c08b41ull + (static_cast(pcfg.readReg2) << 16) + (static_cast(pcfg.readReg3) << 40); @@ -1060,7 +1066,7 @@ namespace randomx { *(uint32_t*)(p + pos) = 0x00c08149 + (reg << 16); const int shift = instr.getModCond(); const uint32_t or_mask = (1UL << RandomX_ConfigurationBase::JumpOffset) << shift; - const uint32_t and_mask = ~((1UL << (RandomX_ConfigurationBase::JumpOffset - 1)) << shift); + const uint32_t and_mask = rotl32(~static_cast(1UL << (RandomX_ConfigurationBase::JumpOffset - 1)), shift); *(uint32_t*)(p + pos + 3) = (instr.getImm32() | or_mask) & and_mask; *(uint32_t*)(p + pos + 7) = 0x00c0f749 + (reg << 16); *(uint32_t*)(p + pos + 10) = RandomX_ConfigurationBase::ConditionMask_Calculated << shift; diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 3a9163b5e..b8e6a9fe7 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -41,7 +41,7 @@ namespace randomx { class JitCompilerX86; class Instruction; - typedef void(JitCompilerX86::*InstructionGeneratorX86)(const Instruction&); + typedef void(*InstructionGeneratorX86)(JitCompilerX86*, const Instruction&); constexpr uint32_t CodeSize = 64 * 1024; diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 5cfaddca3..2804b1b78 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -267,7 +267,12 @@ void RandomX_ConfigurationBase::Apply() } } -#define JIT_HANDLE(x, prev) randomx::JitCompilerX86::engine[k] = &randomx::JitCompilerX86::h_##x +typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx::Instruction&); + +#define JIT_HANDLE(x, prev) do { \ + const InstructionGeneratorX86_2 p = &randomx::JitCompilerX86::h_##x; \ + memcpy(randomx::JitCompilerX86::engine + k, &p, sizeof(p)); \ + } while (0) #elif defined(XMRIG_ARMv8) From 51a72afb0eb8b6ebbf732ad1a7170166c0d3cb8d Mon Sep 17 00:00:00 2001 From: xmrig Date: Wed, 23 Sep 2020 05:29:29 +0700 Subject: [PATCH 13/13] Update CHANGELOG.md --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87c5714d4..887ba8957 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +# v6.3.4 +- [#1823](https://github.com/xmrig/xmrig/pull/1823) RandomX: added new option `scratchpad_prefetch_mode`. +- [#1827](https://github.com/xmrig/xmrig/pull/1827) [#1831](https://github.com/xmrig/xmrig/pull/1831) Improved nonce iteration performance. +- [#1828](https://github.com/xmrig/xmrig/pull/1828) RandomX: added SSE4.1-optimized Blake2b. +- [#1830](https://github.com/xmrig/xmrig/pull/1830) RandomX: added performance profiler (for developers). +- [#1835](https://github.com/xmrig/xmrig/pull/1835) RandomX: returned old soft AES implementation and added auto-select between the two. +- [#1840](https://github.com/xmrig/xmrig/pull/1840) RandomX: moved more stuff to compile time, small x86 JIT compiler speedup. +- [#1841](https://github.com/xmrig/xmrig/pull/1841) Fixed Cryptonight OpenCL for AMD 20.7.2 drivers. +- [#1842](https://github.com/xmrig/xmrig/pull/1842) RandomX: AES improvements, a bit faster hardware AES code when compiled with MSVC. +- [#1843](https://github.com/xmrig/xmrig/pull/1843) RandomX: improved performance of GCC compiled binaries. + # v6.3.3 - [#1817](https://github.com/xmrig/xmrig/pull/1817) Fixed self-select login sequence. - Added brand new [build from source](https://xmrig.com/docs/miner/build) documentation.