From 828fc065b0852f9f00543881597b68f0ee5c8718 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sat, 27 Jul 2019 19:41:59 +0700 Subject: [PATCH] Added support for allocate RandomX dataset on each NUMA node. --- src/backend/common/Worker.cpp | 4 +- src/backend/common/Worker.h | 1 + src/backend/cpu/CpuBackend.cpp | 2 +- src/backend/cpu/CpuWorker.cpp | 12 +- src/backend/cpu/platform/HwlocCpuInfo.cpp | 14 +- src/backend/cpu/platform/HwlocCpuInfo.h | 4 +- src/core/Miner.cpp | 2 +- src/crypto/common/VirtualMemory.cpp | 19 ++- src/crypto/common/VirtualMemory.h | 2 +- src/crypto/rx/Rx.cpp | 178 +++++++++++++++------- src/crypto/rx/Rx.h | 10 +- 11 files changed, 178 insertions(+), 70 deletions(-) diff --git a/src/backend/common/Worker.cpp b/src/backend/common/Worker.cpp index 92438b39f..91ef0c7ad 100644 --- a/src/backend/common/Worker.cpp +++ b/src/backend/common/Worker.cpp @@ -37,9 +37,9 @@ xmrig::Worker::Worker(size_t id, int64_t affinity, int priority) : m_timestamp(0), m_count(0) { - VirtualMemory::bindToNUMANode(affinity); - Platform::trySetThreadAffinity(affinity); + m_node = VirtualMemory::bindToNUMANode(affinity); + Platform::trySetThreadAffinity(affinity); Platform::setThreadPriority(priority); } diff --git a/src/backend/common/Worker.h b/src/backend/common/Worker.h index a601ab05c..5f5df9250 100644 --- a/src/backend/common/Worker.h +++ b/src/backend/common/Worker.h @@ -54,6 +54,7 @@ protected: const size_t m_id; std::atomic m_hashCount; std::atomic m_timestamp; + uint32_t m_node = 0; uint64_t m_count; }; diff --git a/src/backend/cpu/CpuBackend.cpp b/src/backend/cpu/CpuBackend.cpp index 2fc34b59c..539ab1f13 100644 --- a/src/backend/cpu/CpuBackend.cpp +++ b/src/backend/cpu/CpuBackend.cpp @@ -295,7 +295,7 @@ rapidjson::Value xmrig::CpuBackend::toJSON(rapidjson::Document &doc) const # ifdef XMRIG_ALGO_RANDOMX if (d_ptr->algo.family() == Algorithm::RANDOM_X) { - RxDataset *dataset = Rx::dataset(-1); // FIXME + RxDataset *dataset = Rx::dataset(0); // FIXME if (dataset) { const auto rxPages = dataset->hugePages(); pages[0] += rxPages.first; diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index 28642cd73..cd804199e 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -24,6 +24,7 @@ */ +#include #include @@ -81,15 +82,20 @@ xmrig::CpuWorker::~CpuWorker() template void xmrig::CpuWorker::allocateRandomX_VM() { - while (!Rx::isReady(m_job.currentJob(), m_affinity)) { + while (!Rx::isReady(m_job.currentJob(), m_node)) { std::this_thread::sleep_for(std::chrono::milliseconds(200)); if (Nonce::sequence(Nonce::CPU) == 0) { - break; + return; } } - RxDataset *dataset = Rx::dataset(m_affinity); + RxDataset *dataset = Rx::dataset(m_node); + assert(dataset != nullptr); + + if (!dataset) { + return; + } if (!m_vm) { m_vm = new RxVm(dataset, m_memory->scratchpad(), !m_hwAES); diff --git a/src/backend/cpu/platform/HwlocCpuInfo.cpp b/src/backend/cpu/platform/HwlocCpuInfo.cpp index 4e48f65e2..eee59a3a4 100644 --- a/src/backend/cpu/platform/HwlocCpuInfo.cpp +++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp @@ -38,6 +38,7 @@ namespace xmrig { +std::vector HwlocCpuInfo::m_nodeIndexes; uint32_t HwlocCpuInfo::m_features = 0; @@ -151,8 +152,17 @@ xmrig::HwlocCpuInfo::HwlocCpuInfo() : BasicCpuInfo(), m_nodes = std::max(countByType(m_topology, HWLOC_OBJ_NUMANODE), 1); m_packages = countByType(m_topology, HWLOC_OBJ_PACKAGE); - if (nodes() > 1 && hwloc_topology_get_support(m_topology)->membind->set_thisthread_membind) { - m_features |= SET_THISTHREAD_MEMBIND; + if (m_nodes > 1) { + if (hwloc_topology_get_support(m_topology)->membind->set_thisthread_membind) { + m_features |= SET_THISTHREAD_MEMBIND; + } + + m_nodeIndexes.reserve(m_nodes); + hwloc_obj_t node = nullptr; + + while ((node = hwloc_get_next_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, node)) != nullptr) { + m_nodeIndexes.emplace_back(node->os_index); + } } } diff --git a/src/backend/cpu/platform/HwlocCpuInfo.h b/src/backend/cpu/platform/HwlocCpuInfo.h index a5678fa94..340864f50 100644 --- a/src/backend/cpu/platform/HwlocCpuInfo.h +++ b/src/backend/cpu/platform/HwlocCpuInfo.h @@ -47,7 +47,8 @@ public: HwlocCpuInfo(); ~HwlocCpuInfo() override; - static inline bool has(Feature feature) { return m_features & feature; } + static inline bool has(Feature feature) { return m_features & feature; } + static inline const std::vector &nodeIndexes() { return m_nodeIndexes; } protected: CpuThreads threads(const Algorithm &algorithm) const override; @@ -62,6 +63,7 @@ protected: private: void processTopLevelCache(hwloc_obj_t obj, const Algorithm &algorithm, CpuThreads &threads) const; + static std::vector m_nodeIndexes; static uint32_t m_features; char m_backend[20]; diff --git a/src/core/Miner.cpp b/src/core/Miner.cpp index 42f4247eb..8d0596c2e 100644 --- a/src/core/Miner.cpp +++ b/src/core/Miner.cpp @@ -366,7 +366,7 @@ void xmrig::Miner::setJob(const Job &job, bool donate) # ifdef XMRIG_ALGO_RANDOMX const CpuConfig &cpu = d_ptr->controller->config()->cpu(); - Rx::init(job, cpu.initThreads(), cpu.isHugePages()); + Rx::init(job, cpu.initThreads(), cpu.isHugePages(), true); # endif uv_rwlock_wrunlock(&d_ptr->rwlock); diff --git a/src/crypto/common/VirtualMemory.cpp b/src/crypto/common/VirtualMemory.cpp index db27087cc..edacf0dfd 100644 --- a/src/crypto/common/VirtualMemory.cpp +++ b/src/crypto/common/VirtualMemory.cpp @@ -35,11 +35,11 @@ #include "crypto/common/VirtualMemory.h" -void xmrig::VirtualMemory::bindToNUMANode(int64_t affinity) +uint32_t xmrig::VirtualMemory::bindToNUMANode(int64_t affinity) { # ifdef XMRIG_FEATURE_HWLOC if (affinity < 0 || !HwlocCpuInfo::has(HwlocCpuInfo::SET_THISTHREAD_MEMBIND)) { - return; + return 0; } hwloc_topology_t topology; @@ -53,6 +53,21 @@ void xmrig::VirtualMemory::bindToNUMANode(int64_t affinity) LOG_WARN("CPU #%02u warning: \"can't bind memory\"", puId); } + hwloc_obj_t node = nullptr; + uint32_t nodeId = 0; + + while ((node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node)) != nullptr) { + if (hwloc_bitmap_intersects(node->cpuset, pu->cpuset)) { + nodeId = node->os_index; + + break; + } + } + hwloc_topology_destroy(topology); + + return nodeId; +# else + return 0; # endif } diff --git a/src/crypto/common/VirtualMemory.h b/src/crypto/common/VirtualMemory.h index b6ea680a0..ac2f75dd8 100644 --- a/src/crypto/common/VirtualMemory.h +++ b/src/crypto/common/VirtualMemory.h @@ -52,9 +52,9 @@ public: return std::pair(isHugePages() ? (align(size()) / 2097152) : 0, align(size()) / 2097152); } + static uint32_t bindToNUMANode(int64_t affinity); static void *allocateExecutableMemory(size_t size); static void *allocateLargePagesMemory(size_t size); - static void bindToNUMANode(int64_t affinity); static void flushInstructionCache(void *p, size_t size); static void freeLargePagesMemory(void *p, size_t size); static void init(bool hugePages); diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp index fe4202e6b..94d1f30ba 100644 --- a/src/crypto/rx/Rx.cpp +++ b/src/crypto/rx/Rx.cpp @@ -25,12 +25,20 @@ */ +#include #include #include +#ifdef XMRIG_FEATURE_HWLOC +# include +# include "backend/cpu/platform/HwlocCpuInfo.h" +#endif + + #include "backend/cpu/Cpu.h" #include "base/io/log/Log.h" +#include "base/kernel/Platform.h" #include "base/net/stratum/Job.h" #include "base/tools/Buffer.h" #include "base/tools/Chrono.h" @@ -42,6 +50,9 @@ namespace xmrig { +static const char *tag = BLUE_BG(WHITE_BOLD_S " rx ") " "; + + class RxPrivate { public: @@ -53,10 +64,12 @@ public: inline ~RxPrivate() { - for (RxDataset *dataset : datasets) { - delete dataset; + for (auto const &item : datasets) { + delete item.second; } + datasets.clear(); + uv_mutex_destroy(&mutex); } @@ -65,23 +78,79 @@ public: inline void unlock() { uv_mutex_unlock(&mutex); } - std::vector datasets; + static void allocate(RxPrivate *self, uint32_t nodeId) + { + const uint64_t ts = Chrono::steadyMSecs(); + +# ifdef XMRIG_FEATURE_HWLOC + if (self->numa) { + hwloc_topology_t topology; + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + + hwloc_obj_t node = hwloc_get_numanode_obj_by_os_index(topology, nodeId); + if (node) { + if (HwlocCpuInfo::has(HwlocCpuInfo::SET_THISTHREAD_MEMBIND)) { + hwloc_set_membind_nodeset(topology, node->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_THREAD); + } + + Platform::setThreadAffinity(static_cast(hwloc_bitmap_first(node->cpuset))); + } + + hwloc_topology_destroy(topology); + } +# endif + + LOG_INFO("%s" CYAN_BOLD("#%u") MAGENTA_BOLD(" allocate") CYAN_BOLD(" %zu MB") BLACK_BOLD(" (%zu+%zu) for RandomX dataset & cache"), + tag, + nodeId, + (RxDataset::size() + RxCache::size()) / 1024 / 1024, + RxDataset::size() / 1024 / 1024, + RxCache::size() / 1024 / 1024 + ); + + RxDataset *dataset = new RxDataset(self->hugePages); + self->datasets[nodeId] = dataset; + + if (dataset->get() != nullptr) { + const auto hugePages = dataset->hugePages(); + const double percent = hugePages.first == 0 ? 0.0 : static_cast(hugePages.first) / hugePages.second * 100.0; + + LOG_INFO("%s" CYAN_BOLD("#%u") GREEN(" allocate done") " huge pages %s%u/%u %1.0f%%" CLEAR " %sJIT" BLACK_BOLD(" (%" PRIu64 " ms)"), + tag, + nodeId, + (hugePages.first == hugePages.second ? GREEN_BOLD_S : (hugePages.first == 0 ? RED_BOLD_S : YELLOW_BOLD_S)), + hugePages.first, + hugePages.second, + percent, + dataset->cache()->isJIT() ? GREEN_BOLD_S "+" : RED_BOLD_S "-", + Chrono::steadyMSecs() - ts + ); + } + else { + LOG_WARN(CLEAR "%s" CYAN_BOLD("#%u") YELLOW_BOLD_S " failed to allocate RandomX dataset, switching to slow mode", tag, nodeId); + } + } + + + bool hugePages = true; + bool numa = true; + std::map datasets; uv_mutex_t mutex; }; static RxPrivate *d_ptr = new RxPrivate(); -static const char *tag = BLUE_BG(WHITE_BOLD_S " rx "); } // namespace xmrig -bool xmrig::Rx::isReady(const Job &job, int64_t) +bool xmrig::Rx::isReady(const Job &job, uint32_t nodeId) { d_ptr->lock(); - const bool rc = isReady(job.seedHash(), job.algorithm()); + const bool rc = isReady(job.seedHash(), job.algorithm(), d_ptr->numa ? nodeId : 0); d_ptr->unlock(); return rc; @@ -89,38 +158,56 @@ bool xmrig::Rx::isReady(const Job &job, int64_t) -xmrig::RxDataset *xmrig::Rx::dataset(int64_t) +xmrig::RxDataset *xmrig::Rx::dataset(uint32_t nodeId) { d_ptr->lock(); - RxDataset *dataset = d_ptr->datasets[0]; + RxDataset *dataset = d_ptr->datasets[d_ptr->numa ? nodeId : 0]; d_ptr->unlock(); return dataset; } -void xmrig::Rx::init(const Job &job, int initThreads, bool hugePages) +void xmrig::Rx::init(const Job &job, int initThreads, bool hugePages, bool numa) { if (job.algorithm().family() != Algorithm::RANDOM_X) { return; } d_ptr->lock(); - if (d_ptr->datasets.empty()) { - d_ptr->datasets.push_back(nullptr); + + size_t ready = 0; + + for (auto const &item : d_ptr->datasets) { + if (isReady(job.seedHash(), job.algorithm(), item.first)) { + ready++; + } } - if (isReady(job.seedHash(), job.algorithm())) { + if (!d_ptr->datasets.empty() && ready == d_ptr->datasets.size()) { d_ptr->unlock(); return; } - const uint32_t threads = initThreads < 1 ? static_cast(Cpu::info()->threads()) - : static_cast(initThreads); + d_ptr->hugePages = hugePages; + d_ptr->numa = numa && Cpu::info()->nodes() > 1; + const uint32_t threads = initThreads < 1 ? static_cast(Cpu::info()->threads()) + : static_cast(initThreads); - std::thread thread(initDataset, 0, job.seedHash(), job.algorithm(), threads, hugePages); - thread.detach(); +# ifdef XMRIG_FEATURE_HWLOC + if (d_ptr->numa) { + for (uint32_t nodeId : HwlocCpuInfo::nodeIndexes()) { + std::thread thread(initDataset, nodeId, job.seedHash(), job.algorithm(), threads); + thread.detach(); + } + } + else +# endif + { + std::thread thread(initDataset, 0, job.seedHash(), job.algorithm(), threads); + thread.detach(); + } d_ptr->unlock(); } @@ -134,69 +221,56 @@ void xmrig::Rx::stop() } -bool xmrig::Rx::isReady(const uint8_t *seed, const Algorithm &algorithm) +bool xmrig::Rx::isReady(const uint8_t *seed, const Algorithm &algorithm, uint32_t nodeId) { - return !d_ptr->datasets.empty() && d_ptr->datasets[0] != nullptr && d_ptr->datasets[0]->isReady(seed, algorithm); + return !d_ptr->datasets.empty() && d_ptr->datasets[nodeId] != nullptr && d_ptr->datasets[nodeId]->isReady(seed, algorithm); } -void xmrig::Rx::initDataset(size_t index, const uint8_t *seed, const Algorithm &algorithm, uint32_t threads, bool hugePages) +void xmrig::Rx::initDataset(uint32_t nodeId, const uint8_t *seed, const Algorithm &algorithm, uint32_t threads) { d_ptr->lock(); - if (!d_ptr->datasets[index]) { - const uint64_t ts = Chrono::steadyMSecs(); + RxDataset *dataset = d_ptr->datasets[nodeId]; - LOG_INFO("%s" MAGENTA_BOLD(" allocate") CYAN_BOLD(" %zu MB") BLACK_BOLD(" (%zu+%zu) for RandomX dataset & cache"), - tag, - (RxDataset::size() + RxCache::size()) / 1024 / 1024, - RxDataset::size() / 1024 / 1024, - RxCache::size() / 1024 / 1024 - ); - - d_ptr->datasets[index] = new RxDataset(hugePages); - - if (d_ptr->datasets[index]->get() != nullptr) { - const auto hugePages = d_ptr->datasets[index]->hugePages(); - const double percent = hugePages.first == 0 ? 0.0 : static_cast(hugePages.first) / hugePages.second * 100.0; - - LOG_INFO("%s" GREEN(" allocate done") " huge pages %s%u/%u %1.0f%%" CLEAR " %sJIT" BLACK_BOLD(" (%" PRIu64 " ms)"), - tag, - (hugePages.first == hugePages.second ? GREEN_BOLD_S : (hugePages.first == 0 ? RED_BOLD_S : YELLOW_BOLD_S)), - hugePages.first, - hugePages.second, - percent, - d_ptr->datasets[index]->cache()->isJIT() ? GREEN_BOLD_S "+" : RED_BOLD_S "-", - Chrono::steadyMSecs() - ts - ); - } - else { - LOG_WARN(CLEAR "%s" YELLOW_BOLD_S " failed to allocate RandomX dataset, switching to slow mode", tag); + if (!dataset) { +# ifdef XMRIG_FEATURE_HWLOC + if (d_ptr->numa) { + std::thread thread(RxPrivate::allocate, d_ptr, nodeId); + thread.join(); + } else +# endif + { + RxPrivate::allocate(d_ptr, nodeId); } + + dataset = d_ptr->datasets[nodeId]; } - if (!d_ptr->datasets[index]->isReady(seed, algorithm)) { + if (!dataset->isReady(seed, algorithm)) { const uint64_t ts = Chrono::steadyMSecs(); - if (d_ptr->datasets[index]->get() != nullptr) { - LOG_INFO("%s" MAGENTA_BOLD(" init dataset") " algo " WHITE_BOLD("%s (") CYAN_BOLD("%u") WHITE_BOLD(" threads)") BLACK_BOLD(" seed %s..."), + if (dataset->get() != nullptr) { + LOG_INFO("%s" CYAN_BOLD("#%u") MAGENTA_BOLD(" init dataset") " algo " WHITE_BOLD("%s (") CYAN_BOLD("%u") WHITE_BOLD(" threads)") BLACK_BOLD(" seed %s..."), tag, + nodeId, algorithm.shortName(), threads, Buffer::toHex(seed, 8).data() ); } else { - LOG_INFO("%s" MAGENTA_BOLD(" init cache") " algo " WHITE_BOLD("%s") BLACK_BOLD(" seed %s..."), + LOG_INFO("%s" CYAN_BOLD("#%u") MAGENTA_BOLD(" init cache") " algo " WHITE_BOLD("%s") BLACK_BOLD(" seed %s..."), tag, + nodeId, algorithm.shortName(), Buffer::toHex(seed, 8).data() ); } - d_ptr->datasets[index]->init(seed, algorithm, threads); + dataset->init(seed, algorithm, threads); - LOG_INFO("%s" GREEN(" init done") BLACK_BOLD(" (%" PRIu64 " ms)"), tag, Chrono::steadyMSecs() - ts); + LOG_INFO("%s" CYAN_BOLD("#%u") GREEN(" init done") BLACK_BOLD(" (%" PRIu64 " ms)"), tag, nodeId, Chrono::steadyMSecs() - ts); } d_ptr->unlock(); diff --git a/src/crypto/rx/Rx.h b/src/crypto/rx/Rx.h index 456dfe4e0..815c8690f 100644 --- a/src/crypto/rx/Rx.h +++ b/src/crypto/rx/Rx.h @@ -43,14 +43,14 @@ class Job; class Rx { public: - static bool isReady(const Job &job, int64_t affinity); - static RxDataset *dataset(int64_t affinity); - static void init(const Job &job, int initThreads, bool hugePages); + static bool isReady(const Job &job, uint32_t nodeId); + static RxDataset *dataset(uint32_t nodeId); + static void init(const Job &job, int initThreads, bool hugePages, bool numa); static void stop(); private: - static bool isReady(const uint8_t *seed, const Algorithm &algorithm); - static void initDataset(size_t index, const uint8_t *seed, const Algorithm &algorithm, uint32_t threads, bool hugePages); + static bool isReady(const uint8_t *seed, const Algorithm &algorithm, uint32_t nodeId); + static void initDataset(uint32_t nodeId, const uint8_t *seed, const Algorithm &algorithm, uint32_t threads); };