diff --git a/CHANGELOG.md b/CHANGELOG.md index b6dedbcf3..ee596c3b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# v6.8.2 +- [#2080](https://github.com/xmrig/xmrig/pull/2080) Fixed compile error in Termux. +- [#2089](https://github.com/xmrig/xmrig/pull/2089) Optimized CryptoNight-Heavy for Zen3, 7-8% speedup. + # v6.8.1 - [#2064](https://github.com/xmrig/xmrig/pull/2064) Added documentation for config.json CPU options. - [#2066](https://github.com/xmrig/xmrig/issues/2066) Fixed AMD GPUs health data readings on Linux. diff --git a/doc/releases/5_0_1/SHA256SUMS b/doc/releases/5_0_1/SHA256SUMS deleted file mode 100644 index 1d94745aa..000000000 --- a/doc/releases/5_0_1/SHA256SUMS +++ /dev/null @@ -1,5 +0,0 @@ -6bb1a2e3a0fbca5195be6022f2a9fbff8a353c37c7542e7ab89420cb45b64505 xmrig-5.0.1-gcc-win32.zip -24dba9ec281acfb2ea2c401ebd0e4e2d1f1ee5fd557da5ff3c7049020c1f78b6 xmrig-5.0.1-gcc-win64.zip -86d65c6693ec9e35cd7547329580638b85c9eb0cf8383892a1c15199de5b556f xmrig-5.0.1-msvc-cuda10_1-win64.zip -0fbfe518b1c4b6993b0f66ff01302626375b15620ccf8f64d6fb97845068ffca xmrig-5.0.1-msvc-win64.zip -aa34890738a3494de2fa0e44db346937fea7339852f5f10b5d4655f95e2d8f1f xmrig-5.0.1-xenial-x64.tar.gz diff --git a/doc/releases/5_0_1/SHA256SUMS.sig b/doc/releases/5_0_1/SHA256SUMS.sig deleted file mode 100644 index 678e2e6bd..000000000 --- a/doc/releases/5_0_1/SHA256SUMS.sig +++ /dev/null @@ -1,11 +0,0 @@ ------BEGIN PGP SIGNATURE----- - -iQEzBAABCgAdFiEEmsTOqOZuNaXHzdwbRGpTY4vpRAkFAl3VcsoACgkQRGpTY4vp -RAm9vQgA1MyTUU2jley2TCYLUzQy2Fffc8fbXYv64r44jbWOjC/6qo2iIlRgPhIc -oVyPKr5TYS3QjDzCEm8IvozS0YudS6soESbPzqDonboK8pd0K4bsML9TQY2feV7A -NL5vln0rfVHp1wxLLrQpfBqAgvJUXEyaHece6gFQN79JOGhEo2bHL2NyrOl+FViS -b2BaMtXq410Fh+XT6ShnOaG/2EuO8ZqSGdCO6A/2LHQw1UY+mZiCvue6P6B06HmB -WD/urOv38V389v+V+Sp4UlEW6VpBOOjvtChoVWtLt+tKzydrnt2EmoWWWg475pka -4G6whHuMWS8CTt5/PDhJpvVXNQTIOw== -=C764 ------END PGP SIGNATURE----- diff --git a/src/backend/cpu/CpuBackend.cpp b/src/backend/cpu/CpuBackend.cpp index 1e47a6c5f..e4e82aa6b 100644 --- a/src/backend/cpu/CpuBackend.cpp +++ b/src/backend/cpu/CpuBackend.cpp @@ -81,6 +81,7 @@ public: inline void start(const std::vector &threads, size_t memory) { + m_workersMemory.clear(); m_hugePages.reset(); m_memory = memory; m_started = 0; @@ -95,8 +96,10 @@ public: if (ready) { m_started++; - m_hugePages += worker->memory()->hugePages(); - m_ways += worker->intensity(); + if (m_workersMemory.insert(worker->memory()).second) { + m_hugePages += worker->memory()->hugePages(); + } + m_ways += worker->intensity(); } else { m_errors++; @@ -126,6 +129,7 @@ public: } private: + std::set m_workersMemory; HugePagesInfo m_hugePages; size_t m_errors = 0; size_t m_memory = 0; diff --git a/src/backend/cpu/CpuConfig.cpp b/src/backend/cpu/CpuConfig.cpp index 3f7430408..8965f6c91 100644 --- a/src/backend/cpu/CpuConfig.cpp +++ b/src/backend/cpu/CpuConfig.cpp @@ -103,12 +103,16 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const size_t xmrig::CpuConfig::memPoolSize() const { - return m_memoryPool < 0 ? Cpu::info()->threads() : m_memoryPool; + return m_memoryPool < 0 ? std::max(Cpu::info()->threads(), Cpu::info()->L3() >> 21) : m_memoryPool; } std::vector xmrig::CpuConfig::get(const Miner *miner, const Algorithm &algorithm) const { + if (algorithm.family() == Algorithm::KAWPOW) { + return {}; + } + std::vector out; const auto &threads = m_threads.get(algorithm); diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index f55d153c5..794d773ff 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -19,8 +19,10 @@ #include #include +#include +#include "backend/cpu/Cpu.h" #include "backend/cpu/CpuWorker.h" #include "base/tools/Chrono.h" #include "core/config/Config.h" @@ -55,6 +57,12 @@ namespace xmrig { static constexpr uint32_t kReserveCount = 32768; + +#ifdef XMRIG_ALGO_CN_HEAVY +static std::mutex cn_heavyZen3MemoryMutex; +VirtualMemory* cn_heavyZen3Memory = nullptr; +#endif + } // namespace xmrig @@ -73,7 +81,20 @@ xmrig::CpuWorker::CpuWorker(size_t id, const CpuLaunchData &data) : m_threads(data.threads), m_ctx() { - m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node()); +# ifdef XMRIG_ALGO_CN_HEAVY + // cn-heavy optimization for Zen3 CPUs + if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) { + std::lock_guard lock(cn_heavyZen3MemoryMutex); + if (!cn_heavyZen3Memory) { + cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * m_threads, data.hugePages, false, false, node()); + } + m_memory = cn_heavyZen3Memory; + } + else +# endif + { + m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node()); + } } @@ -85,7 +106,13 @@ xmrig::CpuWorker::~CpuWorker() # endif CnCtx::release(m_ctx, N); - delete m_memory; + +# ifdef XMRIG_ALGO_CN_HEAVY + if (m_memory != cn_heavyZen3Memory) +# endif + { + delete m_memory; + } } @@ -387,7 +414,16 @@ template void xmrig::CpuWorker::allocateCnCtx() { if (m_ctx[0] == nullptr) { - CnCtx::create(m_ctx, m_memory->scratchpad(), m_algorithm.l3(), N); + int shift = 0; + +# ifdef XMRIG_ALGO_CN_HEAVY + // cn-heavy optimization for Zen3 CPUs + if (m_memory == cn_heavyZen3Memory) { + shift = (id() / 8) * m_algorithm.l3() * 8 + (id() % 8) * 64; + } +# endif + + CnCtx::create(m_ctx, m_memory->scratchpad() + shift, m_algorithm.l3(), N); } } diff --git a/src/backend/cpu/platform/HwlocCpuInfo.cpp b/src/backend/cpu/platform/HwlocCpuInfo.cpp index b1dd7ca92..01efc4b04 100644 --- a/src/backend/cpu/platform/HwlocCpuInfo.cpp +++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp @@ -363,10 +363,14 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith return; } + std::vector> threads_data; + threads_data.reserve(cores.size()); + size_t pu_id = 0; while (cacheHashes > 0 && PUs > 0) { bool allocated_pu = false; + threads_data.clear(); for (hwloc_obj_t core : cores) { const std::vector units = findByType(core, HWLOC_OBJ_PU); if (units.size() <= pu_id) { @@ -377,18 +381,31 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith PUs--; allocated_pu = true; - threads.add(units[pu_id]->os_index, intensity); + threads_data.emplace_back(units[pu_id]->os_index, intensity); if (cacheHashes == 0) { break; } } + // Reversing of "threads_data" and "cores" is done to fill in virtual cores starting from the last one, but still in order + // For example, cn-heavy threads on 6-core Zen2/Zen3 will have affinity [0,2,4,6,8,10,9,11] + // This is important for Zen3 cn-heavy optimization + + if (pu_id & 1) { + std::reverse(threads_data.begin(), threads_data.end()); + } + + for (const auto& t : threads_data) { + threads.add(t.first, t.second); + } + if (!allocated_pu) { break; } pu_id++; + std::reverse(cores.begin(), cores.end()); } # endif } diff --git a/src/config.json b/src/config.json index bfb01696d..8b5532d5e 100644 --- a/src/config.json +++ b/src/config.json @@ -37,9 +37,9 @@ "asm": true, "argon2-impl": null, "astrobwt-max-size": 550, + "astrobwt-avx2": false, "cn/0": false, - "cn-lite/0": false, - "kawpow": false + "cn-lite/0": false }, "opencl": { "enabled": false, diff --git a/src/core/config/Config_default.h b/src/core/config/Config_default.h index afeecbcec..dd7a20832 100644 --- a/src/core/config/Config_default.h +++ b/src/core/config/Config_default.h @@ -67,9 +67,9 @@ R"===( "asm": true, "argon2-impl": null, "astrobwt-max-size": 550, + "astrobwt-avx2": false, "cn/0": false, - "cn-lite/0": false, - "kawpow": false + "cn-lite/0": false }, "opencl": { "enabled": false, diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 9a9e5ea5f..89caa1b02 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -49,15 +49,15 @@ #define ADD_FN(algo) \ - m_map[algo][AV_SINGLE][Assembly::NONE] = cryptonight_single_hash; \ - m_map[algo][AV_SINGLE_SOFT][Assembly::NONE] = cryptonight_single_hash; \ - m_map[algo][AV_DOUBLE][Assembly::NONE] = cryptonight_double_hash; \ - m_map[algo][AV_DOUBLE_SOFT][Assembly::NONE] = cryptonight_double_hash; \ - m_map[algo][AV_TRIPLE][Assembly::NONE] = cryptonight_triple_hash; \ - m_map[algo][AV_TRIPLE_SOFT][Assembly::NONE] = cryptonight_triple_hash; \ - m_map[algo][AV_QUAD][Assembly::NONE] = cryptonight_quad_hash; \ - m_map[algo][AV_QUAD_SOFT][Assembly::NONE] = cryptonight_quad_hash; \ - m_map[algo][AV_PENTA][Assembly::NONE] = cryptonight_penta_hash; \ + m_map[algo][AV_SINGLE][Assembly::NONE] = cryptonight_single_hash; \ + m_map[algo][AV_SINGLE_SOFT][Assembly::NONE] = cryptonight_single_hash; \ + m_map[algo][AV_DOUBLE][Assembly::NONE] = cryptonight_double_hash; \ + m_map[algo][AV_DOUBLE_SOFT][Assembly::NONE] = cryptonight_double_hash; \ + m_map[algo][AV_TRIPLE][Assembly::NONE] = cryptonight_triple_hash; \ + m_map[algo][AV_TRIPLE_SOFT][Assembly::NONE] = cryptonight_triple_hash; \ + m_map[algo][AV_QUAD][Assembly::NONE] = cryptonight_quad_hash; \ + m_map[algo][AV_QUAD_SOFT][Assembly::NONE] = cryptonight_quad_hash; \ + m_map[algo][AV_PENTA][Assembly::NONE] = cryptonight_penta_hash; \ m_map[algo][AV_PENTA_SOFT][Assembly::NONE] = cryptonight_penta_hash; @@ -298,6 +298,22 @@ xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av, return nullptr; } +# ifdef XMRIG_ALGO_CN_HEAVY + // cn-heavy optimization for Zen3 CPUs + if ((av == AV_SINGLE) && (xmrig::Cpu::info()->arch() == xmrig::ICpuInfo::ARCH_ZEN3)) { + switch (algorithm.id()) { + case xmrig::Algorithm::CN_HEAVY_0: + return cryptonight_single_hash; + case xmrig::Algorithm::CN_HEAVY_TUBE: + return cryptonight_single_hash; + case xmrig::Algorithm::CN_HEAVY_XHV: + return cryptonight_single_hash; + default: + break; + } + } +# endif + # ifdef XMRIG_FEATURE_ASM cn_hash_fun fun = cnHash.m_map[algorithm][av][Cpu::assembly(assembly)]; if (fun) { diff --git a/src/crypto/cn/CryptoNight_arm.h b/src/crypto/cn/CryptoNight_arm.h index 9899c27a1..e8a91c359 100644 --- a/src/crypto/cn/CryptoNight_arm.h +++ b/src/crypto/cn/CryptoNight_arm.h @@ -431,7 +431,7 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) } -template +template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { constexpr CnAlgo props; diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h index 48748cf08..7cc4e062f 100644 --- a/src/crypto/cn/CryptoNight_x86.h +++ b/src/crypto/cn/CryptoNight_x86.h @@ -306,7 +306,21 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 namespace xmrig { -template +template +static inline constexpr uint64_t interleaved_index(uint64_t k) +{ + return ((k & ~63ULL) << interleave) | (k & 63); +} + + +template<> +inline constexpr uint64_t interleaved_index<0>(uint64_t k) +{ + return k; +} + + +template static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) { constexpr CnAlgo props; @@ -343,6 +357,11 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) } for (size_t i = 0; i < props.memory() / sizeof(__m128i); i += 8) { + if (interleave > 0) { + _mm_prefetch((const char*)(output), _MM_HINT_T0); + _mm_prefetch((const char*)(output + (64 << interleave) / sizeof(__m128i)), _MM_HINT_T0); + } + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); @@ -354,19 +373,21 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - _mm_store_si128(output + i + 0, xin0); - _mm_store_si128(output + i + 1, xin1); - _mm_store_si128(output + i + 2, xin2); - _mm_store_si128(output + i + 3, xin3); - _mm_store_si128(output + i + 4, xin4); - _mm_store_si128(output + i + 5, xin5); - _mm_store_si128(output + i + 6, xin6); - _mm_store_si128(output + i + 7, xin7); + _mm_store_si128(output + 0, xin0); + _mm_store_si128(output + 1, xin1); + _mm_store_si128(output + 2, xin2); + _mm_store_si128(output + 3, xin3); + output += (64 << interleave) / sizeof(__m128i); + _mm_store_si128(output + 0, xin4); + _mm_store_si128(output + 1, xin5); + _mm_store_si128(output + 2, xin6); + _mm_store_si128(output + 3, xin7); + output += (64 << interleave) / sizeof(__m128i); } } -template +template static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) { constexpr CnAlgo props; @@ -387,15 +408,25 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) xout6 = _mm_load_si128(output + 10); xout7 = _mm_load_si128(output + 11); - for (size_t i = 0; i < props.memory() / sizeof(__m128i); i += 8) { - xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); - xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); - xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); - xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); - xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); - xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); - xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); - xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + const __m128i* input_begin = input; + for (size_t i = 0; i < props.memory() / sizeof(__m128i);) { + xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3); + input += (64 << interleave) / sizeof(__m128i); + xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7); + input += (64 << interleave) / sizeof(__m128i); + + i += 8; + + if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) { + _mm_prefetch((const char*)(input), _MM_HINT_T0); + _mm_prefetch((const char*)(input + (64 << interleave) / sizeof(__m128i)), _MM_HINT_T0); + } aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); @@ -414,15 +445,25 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) } if (IS_HEAVY) { - for (size_t i = 0; i < props.memory() / sizeof(__m128i); i += 8) { - xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); - xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); - xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); - xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); - xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); - xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); - xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); - xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + input = input_begin; + for (size_t i = 0; i < props.memory() / sizeof(__m128i);) { + xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3); + input += (64 << interleave) / sizeof(__m128i); + xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7); + input += (64 << interleave) / sizeof(__m128i); + + i += 8; + + if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) { + _mm_prefetch((const char*)(input), _MM_HINT_T0); + _mm_prefetch((const char*)(input + (64 << interleave) / sizeof(__m128i)), _MM_HINT_T0); + } aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); @@ -558,7 +599,7 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); } -template +template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { constexpr CnAlgo props; @@ -577,7 +618,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si } keccak(input, size, ctx[0]->state); - cn_explode_scratchpad(reinterpret_cast(ctx[0]->state), reinterpret_cast<__m128i *>(ctx[0]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[0]->state), reinterpret_cast<__m128i *>(ctx[0]->memory)); uint64_t *h0 = reinterpret_cast(ctx[0]->state); uint8_t *l0 = ctx[0]->memory; @@ -620,7 +661,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si for (size_t i = 0; i < props.iterations(); i++) { __m128i cx; if (IS_CN_HEAVY_TUBE || !SOFT_AES) { - cx = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK])); + cx = _mm_load_si128(reinterpret_cast(&l0[interleaved_index(idx0 & MASK)])); if (ALGO == Algorithm::CN_CCX) { cryptonight_conceal_tweak(cx, conc_var); } @@ -632,12 +673,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si } else if (SOFT_AES) { if (ALGO == Algorithm::CN_CCX) { - cx = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK])); + cx = _mm_load_si128(reinterpret_cast(&l0[interleaved_index(idx0 & MASK)])); cryptonight_conceal_tweak(cx, conc_var); cx = soft_aesenc(&cx, ax0, reinterpret_cast(saes_table)); } else { - cx = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast(saes_table)); + cx = soft_aesenc(&l0[interleaved_index(idx0 & MASK)], ax0, reinterpret_cast(saes_table)); } } else { @@ -645,16 +686,16 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si } if (BASE == Algorithm::CN_1 || BASE == Algorithm::CN_2) { - cryptonight_monero_tweak(reinterpret_cast(&l0[idx0 & MASK]), l0, idx0 & MASK, ax0, bx0, bx1, cx); + cryptonight_monero_tweak(reinterpret_cast(&l0[interleaved_index(idx0 & MASK)]), l0, idx0 & MASK, ax0, bx0, bx1, cx); } else { - _mm_store_si128(reinterpret_cast<__m128i *>(&l0[idx0 & MASK]), _mm_xor_si128(bx0, cx)); + _mm_store_si128(reinterpret_cast<__m128i *>(&l0[interleaved_index(idx0 & MASK)]), _mm_xor_si128(bx0, cx)); } idx0 = static_cast(_mm_cvtsi128_si64(cx)); uint64_t hi, lo, cl, ch; - cl = (reinterpret_cast(&l0[idx0 & MASK]))[0]; - ch = (reinterpret_cast(&l0[idx0 & MASK]))[1]; + cl = (reinterpret_cast(&l0[interleaved_index(idx0 & MASK)]))[0]; + ch = (reinterpret_cast(&l0[interleaved_index(idx0 & MASK)]))[1]; if (BASE == Algorithm::CN_2) { if (props.isR()) { @@ -681,14 +722,14 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si al0 += hi; ah0 += lo; - reinterpret_cast(&l0[idx0 & MASK])[0] = al0; + reinterpret_cast(&l0[interleaved_index(idx0 & MASK)])[0] = al0; if (IS_CN_HEAVY_TUBE || ALGO == Algorithm::CN_RTO) { - reinterpret_cast(&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0; + reinterpret_cast(&l0[interleaved_index(idx0 & MASK)])[1] = ah0 ^ tweak1_2_0 ^ al0; } else if (BASE == Algorithm::CN_1) { - reinterpret_cast(&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; + reinterpret_cast(&l0[interleaved_index(idx0 & MASK)])[1] = ah0 ^ tweak1_2_0; } else { - reinterpret_cast(&l0[idx0 & MASK])[1] = ah0; + reinterpret_cast(&l0[interleaved_index(idx0 & MASK)])[1] = ah0; } al0 ^= cl; @@ -697,11 +738,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si # ifdef XMRIG_ALGO_CN_HEAVY if (props.isHeavy()) { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t n = ((int64_t*)&l0[interleaved_index(idx0 & MASK)])[0]; + int32_t d = ((int32_t*)&l0[interleaved_index(idx0 & MASK)])[2]; int64_t q = n / (d | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + ((int64_t*)&l0[interleaved_index(idx0 & MASK)])[0] = n ^ q; if (ALGO == Algorithm::CN_HEAVY_XHV) { d = ~d; @@ -722,7 +763,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si } # endif - cn_implode_scratchpad(reinterpret_cast(ctx[0]->memory), reinterpret_cast<__m128i *>(ctx[0]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[0]->memory), reinterpret_cast<__m128i *>(ctx[0]->state)); keccakf(h0, 24); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } @@ -810,7 +851,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ } keccak(input, size, ctx[0]->state); - cn_explode_scratchpad(reinterpret_cast(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory)); if (ALGO == Algorithm::CN_2) { if (ASM == Assembly::INTEL) { @@ -887,7 +928,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ ctx[0]->generated_code(ctx); } - cn_implode_scratchpad(reinterpret_cast(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); keccakf(reinterpret_cast(ctx[0]->state), 24); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } @@ -909,8 +950,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ keccak(input, size, ctx[0]->state); keccak(input + size, size, ctx[1]->state); - cn_explode_scratchpad(reinterpret_cast(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory)); - cn_explode_scratchpad(reinterpret_cast(ctx[1]->state), reinterpret_cast<__m128i*>(ctx[1]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[1]->state), reinterpret_cast<__m128i*>(ctx[1]->memory)); if (ALGO == Algorithm::CN_2) { cnv2_double_mainloop_sandybridge_asm(ctx); @@ -939,8 +980,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ ctx[0]->generated_code(ctx); } - cn_implode_scratchpad(reinterpret_cast(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); - cn_implode_scratchpad(reinterpret_cast(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state)); keccakf(reinterpret_cast(ctx[0]->state), 24); keccakf(reinterpret_cast(ctx[1]->state), 24); @@ -991,8 +1032,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si VARIANT4_RANDOM_MATH_INIT(0); VARIANT4_RANDOM_MATH_INIT(1); - cn_explode_scratchpad(reinterpret_cast(h0), reinterpret_cast<__m128i *>(l0)); - cn_explode_scratchpad(reinterpret_cast(h1), reinterpret_cast<__m128i *>(l1)); + cn_explode_scratchpad(reinterpret_cast(h0), reinterpret_cast<__m128i *>(l0)); + cn_explode_scratchpad(reinterpret_cast(h1), reinterpret_cast<__m128i *>(l1)); uint64_t al0 = h0[0] ^ h0[4]; uint64_t al1 = h1[0] ^ h1[4]; @@ -1187,8 +1228,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si bx10 = cx1; } - cn_implode_scratchpad(reinterpret_cast(l0), reinterpret_cast<__m128i *>(h0)); - cn_implode_scratchpad(reinterpret_cast(l1), reinterpret_cast<__m128i *>(h1)); + cn_implode_scratchpad(reinterpret_cast(l0), reinterpret_cast<__m128i *>(h0)); + cn_implode_scratchpad(reinterpret_cast(l1), reinterpret_cast<__m128i *>(h1)); keccakf(h0, 24); keccakf(h1, 24); @@ -1333,7 +1374,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si for (size_t i = 0; i < 3; i++) { keccak(input + size * i, size, ctx[i]->state); - cn_explode_scratchpad(reinterpret_cast(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory)); } uint8_t* l0 = ctx[0]->memory; @@ -1378,7 +1419,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si } for (size_t i = 0; i < 3; i++) { - cn_implode_scratchpad(reinterpret_cast(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state)); keccakf(reinterpret_cast(ctx[i]->state), 24); extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i); } @@ -1407,7 +1448,7 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size for (size_t i = 0; i < 4; i++) { keccak(input + size * i, size, ctx[i]->state); - cn_explode_scratchpad(reinterpret_cast(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory)); } uint8_t* l0 = ctx[0]->memory; @@ -1460,7 +1501,7 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size } for (size_t i = 0; i < 4; i++) { - cn_implode_scratchpad(reinterpret_cast(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state)); keccakf(reinterpret_cast(ctx[i]->state), 24); extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i); } @@ -1489,7 +1530,7 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz for (size_t i = 0; i < 5; i++) { keccak(input + size * i, size, ctx[i]->state); - cn_explode_scratchpad(reinterpret_cast(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory)); + cn_explode_scratchpad(reinterpret_cast(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory)); } uint8_t* l0 = ctx[0]->memory; @@ -1550,7 +1591,7 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz } for (size_t i = 0; i < 5; i++) { - cn_implode_scratchpad(reinterpret_cast(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state)); + cn_implode_scratchpad(reinterpret_cast(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state)); keccakf(reinterpret_cast(ctx[i]->state), 24); extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i); } diff --git a/src/crypto/common/VirtualMemory_unix.cpp b/src/crypto/common/VirtualMemory_unix.cpp index e3362bc1b..edda231f1 100644 --- a/src/crypto/common/VirtualMemory_unix.cpp +++ b/src/crypto/common/VirtualMemory_unix.cpp @@ -65,18 +65,13 @@ #endif -namespace xmrig { - - -#ifdef XMRIG_OS_LINUX +#if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(__FreeBSD__)) static inline int hugePagesFlag(size_t size) { return (static_cast(log2(size)) & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; } #endif -} // namespace xmrig - bool xmrig::VirtualMemory::isHugepagesAvailable() { @@ -165,7 +160,7 @@ void *xmrig::VirtualMemory::allocateExecutableMemory(size_t size, bool hugePages void *xmrig::VirtualMemory::allocateLargePagesMemory(size_t size) { -# if defined(__APPLE__) +# if defined(XMRIG_OS_APPLE) void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); # elif defined(__FreeBSD__) void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0); diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp index a485b382f..19523abe6 100644 --- a/src/crypto/rx/Rx.cpp +++ b/src/crypto/rx/Rx.cpp @@ -88,7 +88,12 @@ void xmrig::Rx::init(IRxListener *listener) template bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu) { - if (seed.algorithm().family() != Algorithm::RANDOM_X) { + const Algorithm::Family f = seed.algorithm().family(); + if ((f != Algorithm::RANDOM_X) +# ifdef XMRIG_ALGO_CN_HEAVY + && (f != Algorithm::CN_HEAVY) +# endif + ) { # ifdef XMRIG_FEATURE_MSR RxMsr::destroy(); # endif @@ -96,16 +101,22 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu return true; } - randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode()); - randomx_set_huge_pages_jit(cpu.isHugePagesJit()); - randomx_set_optimized_dataset_init(config.initDatasetAVX2()); - # ifdef XMRIG_FEATURE_MSR if (!RxMsr::isInitialized()) { RxMsr::init(config, cpu.threads().get(seed.algorithm()).data()); } # endif +# ifdef XMRIG_ALGO_CN_HEAVY + if (f == Algorithm::CN_HEAVY) { + return true; + } +# endif + + randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode()); + randomx_set_huge_pages_jit(cpu.isHugePagesJit()); + randomx_set_optimized_dataset_init(config.initDatasetAVX2()); + if (!osInitialized) { # ifdef XMRIG_FIX_RYZEN RxFix::setupMainLoopExceptionFrame(); diff --git a/src/version.h b/src/version.h index 4bac4d04e..ab76d0d67 100644 --- a/src/version.h +++ b/src/version.h @@ -28,7 +28,7 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "XMRig miner" -#define APP_VERSION "6.8.1" +#define APP_VERSION "6.8.2-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2021 xmrig.com" @@ -36,7 +36,7 @@ #define APP_VER_MAJOR 6 #define APP_VER_MINOR 8 -#define APP_VER_PATCH 1 +#define APP_VER_PATCH 2 #ifdef _MSC_VER # if (_MSC_VER >= 1920)