Optimized cn-heavy for Zen3

- Uses scratchpad interleaving to access only the closest L3 slice from each CPU core.
- Also activates the MSR mod for cn-heavy, because the CPU prefetchers get confused by the interleaving.
- 7-8% speedup on Zen3.
2025-03-20 14:19:10 +00:00 · 2021-02-07 22:05:11 +01:00 · 2021-02-07 22:05:11 +01:00 · 8af8df25aa
commit 8af8df25aa
parent b1e14dc1d3
8 changed files with 187 additions and 81 deletions
--- a/src/backend/cpu/CpuBackend.cpp
+++ b/src/backend/cpu/CpuBackend.cpp
@ -81,6 +81,7 @@ public:

    inline void start(const std::vector<CpuLaunchData> &threads, size_t memory)
    {
+        m_workersMemory.clear();
        m_hugePages.reset();
        m_memory    = memory;
        m_started   = 0;
@ -95,8 +96,10 @@ public:
        if (ready) {
            m_started++;

-            m_hugePages += worker->memory()->hugePages();
-            m_ways      += worker->intensity();
+            if (m_workersMemory.insert(worker->memory()).second) {
+                m_hugePages += worker->memory()->hugePages();
+            }
+            m_ways += worker->intensity();
        }
        else {
            m_errors++;
@ -126,6 +129,7 @@ public:
    }

 private:
+    std::set<const VirtualMemory*> m_workersMemory;
    HugePagesInfo m_hugePages;
    size_t m_errors       = 0;
    size_t m_memory       = 0;
--- a/src/backend/cpu/CpuConfig.cpp
+++ b/src/backend/cpu/CpuConfig.cpp
@ -103,7 +103,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const

 size_t xmrig::CpuConfig::memPoolSize() const
 {
-    return m_memoryPool < 0 ? Cpu::info()->threads() : m_memoryPool;
+    return m_memoryPool < 0 ? std::max(Cpu::info()->threads(), Cpu::info()->L3() >> 21) : m_memoryPool;
 }


--- a/src/backend/cpu/CpuWorker.cpp
+++ b/src/backend/cpu/CpuWorker.cpp
@ -19,8 +19,10 @@

 #include <cassert>
 #include <thread>
+#include <mutex>


+#include "backend/cpu/Cpu.h"
 #include "backend/cpu/CpuWorker.h"
 #include "base/tools/Chrono.h"
 #include "core/config/Config.h"
@ -55,6 +57,12 @@ namespace xmrig {

 static constexpr uint32_t kReserveCount = 32768;

+
+#ifdef XMRIG_ALGO_CN_HEAVY
+static std::mutex cn_heavyZen3MemoryMutex;
+VirtualMemory* cn_heavyZen3Memory = nullptr;
+#endif
+
 } // namespace xmrig


@ -73,7 +81,20 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
    m_threads(data.threads),
    m_ctx()
 {
-    m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
+#   ifdef XMRIG_ALGO_CN_HEAVY
+    // cn-heavy optimization for Zen3 CPUs
+    if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
+        std::lock_guard<std::mutex> lock(cn_heavyZen3MemoryMutex);
+        if (!cn_heavyZen3Memory) {
+            cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * m_threads, data.hugePages, false, false, node());
+        }
+        m_memory = cn_heavyZen3Memory;
+    }
+    else
+#   endif
+    {
+        m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
+    }
 }


@ -85,7 +106,13 @@ xmrig::CpuWorker<N>::~CpuWorker()
 #   endif

    CnCtx::release(m_ctx, N);
-    delete m_memory;
+
+#   ifdef XMRIG_ALGO_CN_HEAVY
+    if (m_memory != cn_heavyZen3Memory)
+#   endif
+    {
+        delete m_memory;
+    }
 }


@ -387,7 +414,16 @@ template<size_t N>
 void xmrig::CpuWorker<N>::allocateCnCtx()
 {
    if (m_ctx[0] == nullptr) {
-        CnCtx::create(m_ctx, m_memory->scratchpad(), m_algorithm.l3(), N);
+        int shift = 0;
+
+#       ifdef XMRIG_ALGO_CN_HEAVY
+        // cn-heavy optimization for Zen3 CPUs
+        if (m_memory == cn_heavyZen3Memory) {
+            shift = (id() / 8) * m_algorithm.l3() * 8 + (id() % 8) * 64;
+        }
+#       endif
+
+        CnCtx::create(m_ctx, m_memory->scratchpad() + shift, m_algorithm.l3(), N);
    }
 }

--- a/src/backend/cpu/platform/HwlocCpuInfo.cpp
+++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp
@ -363,10 +363,14 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
        return;
    }

+    std::vector<std::pair<int64_t, int32_t>> threads_data;
+    threads_data.reserve(cores.size());
+
    size_t pu_id = 0;
    while (cacheHashes > 0 && PUs > 0) {
        bool allocated_pu = false;

+        threads_data.clear();
        for (hwloc_obj_t core : cores) {
            const std::vector<hwloc_obj_t> units = findByType(core, HWLOC_OBJ_PU);
            if (units.size() <= pu_id) {
@ -377,18 +381,31 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
            PUs--;

            allocated_pu = true;
-            threads.add(units[pu_id]->os_index, intensity);
+            threads_data.emplace_back(units[pu_id]->os_index, intensity);

            if (cacheHashes == 0) {
                break;
            }
        }

+        // "threads_data" and "cores" are reversed so that virtual (SMT) cores are filled in starting from the last core, while the threads are still added in ascending order
+        // For example, cn-heavy threads on a 6-core Zen2/Zen3 CPU will have affinity [0,2,4,6,8,10,9,11]
+        // This ordering is important for the Zen3 cn-heavy optimization
+
+        if (pu_id & 1) {
+            std::reverse(threads_data.begin(), threads_data.end());
+        }
+
+        for (const auto& t : threads_data) {
+            threads.add(t.first, t.second);
+        }
+
        if (!allocated_pu) {
            break;
        }

        pu_id++;
+        std::reverse(cores.begin(), cores.end());
    }
 #   endif
 }
--- a/src/crypto/cn/CnHash.cpp
+++ b/src/crypto/cn/CnHash.cpp
@ -49,15 +49,15 @@


 #define ADD_FN(algo) \
-    m_map[algo][AV_SINGLE][Assembly::NONE]      = cryptonight_single_hash<algo, false>; \
-    m_map[algo][AV_SINGLE_SOFT][Assembly::NONE] = cryptonight_single_hash<algo, true>;  \
-    m_map[algo][AV_DOUBLE][Assembly::NONE]      = cryptonight_double_hash<algo, false>; \
-    m_map[algo][AV_DOUBLE_SOFT][Assembly::NONE] = cryptonight_double_hash<algo, true>;  \
-    m_map[algo][AV_TRIPLE][Assembly::NONE]      = cryptonight_triple_hash<algo, false>; \
-    m_map[algo][AV_TRIPLE_SOFT][Assembly::NONE] = cryptonight_triple_hash<algo, true>;  \
-    m_map[algo][AV_QUAD][Assembly::NONE]        = cryptonight_quad_hash<algo,   false>; \
-    m_map[algo][AV_QUAD_SOFT][Assembly::NONE]   = cryptonight_quad_hash<algo,   true>;  \
-    m_map[algo][AV_PENTA][Assembly::NONE]       = cryptonight_penta_hash<algo,  false>; \
+    m_map[algo][AV_SINGLE][Assembly::NONE]      = cryptonight_single_hash<algo, false, 0>; \
+    m_map[algo][AV_SINGLE_SOFT][Assembly::NONE] = cryptonight_single_hash<algo, true,  0>; \
+    m_map[algo][AV_DOUBLE][Assembly::NONE]      = cryptonight_double_hash<algo, false>;    \
+    m_map[algo][AV_DOUBLE_SOFT][Assembly::NONE] = cryptonight_double_hash<algo, true>;     \
+    m_map[algo][AV_TRIPLE][Assembly::NONE]      = cryptonight_triple_hash<algo, false>;    \
+    m_map[algo][AV_TRIPLE_SOFT][Assembly::NONE] = cryptonight_triple_hash<algo, true>;     \
+    m_map[algo][AV_QUAD][Assembly::NONE]        = cryptonight_quad_hash<algo,   false>;    \
+    m_map[algo][AV_QUAD_SOFT][Assembly::NONE]   = cryptonight_quad_hash<algo,   true>;     \
+    m_map[algo][AV_PENTA][Assembly::NONE]       = cryptonight_penta_hash<algo,  false>;    \
    m_map[algo][AV_PENTA_SOFT][Assembly::NONE]  = cryptonight_penta_hash<algo,  true>;


@ -298,6 +298,22 @@ xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av,
        return nullptr;
    }

+#   ifdef XMRIG_ALGO_CN_HEAVY
+    // cn-heavy optimization for Zen3 CPUs
+    if ((av == AV_SINGLE) && (xmrig::Cpu::info()->arch() == xmrig::ICpuInfo::ARCH_ZEN3)) {
+        switch (algorithm.id()) {
+        case xmrig::Algorithm::CN_HEAVY_0:
+            return cryptonight_single_hash<xmrig::Algorithm::CN_HEAVY_0, false, 3>;
+        case xmrig::Algorithm::CN_HEAVY_TUBE:
+            return cryptonight_single_hash<xmrig::Algorithm::CN_HEAVY_TUBE, false, 3>;
+        case xmrig::Algorithm::CN_HEAVY_XHV:
+            return cryptonight_single_hash<xmrig::Algorithm::CN_HEAVY_XHV, false, 3>;
+        default:
+            break;
+        }
+    }
+#   endif
+
 #   ifdef XMRIG_FEATURE_ASM
    cn_hash_fun fun = cnHash.m_map[algorithm][av][Cpu::assembly(assembly)];
    if (fun) {
--- a/src/crypto/cn/CryptoNight_arm.h
+++ b/src/crypto/cn/CryptoNight_arm.h
@ -431,7 +431,7 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 }


-template<Algorithm::Id ALGO, bool SOFT_AES>
+template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
 inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
 {
    constexpr CnAlgo<ALGO> props;
--- a/src/crypto/cn/CryptoNight_x86.h
+++ b/src/crypto/cn/CryptoNight_x86.h
@ -306,7 +306,21 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3
 namespace xmrig {


-template<Algorithm::Id ALGO, bool SOFT_AES>
+template<int interleave>
+static inline constexpr uint64_t interleaved_index(uint64_t k)
+{
+    return ((k & ~63ULL) << interleave) | (k & 63);
+}
+
+
+template<>
+inline constexpr uint64_t interleaved_index<0>(uint64_t k)
+{
+    return k;
+}
+
+
+template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
 static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
 {
    constexpr CnAlgo<ALGO> props;
@ -354,19 +368,21 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
        aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
        aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);

-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+        _mm_store_si128(output + 0, xin0);
+        _mm_store_si128(output + 1, xin1);
+        _mm_store_si128(output + 2, xin2);
+        _mm_store_si128(output + 3, xin3);
+        output += (64 << interleave) / sizeof(__m128i);
+        _mm_store_si128(output + 0, xin4);
+        _mm_store_si128(output + 1, xin5);
+        _mm_store_si128(output + 2, xin6);
+        _mm_store_si128(output + 3, xin7);
+        output += (64 << interleave) / sizeof(__m128i);
    }
 }


-template<Algorithm::Id ALGO, bool SOFT_AES>
+template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
 static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
 {
    constexpr CnAlgo<ALGO> props;
@ -387,15 +403,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
    xout6 = _mm_load_si128(output + 10);
    xout7 = _mm_load_si128(output + 11);

+    const __m128i* input_begin = input;
    for (size_t i = 0; i < props.memory() / sizeof(__m128i); i += 8) {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
+        input += (64 << interleave) / sizeof(__m128i);
+        xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7);
+        input += (64 << interleave) / sizeof(__m128i);

        aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
        aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@ -414,15 +433,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
    }

    if (IS_HEAVY) {
+        input = input_begin;
        for (size_t i = 0; i < props.memory() / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+            xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
+            input += (64 << interleave) / sizeof(__m128i);
+            xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7);
+            input += (64 << interleave) / sizeof(__m128i);

            aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
            aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@ -558,7 +580,7 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
    cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
 }

-template<Algorithm::Id ALGO, bool SOFT_AES>
+template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
 inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
 {
    constexpr CnAlgo<ALGO> props;
@ -577,7 +599,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
    }

    keccak(input, size, ctx[0]->state);
-    cn_explode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i *>(ctx[0]->state), reinterpret_cast<__m128i *>(ctx[0]->memory));
+    cn_explode_scratchpad<ALGO, SOFT_AES, interleave>(reinterpret_cast<const __m128i *>(ctx[0]->state), reinterpret_cast<__m128i *>(ctx[0]->memory));

    uint64_t *h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
    uint8_t *l0   = ctx[0]->memory;
@ -620,7 +642,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
    for (size_t i = 0; i < props.iterations(); i++) {
        __m128i cx;
        if (IS_CN_HEAVY_TUBE || !SOFT_AES) {
-            cx = _mm_load_si128(reinterpret_cast<const __m128i *>(&l0[idx0 & MASK]));
+            cx = _mm_load_si128(reinterpret_cast<const __m128i *>(&l0[interleaved_index<interleave>(idx0 & MASK)]));
            if (ALGO == Algorithm::CN_CCX) {
                cryptonight_conceal_tweak(cx, conc_var);
            }
@ -632,12 +654,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
        }
        else if (SOFT_AES) {
            if (ALGO == Algorithm::CN_CCX) {
-                cx = _mm_load_si128(reinterpret_cast<const __m128i*>(&l0[idx0 & MASK]));
+                cx = _mm_load_si128(reinterpret_cast<const __m128i*>(&l0[interleaved_index<interleave>(idx0 & MASK)]));
                cryptonight_conceal_tweak(cx, conc_var);
                cx = soft_aesenc(&cx, ax0, reinterpret_cast<const uint32_t*>(saes_table));
            }
            else {
-                cx = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast<const uint32_t*>(saes_table));
+                cx = soft_aesenc(&l0[interleaved_index<interleave>(idx0 & MASK)], ax0, reinterpret_cast<const uint32_t*>(saes_table));
            }
        }
        else {
@ -645,16 +667,16 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
        }

        if (BASE == Algorithm::CN_1 || BASE == Algorithm::CN_2) {
-            cryptonight_monero_tweak<ALGO>(reinterpret_cast<uint64_t*>(&l0[idx0 & MASK]), l0, idx0 & MASK, ax0, bx0, bx1, cx);
+            cryptonight_monero_tweak<ALGO>(reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)]), l0, idx0 & MASK, ax0, bx0, bx1, cx);
        } else {
-            _mm_store_si128(reinterpret_cast<__m128i *>(&l0[idx0 & MASK]), _mm_xor_si128(bx0, cx));
+            _mm_store_si128(reinterpret_cast<__m128i *>(&l0[interleaved_index<interleave>(idx0 & MASK)]), _mm_xor_si128(bx0, cx));
        }

        idx0 = static_cast<uint64_t>(_mm_cvtsi128_si64(cx));

        uint64_t hi, lo, cl, ch;
-        cl = (reinterpret_cast<uint64_t*>(&l0[idx0 & MASK]))[0];
-        ch = (reinterpret_cast<uint64_t*>(&l0[idx0 & MASK]))[1];
+        cl = (reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)]))[0];
+        ch = (reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)]))[1];

        if (BASE == Algorithm::CN_2) {
            if (props.isR()) {
@ -681,14 +703,14 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
        al0 += hi;
        ah0 += lo;

-        reinterpret_cast<uint64_t*>(&l0[idx0 & MASK])[0] = al0;
+        reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)])[0] = al0;

        if (IS_CN_HEAVY_TUBE || ALGO == Algorithm::CN_RTO) {
-            reinterpret_cast<uint64_t*>(&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
+            reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)])[1] = ah0 ^ tweak1_2_0 ^ al0;
        } else if (BASE == Algorithm::CN_1) {
-            reinterpret_cast<uint64_t*>(&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
+            reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)])[1] = ah0 ^ tweak1_2_0;
        } else {
-            reinterpret_cast<uint64_t*>(&l0[idx0 & MASK])[1] = ah0;
+            reinterpret_cast<uint64_t*>(&l0[interleaved_index<interleave>(idx0 & MASK)])[1] = ah0;
        }

        al0 ^= cl;
@ -697,11 +719,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si

 #       ifdef XMRIG_ALGO_CN_HEAVY
        if (props.isHeavy()) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[interleaved_index<interleave>(idx0 & MASK)])[0];
+            int32_t d = ((int32_t*)&l0[interleaved_index<interleave>(idx0 & MASK)])[2];
            int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+            ((int64_t*)&l0[interleaved_index<interleave>(idx0 & MASK)])[0] = n ^ q;

            if (ALGO == Algorithm::CN_HEAVY_XHV) {
                d = ~d;
@ -722,7 +744,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
    }
 #   endif

-    cn_implode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i *>(ctx[0]->memory), reinterpret_cast<__m128i *>(ctx[0]->state));
+    cn_implode_scratchpad<ALGO, SOFT_AES, interleave>(reinterpret_cast<const __m128i *>(ctx[0]->memory), reinterpret_cast<__m128i *>(ctx[0]->state));
    keccakf(h0, 24);
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
@ -810,7 +832,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
    }

    keccak(input, size, ctx[0]->state);
-    cn_explode_scratchpad<ALGO, false>(reinterpret_cast<const __m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));
+    cn_explode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));

    if (ALGO == Algorithm::CN_2) {
        if (ASM == Assembly::INTEL) {
@ -887,7 +909,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
        ctx[0]->generated_code(ctx);
    }

-    cn_implode_scratchpad<ALGO, false>(reinterpret_cast<const __m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
+    cn_implode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
    keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
@ -909,8 +931,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
    keccak(input,        size, ctx[0]->state);
    keccak(input + size, size, ctx[1]->state);

-    cn_explode_scratchpad<ALGO, false>(reinterpret_cast<const __m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));
-    cn_explode_scratchpad<ALGO, false>(reinterpret_cast<const __m128i*>(ctx[1]->state), reinterpret_cast<__m128i*>(ctx[1]->memory));
+    cn_explode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));
+    cn_explode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[1]->state), reinterpret_cast<__m128i*>(ctx[1]->memory));

    if (ALGO == Algorithm::CN_2) {
        cnv2_double_mainloop_sandybridge_asm(ctx);
@ -939,8 +961,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
        ctx[0]->generated_code(ctx);
    }

-    cn_implode_scratchpad<ALGO, false>(reinterpret_cast<const __m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
-    cn_implode_scratchpad<ALGO, false>(reinterpret_cast<const __m128i*>(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state));
+    cn_implode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
+    cn_implode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state));

    keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
    keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
@ -991,8 +1013,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
    VARIANT4_RANDOM_MATH_INIT(0);
    VARIANT4_RANDOM_MATH_INIT(1);

-    cn_explode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i *>(h0), reinterpret_cast<__m128i *>(l0));
-    cn_explode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i *>(h1), reinterpret_cast<__m128i *>(l1));
+    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(h0), reinterpret_cast<__m128i *>(l0));
+    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(h1), reinterpret_cast<__m128i *>(l1));

    uint64_t al0 = h0[0] ^ h0[4];
    uint64_t al1 = h1[0] ^ h1[4];
@ -1187,8 +1209,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
        bx10 = cx1;
    }

-    cn_implode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i *>(l0), reinterpret_cast<__m128i *>(h0));
-    cn_implode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i *>(l1), reinterpret_cast<__m128i *>(h1));
+    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(l0), reinterpret_cast<__m128i *>(h0));
+    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(l1), reinterpret_cast<__m128i *>(h1));

    keccakf(h0, 24);
    keccakf(h1, 24);
@ -1333,7 +1355,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si

    for (size_t i = 0; i < 3; i++) {
        keccak(input + size * i, size, ctx[i]->state);
-        cn_explode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
    }

    uint8_t* l0  = ctx[0]->memory;
@ -1378,7 +1400,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
    }

    for (size_t i = 0; i < 3; i++) {
-        cn_implode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
        keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
        extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
    }
@ -1407,7 +1429,7 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size

    for (size_t i = 0; i < 4; i++) {
        keccak(input + size * i, size, ctx[i]->state);
-        cn_explode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
    }

    uint8_t* l0  = ctx[0]->memory;
@ -1460,7 +1482,7 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
    }

    for (size_t i = 0; i < 4; i++) {
-        cn_implode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
        keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
        extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
    }
@ -1489,7 +1511,7 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz

    for (size_t i = 0; i < 5; i++) {
        keccak(input + size * i, size, ctx[i]->state);
-        cn_explode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
    }

    uint8_t* l0  = ctx[0]->memory;
@ -1550,7 +1572,7 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
    }

    for (size_t i = 0; i < 5; i++) {
-        cn_implode_scratchpad<ALGO, SOFT_AES>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
        keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
        extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
    }
--- a/src/crypto/rx/Rx.cpp
+++ b/src/crypto/rx/Rx.cpp
@ -88,7 +88,12 @@ void xmrig::Rx::init(IRxListener *listener)
 template<typename T>
 bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu)
 {
-    if (seed.algorithm().family() != Algorithm::RANDOM_X) {
+    const Algorithm::Family f = seed.algorithm().family();
+    if ((f != Algorithm::RANDOM_X)
+#       ifdef XMRIG_ALGO_CN_HEAVY
+        && (f != Algorithm::CN_HEAVY)
+#       endif
+        ) {
 #       ifdef XMRIG_FEATURE_MSR
        RxMsr::destroy();
 #       endif
@ -96,16 +101,22 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu
        return true;
    }

-    randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode());
-    randomx_set_huge_pages_jit(cpu.isHugePagesJit());
-    randomx_set_optimized_dataset_init(config.initDatasetAVX2());
-
 #   ifdef XMRIG_FEATURE_MSR
    if (!RxMsr::isInitialized()) {
        RxMsr::init(config, cpu.threads().get(seed.algorithm()).data());
    }
 #   endif

+#   ifdef XMRIG_ALGO_CN_HEAVY
+    if (f == Algorithm::CN_HEAVY) {
+        return true;
+    }
+#   endif
+
+    randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode());
+    randomx_set_huge_pages_jit(cpu.isHugePagesJit());
+    randomx_set_optimized_dataset_init(config.initDatasetAVX2());
+
    if (!osInitialized) {
 #       ifdef XMRIG_FIX_RYZEN
        RxFix::setupMainLoopExceptionFrame();