Merge branch 'dev'

2024-12-23 12:09:22 +00:00 · 2021-04-20 20:55:35 +07:00 · 2021-04-20 20:55:35 +07:00 · e2c757d9dd
commit e2c757d9dd
parent e53e48b88c da35de993f
43 changed files with 6619 additions and 5742 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,9 @@
+# v6.12.0
+- [#2276](https://github.com/xmrig/xmrig/pull/2276) Added support for Uplexa (`cn/upx2` algorithm).
+- [#2261](https://github.com/xmrig/xmrig/pull/2261) Show the total hashrate even when compiled without OpenCL.
+- [#2289](https://github.com/xmrig/xmrig/pull/2289) RandomX: optimized `IMUL_RCP` instruction.
+- Added support for the `--user` command line option for the online benchmark.
+
 # v6.11.2
 - [#2207](https://github.com/xmrig/xmrig/issues/2207) Fixed regression in HTTP parser and llhttp updated to v5.1.0.

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -5,6 +5,7 @@ option(WITH_HWLOC           "Enable hwloc support" ON)
 option(WITH_CN_LITE         "Enable CryptoNight-Lite algorithms family" ON)
 option(WITH_CN_HEAVY        "Enable CryptoNight-Heavy algorithms family" ON)
 option(WITH_CN_PICO         "Enable CryptoNight-Pico algorithm" ON)
+option(WITH_CN_FEMTO        "Enable CryptoNight-UPX2 algorithm" ON)
 option(WITH_RANDOMX         "Enable RandomX algorithms family" ON)
 option(WITH_ARGON2          "Enable Argon2 algorithms family" ON)
 option(WITH_ASTROBWT        "Enable AstroBWT algorithms family" ON)
@ -196,6 +197,10 @@ if (WITH_CN_PICO)
    add_definitions(/DXMRIG_ALGO_CN_PICO)
 endif()

+if (WITH_CN_FEMTO)
+    add_definitions(/DXMRIG_ALGO_CN_FEMTO)
+endif()
+
 if (WITH_EMBEDDED_CONFIG)
    add_definitions(/DXMRIG_FEATURE_EMBEDDED_CONFIG)
 endif()
--- a/src/backend/cpu/CpuBackend.cpp
+++ b/src/backend/cpu/CpuBackend.cpp
@ -334,13 +334,11 @@ void xmrig::CpuBackend::printHashrate(bool details)
         i++;
    }

-#   ifdef XMRIG_FEATURE_OPENCL
    Log::print(WHITE_BOLD_S "|        - |        - | %7s | %7s | %7s |",
               Hashrate::format(hashrate()->calc(Hashrate::ShortInterval),  num,         sizeof num / 3),
               Hashrate::format(hashrate()->calc(Hashrate::MediumInterval), num + 8,     sizeof num / 3),
               Hashrate::format(hashrate()->calc(Hashrate::LargeInterval),  num + 8 * 2, sizeof num / 3)
               );
-#   endif
 }


--- a/src/backend/cpu/CpuConfig.cpp
+++ b/src/backend/cpu/CpuConfig.cpp
@ -197,6 +197,7 @@ void xmrig::CpuConfig::generate()
    count += xmrig::generate<Algorithm::CN_LITE>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::CN_HEAVY>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::CN_PICO>(m_threads, m_limit);
+    count += xmrig::generate<Algorithm::CN_FEMTO>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::RANDOM_X>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::ARGON2>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::ASTROBWT>(m_threads, m_limit);
--- a/src/backend/cpu/CpuConfig_gen.h
+++ b/src/backend/cpu/CpuConfig_gen.h
@ -100,6 +100,15 @@ size_t inline generate<Algorithm::CN_PICO>(Threads<CpuThreads> &threads, uint32_
 #endif


+#ifdef XMRIG_ALGO_CN_FEMTO
+template<>
+size_t inline generate<Algorithm::CN_FEMTO>(Threads<CpuThreads>& threads, uint32_t limit)
+{
+    return generate("cn/upx2", threads, Algorithm::CN_UPX2, limit);
+}
+#endif
+
+
 #ifdef XMRIG_ALGO_RANDOMX
 template<>
 size_t inline generate<Algorithm::RANDOM_X>(Threads<CpuThreads> &threads, uint32_t limit)
--- a/src/backend/cpu/CpuWorker.cpp
+++ b/src/backend/cpu/CpuWorker.cpp
@ -193,6 +193,12 @@ bool xmrig::CpuWorker<N>::selfTest()
    }
 #   endif

+#   ifdef XMRIG_ALGO_CN_FEMTO
+    if (m_algorithm.family() == Algorithm::CN_FEMTO) {
+        return verify(Algorithm::CN_UPX2, test_output_femto_upx2);
+    }
+#   endif
+
 #   ifdef XMRIG_ALGO_ARGON2
    if (m_algorithm.family() == Algorithm::ARGON2) {
        return verify(Algorithm::AR2_CHUKWA, argon2_chukwa_test_out) &&
--- a/src/backend/cpu/platform/BasicCpuInfo.cpp
+++ b/src/backend/cpu/platform/BasicCpuInfo.cpp
@ -309,26 +309,34 @@ xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint3
        return 1;
    }

+    Algorithm::Family f = algorithm.family();
+
 #   ifdef XMRIG_ALGO_CN_LITE
-    if (algorithm.family() == Algorithm::CN_LITE) {
+    if (f == Algorithm::CN_LITE) {
        return CpuThreads(count, 1);
    }
 #   endif

 #   ifdef XMRIG_ALGO_CN_PICO
-    if (algorithm.family() == Algorithm::CN_PICO) {
+    if (f == Algorithm::CN_PICO) {
+        return CpuThreads(count, 2);
+    }
+#   endif
+
+#   ifdef XMRIG_ALGO_CN_FEMTO
+    if (f == Algorithm::CN_FEMTO) {
        return CpuThreads(count, 2);
    }
 #   endif

 #   ifdef XMRIG_ALGO_CN_HEAVY
-    if (algorithm.family() == Algorithm::CN_HEAVY) {
+    if (f == Algorithm::CN_HEAVY) {
        return CpuThreads(std::max<size_t>(count / 4, 1), 1);
    }
 #   endif

 #   ifdef XMRIG_ALGO_RANDOMX
-    if (algorithm.family() == Algorithm::RANDOM_X) {
+    if (f == Algorithm::RANDOM_X) {
        if (algorithm == Algorithm::RX_WOW) {
            return count;
        }
@ -338,13 +346,13 @@ xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint3
 #   endif

 #   ifdef XMRIG_ALGO_ARGON2
-    if (algorithm.family() == Algorithm::ARGON2) {
+    if (f == Algorithm::ARGON2) {
        return count;
    }
 #   endif

 #   ifdef XMRIG_ALGO_ASTROBWT
-    if (algorithm.family() == Algorithm::ASTROBWT) {
+    if (f == Algorithm::ASTROBWT) {
        CpuThreads threads;
        for (size_t i = 0; i < count; ++i) {
            threads.add(i, 0);
--- a/src/backend/cpu/platform/HwlocCpuInfo.cpp
+++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp
@ -336,11 +336,10 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith

    size_t cacheHashes = ((L3 + extra) + (scratchpad / 2)) / scratchpad;

-#   ifdef XMRIG_ALGO_CN_PICO
-    if (intensity && algorithm == Algorithm::CN_PICO_0 && (cacheHashes / PUs) >= 2) {
+    Algorithm::Family family = algorithm.family();
+    if (intensity && ((family == Algorithm::CN_PICO) || (family == Algorithm::CN_FEMTO)) && (cacheHashes / PUs) >= 2) {
        intensity = 2;
    }
-#   endif

 #   ifdef XMRIG_ALGO_RANDOMX
    if (extra == 0 && algorithm.l2() > 0) {
--- a/src/backend/cuda/CudaConfig.cpp
+++ b/src/backend/cuda/CudaConfig.cpp
@ -179,6 +179,7 @@ void xmrig::CudaConfig::generate()
    count += xmrig::generate<Algorithm::CN_LITE>(m_threads, devices);
    count += xmrig::generate<Algorithm::CN_HEAVY>(m_threads, devices);
    count += xmrig::generate<Algorithm::CN_PICO>(m_threads, devices);
+    count += xmrig::generate<Algorithm::CN_FEMTO>(m_threads, devices);
    count += xmrig::generate<Algorithm::RANDOM_X>(m_threads, devices);
    count += xmrig::generate<Algorithm::ASTROBWT>(m_threads, devices);
    count += xmrig::generate<Algorithm::KAWPOW>(m_threads, devices);
--- a/src/backend/cuda/CudaConfig_gen.h
+++ b/src/backend/cuda/CudaConfig_gen.h
@ -102,6 +102,15 @@ size_t inline generate<Algorithm::CN_PICO>(Threads<CudaThreads> &threads, const
 #endif


+#ifdef XMRIG_ALGO_CN_FEMTO
+template<>
+size_t inline generate<Algorithm::CN_FEMTO>(Threads<CudaThreads>& threads, const std::vector<CudaDevice>& devices)
+{
+    return generate("cn/upx2", threads, Algorithm::CN_UPX2, devices);
+}
+#endif
+
+
 #ifdef XMRIG_ALGO_RANDOMX
 template<>
 size_t inline generate<Algorithm::RANDOM_X>(Threads<CudaThreads> &threads, const std::vector<CudaDevice> &devices)
--- a/src/backend/opencl/OclConfig.cpp
+++ b/src/backend/opencl/OclConfig.cpp
@ -219,6 +219,7 @@ void xmrig::OclConfig::generate()
    count += xmrig::generate<Algorithm::CN_LITE>(m_threads, devices);
    count += xmrig::generate<Algorithm::CN_HEAVY>(m_threads, devices);
    count += xmrig::generate<Algorithm::CN_PICO>(m_threads, devices);
+    count += xmrig::generate<Algorithm::CN_FEMTO>(m_threads, devices);
    count += xmrig::generate<Algorithm::RANDOM_X>(m_threads, devices);
    count += xmrig::generate<Algorithm::ASTROBWT>(m_threads, devices);
    count += xmrig::generate<Algorithm::KAWPOW>(m_threads, devices);
--- a/src/backend/opencl/OclConfig_gen.h
+++ b/src/backend/opencl/OclConfig_gen.h
@ -101,6 +101,15 @@ size_t inline generate<Algorithm::CN_PICO>(Threads<OclThreads> &threads, const s
 #endif


+#ifdef XMRIG_ALGO_CN_FEMTO
+template<>
+size_t inline generate<Algorithm::CN_FEMTO>(Threads<OclThreads>& threads, const std::vector<OclDevice>& devices)
+{
+    return generate("cn/upx2", threads, Algorithm::CN_UPX2, devices);
+}
+#endif
+
+
 #ifdef XMRIG_ALGO_RANDOMX
 template<>
 size_t inline generate<Algorithm::RANDOM_X>(Threads<OclThreads> &threads, const std::vector<OclDevice> &devices)
--- a/src/backend/opencl/cl/cn/algorithm.cl
+++ b/src/backend/opencl/cl/cn/algorithm.cl
@ -17,16 +17,17 @@
 #define ALGO_CN_PICO_0      16
 #define ALGO_CN_PICO_TLO    17
 #define ALGO_CN_CCX         18
-#define ALGO_RX_0           19
-#define ALGO_RX_WOW         20
-#define ALGO_RX_ARQMA       21
-#define ALGO_RX_SFX         22
-#define ALGO_RX_KEVA        23
-#define ALGO_AR2_CHUKWA     24
-#define ALGO_AR2_CHUKWA_V2  25
-#define ALGO_AR2_WRKZ       26
-#define ALGO_ASTROBWT_DERO  27
-#define ALGO_KAWPOW_RVN     28
+#define ALGO_CN_UPX2        19
+#define ALGO_RX_0           20
+#define ALGO_RX_WOW         21
+#define ALGO_RX_ARQMA       22
+#define ALGO_RX_SFX         23
+#define ALGO_RX_KEVA        24
+#define ALGO_AR2_CHUKWA     25
+#define ALGO_AR2_CHUKWA_V2  26
+#define ALGO_AR2_WRKZ       27
+#define ALGO_ASTROBWT_DERO  28
+#define ALGO_KAWPOW_RVN     29

 #define FAMILY_UNKNOWN      0
 #define FAMILY_CN           1
--- a/src/backend/opencl/cl/cn/cryptonight.cl
+++ b/src/backend/opencl/cl/cn/cryptonight.cl
@ -514,7 +514,7 @@ __kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ul
        c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);

        {
-#           if (ALGO == ALGO_CN_RWZ)
+#           if ((ALGO == ALGO_CN_RWZ) || (ALGO == ALGO_CN_UPX2))
            const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3));
            const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
            const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));
@ -561,7 +561,7 @@ __kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ul
            t ^= chunk2;
            const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));

-#           if (ALGO == ALGO_CN_RWZ)
+#           if ((ALGO == ALGO_CN_RWZ) || (ALGO == ALGO_CN_UPX2))
            SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + bx1);
            SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + bx0);
            SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
--- a/src/backend/opencl/cl/cn/cryptonight_cl.h
+++ b/src/backend/opencl/cl/cn/cryptonight_cl.h
--- a/src/backend/opencl/cl/rx/randomx_cl.h
+++ b/src/backend/opencl/cl/rx/randomx_cl.h
--- a/src/backend/opencl/generators/ocl_vega_cn_generator.cpp
+++ b/src/backend/opencl/generators/ocl_vega_cn_generator.cpp
@ -88,7 +88,8 @@ static inline uint32_t getIntensity(const OclDevice &device, const Algorithm &al

 static inline uint32_t getWorksize(const Algorithm &algorithm)
 {
-    if (algorithm.family() == Algorithm::CN_PICO) {
+    Algorithm::Family f = algorithm.family();
+    if (f == Algorithm::CN_PICO || f == Algorithm::CN_FEMTO) {
        return 64;
    }

--- a/src/backend/opencl/runners/OclCnRunner.cpp
+++ b/src/backend/opencl/runners/OclCnRunner.cpp
@ -39,10 +39,12 @@
 xmrig::OclCnRunner::OclCnRunner(size_t index, const OclLaunchData &data) : OclBaseRunner(index, data)
 {
    uint32_t stridedIndex = data.thread.stridedIndex();
+    Algorithm::Family f = m_algorithm.family();
+
    if (data.device.vendorId() == OCL_VENDOR_NVIDIA) {
        stridedIndex = 0;
    }
-    else if (stridedIndex == 1 && (m_algorithm.family() == Algorithm::CN_PICO || (m_algorithm.family() == Algorithm::CN && CnAlgo<>::base(m_algorithm) == Algorithm::CN_2))) {
+    else if (stridedIndex == 1 && (f == Algorithm::CN_PICO || f == Algorithm::CN_FEMTO || (f == Algorithm::CN && CnAlgo<>::base(m_algorithm) == Algorithm::CN_2))) {
        stridedIndex = 2;
    }

--- a/src/base/crypto/Algorithm.cpp
+++ b/src/base/crypto/Algorithm.cpp
@ -6,8 +6,8 @@
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
- * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
- * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2018-2021 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2021 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -129,6 +129,11 @@ static AlgoName const algorithm_names[] = {
 #   endif
    { "cryptonight/ccx",           "cn/ccx",           Algorithm::CN_CCX          },
    { "cryptonight/conceal",       "cn/conceal",       Algorithm::CN_CCX          },
+#   ifdef XMRIG_ALGO_CN_FEMTO
+    { "cryptonight/upx2",          "cn/upx2",          Algorithm::CN_UPX2         },
+    { "cn-extremelite/upx2",       nullptr,            Algorithm::CN_UPX2         },
+    { "cryptonight-upx/2",         nullptr,            Algorithm::CN_UPX2         },
+#   endif
 };


@ -199,6 +204,9 @@ size_t xmrig::Algorithm::l3() const
    case CN_PICO:
        return oneMiB / 4;

+    case CN_FEMTO:
+        return oneMiB / 8;
+
    default:
        break;
    }
@ -329,6 +337,11 @@ xmrig::Algorithm::Family xmrig::Algorithm::family(Id id)
        return CN_PICO;
 #   endif

+#   ifdef XMRIG_ALGO_CN_FEMTO
+    case CN_UPX2:
+        return CN_FEMTO;
+#   endif
+
 #   ifdef XMRIG_ALGO_RANDOMX
    case RX_0:
    case RX_WOW:
--- a/src/base/crypto/Algorithm.h
+++ b/src/base/crypto/Algorithm.h
@ -6,8 +6,8 @@
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
- * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
- * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2018-2021 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2021 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -64,6 +64,7 @@ public:
        CN_PICO_0,     // "cn-pico"          CryptoNight-Pico
        CN_PICO_TLO,   // "cn-pico/tlo"      CryptoNight-Pico (TLO)
        CN_CCX,        // "cn/ccx"           Conceal (CCX)
+        CN_UPX2,       // "cn/upx2"          Uplexa (UPX2)
        RX_0,          // "rx/0"             RandomX (reference configuration).
        RX_WOW,        // "rx/wow"           RandomWOW (Wownero).
        RX_ARQ,        // "rx/arq"           RandomARQ (Arqma).
@ -83,6 +84,7 @@ public:
        CN_LITE,
        CN_HEAVY,
        CN_PICO,
+        CN_FEMTO,
        RANDOM_X,
        ARGON2,
        ASTROBWT,
@ -94,7 +96,7 @@ public:
    inline Algorithm(Id id) : m_id(id)                     {}
    Algorithm(const rapidjson::Value &value);

-    inline bool isCN() const                          { auto f = family(); return f == CN || f == CN_LITE || f == CN_HEAVY || f == CN_PICO; }
+    inline bool isCN() const                          { auto f = family(); return f == CN || f == CN_LITE || f == CN_HEAVY || f == CN_PICO || f == CN_FEMTO; }
    inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; }
    inline bool isValid() const                       { return m_id != INVALID && family() != UNKNOWN; }
    inline const char *name() const                   { return name(false); }
--- a/src/base/net/stratum/benchmark/BenchClient.cpp
+++ b/src/base/net/stratum/benchmark/BenchClient.cpp
@ -334,6 +334,7 @@ void xmrig::BenchClient::send(Request request)
        {
            doc.AddMember(StringRef(BenchConfig::kSize),    m_benchmark->size(), allocator);
            doc.AddMember(StringRef(BenchConfig::kAlgo),    m_benchmark->algorithm().toJSON(), allocator);
+            doc.AddMember(StringRef(BenchConfig::kUser),    m_benchmark->user().toJSON(), allocator);
            doc.AddMember("version",                        APP_VERSION, allocator);
            doc.AddMember("threads",                        m_threads, allocator);
            doc.AddMember("steady_ready_ts",                m_readyTime, allocator);
--- a/src/base/net/stratum/benchmark/BenchConfig.cpp
+++ b/src/base/net/stratum/benchmark/BenchConfig.cpp
@ -41,6 +41,7 @@ const char *BenchConfig::kSeed      = "seed";
 const char *BenchConfig::kSize      = "size";
 const char *BenchConfig::kSubmit    = "submit";
 const char *BenchConfig::kToken     = "token";
+const char *BenchConfig::kUser      = "user";
 const char *BenchConfig::kVerify    = "verify";

 #ifndef XMRIG_DEBUG_BENCHMARK_API
@ -59,8 +60,8 @@ xmrig::BenchConfig::BenchConfig(uint32_t size, const String &id, const rapidjson
    m_id(id),
    m_seed(Json::getString(object, kSeed)),
    m_token(Json::getString(object, kToken)),
-    m_size(size),
-    m_hash(0)
+    m_user(Json::getString(object, kUser)),
+    m_size(size)
 {
    if (!m_algorithm.isValid() || m_algorithm.family() != Algorithm::RANDOM_X) {
        m_algorithm = Algorithm::RX_0;
@ -111,6 +112,7 @@ rapidjson::Value xmrig::BenchConfig::toJSON(rapidjson::Document &doc) const
    out.AddMember(StringRef(kVerify),   m_id.toJSON(), allocator);
    out.AddMember(StringRef(kToken),    m_token.toJSON(), allocator);
    out.AddMember(StringRef(kSeed),     m_seed.toJSON(), allocator);
+    out.AddMember(StringRef(kUser),     m_user.toJSON(), allocator);

    if (m_hash) {
        out.AddMember(StringRef(kHash), Value(fmt::format("{:016X}", m_hash).c_str(), allocator), allocator);
--- a/src/base/net/stratum/benchmark/BenchConfig.h
+++ b/src/base/net/stratum/benchmark/BenchConfig.h
@ -39,6 +39,7 @@ public:
    static const char *kSize;
    static const char *kSubmit;
    static const char *kToken;
+    static const char *kUser;
    static const char *kVerify;

 #   ifndef XMRIG_DEBUG_BENCHMARK_API
@ -59,6 +60,7 @@ public:
    inline const String &id() const             { return m_id; }
    inline const String &seed() const           { return m_seed; }
    inline const String &token() const          { return m_token; }
+    inline const String &user() const           { return m_user; }
    inline uint32_t size() const                { return m_size; }
    inline uint64_t hash() const                { return m_hash; }

@ -73,8 +75,9 @@ private:
    String m_id;
    String m_seed;
    String m_token;
+    String m_user;
    uint32_t m_size;
-    uint64_t m_hash;
+    uint64_t m_hash = 0;
 };


--- a/src/core/config/ConfigTransform.cpp
+++ b/src/core/config/ConfigTransform.cpp
@ -262,6 +262,7 @@ void xmrig::ConfigTransform::transform(rapidjson::Document &doc, int key, const
    case IConfig::BenchTokenKey:    /* --token */
    case IConfig::BenchSeedKey:     /* --seed */
    case IConfig::BenchHashKey:     /* --hash */
+    case IConfig::UserKey:          /* --user */
        return transformBenchmark(doc, key, arg);
 #   endif

@ -347,6 +348,9 @@ void xmrig::ConfigTransform::transformBenchmark(rapidjson::Document &doc, int ke

    case IConfig::BenchHashKey: /* --hash */
        return set(doc, BenchConfig::kBenchmark, BenchConfig::kHash, arg);
+
+    case IConfig::UserKey: /* --user */
+        return set(doc, BenchConfig::kBenchmark, BenchConfig::kUser, arg);
    }
 }
 #endif
--- a/src/crypto/cn/CnAlgo.h
+++ b/src/crypto/cn/CnAlgo.h
@ -96,6 +96,11 @@ public:
            return CN_ITER / 8;
 #       endif

+#       ifdef XMRIG_ALGO_CN_FEMTO
+        case Algorithm::CN_UPX2:
+            return CN_ITER / 32;
+#       endif
+
        default:
            break;
        }
@ -111,6 +116,12 @@ public:
        }
 #       endif

+#       ifdef XMRIG_ALGO_CN_FEMTO
+        if (algo == Algorithm::CN_UPX2) {
+            return 0x1FFF0;
+        }
+#       endif
+
        return ((memory(algo) - 1) / 16) * 16;
    }

@ -149,6 +160,9 @@ public:
 #       ifdef XMRIG_ALGO_CN_PICO
        case Algorithm::CN_PICO_0:
        case Algorithm::CN_PICO_TLO:
+#       endif
+#       ifdef XMRIG_ALGO_CN_FEMTO
+        case Algorithm::CN_UPX2:
 #       endif
            return Algorithm::CN_2;

@ -176,6 +190,7 @@ template<> constexpr inline Algorithm::Id CnAlgo<Algorithm::CN_FAST>::base() con
 template<> constexpr inline Algorithm::Id CnAlgo<Algorithm::CN_RTO>::base() const           { return Algorithm::CN_1; }
 template<> constexpr inline Algorithm::Id CnAlgo<Algorithm::CN_LITE_1>::base() const        { return Algorithm::CN_1; }
 template<> constexpr inline Algorithm::Id CnAlgo<Algorithm::CN_HEAVY_TUBE>::base() const    { return Algorithm::CN_1; }
+template<> constexpr inline Algorithm::Id CnAlgo<Algorithm::CN_UPX2>::base() const          { return Algorithm::CN_2; }


 template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_FAST>::iterations() const         { return CN_ITER / 2; }
@ -192,6 +207,7 @@ template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_ZLS>::iterations() con
 template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::iterations() const       { return CN_ITER / 8; }
 template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_TLO>::iterations() const     { return CN_ITER / 8; }
 template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_CCX>::iterations() const          { return CN_ITER / 2; }
+template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::iterations() const         { return CN_ITER / 32; }


 template<> constexpr inline size_t CnAlgo<Algorithm::CN_LITE_0>::memory() const             { return CN_MEMORY / 2; }
@ -201,9 +217,11 @@ template<> constexpr inline size_t CnAlgo<Algorithm::CN_HEAVY_TUBE>::memory() co
 template<> constexpr inline size_t CnAlgo<Algorithm::CN_HEAVY_XHV>::memory() const          { return CN_MEMORY * 2; }
 template<> constexpr inline size_t CnAlgo<Algorithm::CN_PICO_0>::memory() const             { return CN_MEMORY / 8; }
 template<> constexpr inline size_t CnAlgo<Algorithm::CN_PICO_TLO>::memory() const           { return CN_MEMORY / 8; }
+template<> constexpr inline size_t CnAlgo<Algorithm::CN_UPX2>::memory() const               { return CN_MEMORY / 16; }


 template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::mask() const             { return 0x1FFF0; }
+template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::mask() const               { return 0x1FFF0; }


 } /* namespace xmrig */
--- a/src/crypto/cn/CnHash.cpp
+++ b/src/crypto/cn/CnHash.cpp
@ -99,8 +99,11 @@ cn_mainloop_fun        cn_double_mainloop_ryzen_asm               = nullptr;
 cn_mainloop_fun        cn_double_mainloop_bulldozer_asm           = nullptr;
 cn_mainloop_fun        cn_double_double_mainloop_sandybridge_asm  = nullptr;

+cn_mainloop_fun        cn_upx2_mainloop_asm                       = nullptr;
+cn_mainloop_fun        cn_upx2_double_mainloop_asm                = nullptr;

-template<typename T, typename U>
+
+template<Algorithm::Id SOURCE_ALGO = Algorithm::CN_2, typename T, typename U>
 static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask = CnAlgo<Algorithm::CN_HALF>().mask())
 {
    auto p = reinterpret_cast<const uint8_t*>(src);
@ -124,11 +127,11 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma
    auto patched_data = reinterpret_cast<uint8_t*>(dst);
    for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) {
        switch (*(uint32_t*)(patched_data + i)) {
-        case CnAlgo<Algorithm::CN_2>().iterations():
+        case CnAlgo<SOURCE_ALGO>().iterations():
            *(uint32_t*)(patched_data + i) = iterations;
            break;

-        case CnAlgo<Algorithm::CN_2>().mask():
+        case CnAlgo<SOURCE_ALGO>().mask():
            *(uint32_t*)(patched_data + i) = mask;
            break;
        }
@ -138,7 +141,7 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma

 static void patchAsmVariants()
 {
-    const int allocation_size = 81920;
+    const int allocation_size = 131072;
    auto base = static_cast<uint8_t *>(VirtualMemory::allocateExecutableMemory(allocation_size, false));

    cn_half_mainloop_ivybridge_asm              = reinterpret_cast<cn_mainloop_fun>         (base + 0x0000);
@ -170,6 +173,11 @@ static void patchAsmVariants()
    cn_tlo_double_mainloop_sandybridge_asm      = reinterpret_cast<cn_mainloop_fun>         (base + 0x13000);
 #   endif

+#   ifdef XMRIG_ALGO_CN_FEMTO
+    cn_upx2_mainloop_asm                        = reinterpret_cast<cn_mainloop_fun>         (base + 0x14000);
+    cn_upx2_double_mainloop_asm                 = reinterpret_cast<cn_mainloop_fun>         (base + 0x15000);
+#   endif
+
    {
        constexpr uint32_t ITER = CnAlgo<Algorithm::CN_HALF>().iterations();

@ -219,6 +227,16 @@ static void patchAsmVariants()
        patchCode(cn_double_double_mainloop_sandybridge_asm, cnv2_double_mainloop_sandybridge_asm,  ITER);
    }

+#   ifdef XMRIG_ALGO_CN_FEMTO
+    {
+        constexpr uint32_t ITER = CnAlgo<Algorithm::CN_UPX2>().iterations();
+        constexpr uint32_t MASK = CnAlgo<Algorithm::CN_UPX2>().mask();
+
+        patchCode<Algorithm::CN_RWZ>(cn_upx2_mainloop_asm,        cnv2_rwz_mainloop_asm,            ITER,   MASK);
+        patchCode<Algorithm::CN_RWZ>(cn_upx2_double_mainloop_asm, cnv2_rwz_double_mainloop_asm,     ITER,   MASK);
+    }
+#endif
+
    VirtualMemory::protectRX(base, allocation_size);
    VirtualMemory::flushInstructionCache(base, allocation_size);
 }
@ -272,6 +290,11 @@ xmrig::CnHash::CnHash()

    ADD_FN(Algorithm::CN_CCX);

+#   ifdef XMRIG_ALGO_CN_FEMTO
+    ADD_FN(Algorithm::CN_UPX2);
+    ADD_FN_ASM(Algorithm::CN_UPX2);
+#   endif
+
 #   ifdef XMRIG_ALGO_ARGON2
    m_map[Algorithm::AR2_CHUKWA][AV_SINGLE][Assembly::NONE]         = argon2::single_hash<Algorithm::AR2_CHUKWA>;
    m_map[Algorithm::AR2_CHUKWA][AV_SINGLE_SOFT][Assembly::NONE]    = argon2::single_hash<Algorithm::AR2_CHUKWA>;
--- a/src/crypto/cn/CryptoNight_arm.h
+++ b/src/crypto/cn/CryptoNight_arm.h
@ -395,7 +395,7 @@ static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m1
    uint64_t* mem_out = (uint64_t*)&l[idx];

    if (props.base() == Algorithm::CN_2) {
-        VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+        VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
        _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx));
    } else {
        __m128i tmp = _mm_xor_si128(bx0, cx);
@ -528,7 +528,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
            if (ALGO == Algorithm::CN_R) {
                VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx, 0);
            } else {
-                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
            }
        }

@ -704,7 +704,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
            if (ALGO == Algorithm::CN_R) {
                VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0, 0);
            } else {
-                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
            }
        }

@ -764,7 +764,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
            if (ALGO == Algorithm::CN_R) {
                VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1, 0);
            } else {
-                VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+                VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
            }
        }

--- a/src/crypto/cn/CryptoNight_test.h
+++ b/src/crypto/cn/CryptoNight_test.h
@ -370,6 +370,23 @@ const static uint8_t test_output_pico_tlo[160] = {
 #endif


+#ifdef XMRIG_ALGO_CN_FEMTO
+// "cn/upx2"
+const static uint8_t test_output_femto_upx2[160] = {
+    0xAA, 0xBB, 0xB8, 0xED, 0x14, 0xA8, 0x35, 0xFA, 0x22, 0xCF, 0xB1, 0xB5, 0xDE, 0xA8, 0x72, 0xB0,
+    0xA1, 0xD6, 0xCB, 0xD8, 0x46, 0xF4, 0x39, 0x1C, 0x0F, 0x01, 0xF3, 0x87, 0x5E, 0x3A, 0x37, 0x61,
+    0x38, 0x59, 0x15, 0x72, 0xF8, 0x20, 0xD4, 0xDE, 0x25, 0x3C, 0xF5, 0x5A, 0x21, 0x92, 0xB6, 0x22,
+    0xB0, 0x28, 0x9E, 0x2E, 0x5C, 0x36, 0x16, 0xE6, 0x1E, 0x78, 0x7A, 0x8F, 0xE4, 0x62, 0xEC, 0x5A,
+    0xFD, 0x58, 0xCC, 0x6F, 0x3C, 0xD3, 0x8A, 0x0B, 0x5B, 0x6C, 0x83, 0x4E, 0x9B, 0xD4, 0xC2, 0x5A,
+    0x43, 0x2C, 0x48, 0x98, 0xF3, 0x16, 0xCA, 0x87, 0xE9, 0x5F, 0x44, 0x93, 0x53, 0x48, 0x00, 0xA3,
+    0xE8, 0xE4, 0xB6, 0x9D, 0x5A, 0x3B, 0x49, 0x2C, 0x21, 0xE9, 0x4B, 0x02, 0xFC, 0x87, 0x8D, 0x75,
+    0x66, 0x05, 0xAF, 0xA3, 0x9D, 0xC9, 0xD8, 0x88, 0x2D, 0x67, 0x31, 0x21, 0x4C, 0x4D, 0x88, 0x7D,
+    0x86, 0x9E, 0x4D, 0x74, 0xF4, 0x4C, 0x57, 0x27, 0xCF, 0xEF, 0x86, 0x01, 0xB0, 0x52, 0x18, 0xF3,
+    0xAD, 0xE4, 0x52, 0x5E, 0xB0, 0x4A, 0x97, 0xB4, 0x96, 0x18, 0xB6, 0x9C, 0x93, 0x0E, 0x49, 0xBB,
+};
+#endif
+
+
 #ifdef XMRIG_ALGO_ARGON2
 // "argon2/chukwa"
 const static uint8_t argon2_chukwa_test_out[160] = {
--- a/src/crypto/cn/CryptoNight_x86.h
+++ b/src/crypto/cn/CryptoNight_x86.h
@ -563,7 +563,7 @@ static inline void cryptonight_monero_tweak(uint64_t *mem_out, const uint8_t *l,
    constexpr CnAlgo<ALGO> props;

    if (props.base() == Algorithm::CN_2) {
-        VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+        VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
        _mm_store_si128(reinterpret_cast<__m128i *>(mem_out), _mm_xor_si128(bx0, cx));
    } else {
        __m128i tmp = _mm_xor_si128(bx0, cx);
@ -715,7 +715,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
            if (ALGO == Algorithm::CN_R) {
                VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx, 0);
            } else {
-                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
            }
        }

@ -789,6 +789,7 @@ extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx);
 extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx);
 extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx);
 extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx);
+extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx);


 namespace xmrig {
@ -822,6 +823,8 @@ extern cn_mainloop_fun cn_double_mainloop_ryzen_asm;
 extern cn_mainloop_fun cn_double_mainloop_bulldozer_asm;
 extern cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm;

+extern cn_mainloop_fun cn_upx2_mainloop_asm;
+extern cn_mainloop_fun cn_upx2_double_mainloop_asm;

 } // namespace xmrig

@ -934,6 +937,11 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
            cn_double_mainloop_bulldozer_asm(ctx);
        }
    }
+#   ifdef XMRIG_ALGO_CN_FEMTO
+    else if (ALGO == Algorithm::CN_UPX2) {
+        cn_upx2_mainloop_asm(ctx);
+    }
+#   endif
    else if (props.isR()) {
        ctx[0]->generated_code(ctx);
    }
@ -976,6 +984,16 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
    else if (ALGO == Algorithm::CN_PICO_TLO) {
        cn_tlo_double_mainloop_sandybridge_asm(ctx);
    }
+#   endif
+#   ifdef XMRIG_ALGO_CN_FEMTO
+    else if (ALGO == Algorithm::CN_UPX2) {
+        if (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) {
+            cnv2_upx_double_mainloop_zen3_asm(ctx);
+        }
+        else {
+            cn_upx2_double_mainloop_asm(ctx);
+        }
+    }
 #   endif
    else if (ALGO == Algorithm::CN_RWZ) {
        cnv2_rwz_double_mainloop_asm(ctx);
@ -1134,7 +1152,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
            if (ALGO == Algorithm::CN_R) {
                VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0, 0);
            } else {
-                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
            }
        }

@ -1192,7 +1210,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
            if (ALGO == Algorithm::CN_R) {
                VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1, 0);
            } else {
-                VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));
+                VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0));
            }
        }

@ -1302,7 +1320,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
        if (ALGO == Algorithm::CN_R) {                                                                      \
            VARIANT2_SHUFFLE(l, idx & MASK, a, b0, b1, c, 0);                                               \
        } else {                                                                                            \
-            VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo, (ALGO == Algorithm::CN_RWZ ? 1 : 0));       \
+            VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo, (((ALGO == Algorithm::CN_RWZ) || (ALGO == Algorithm::CN_UPX2)) ? 1 : 0)); \
        }                                                                                                   \
    }                                                                                                       \
    if (ALGO == Algorithm::CN_R) {                                                                          \
--- a/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc
@ -0,0 +1,320 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232
+
+	mov	rdi, QWORD PTR [rcx+8]
+
+	mov	edx, 768
+	mov	rbx, QWORD PTR [rcx]
+	mov	ecx, 256
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	mov	r13, QWORD PTR [rdi+224]
+	movq	xmm0, QWORD PTR [rdi+104]
+	mov	r12, QWORD PTR [rbx+224]
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rax-136], xmm9
+	movaps	XMMWORD PTR [rax-152], xmm10
+	movaps	XMMWORD PTR [rsp+112], xmm11
+	movaps	XMMWORD PTR [rsp+96], xmm12
+	movaps	XMMWORD PTR [rsp+80], xmm13
+	movq	xmm13, QWORD PTR [rbx+96]
+	movaps	XMMWORD PTR [rsp+64], xmm14
+	movq	xmm14, QWORD PTR [rbx+104]
+	movaps	XMMWORD PTR [rsp+48], xmm15
+	movq	xmm15, QWORD PTR [rdi+96]
+	mov	QWORD PTR [rsp], r13
+	movdqa	XMMWORD PTR [rsp+32], xmm0
+
+	stmxcsr DWORD PTR [rsp+24]
+	mov DWORD PTR [rsp+28], 24448
+	ldmxcsr DWORD PTR [rsp+28]
+
+	mov	rcx, QWORD PTR [rbx+56]
+	xorps	xmm12, xmm12
+	xor	rcx, QWORD PTR [rbx+24]
+	mov	rax, QWORD PTR [rbx+48]
+	xor	rax, QWORD PTR [rbx+16]
+	mov	rsi, QWORD PTR [rbx+32]
+	mov	rbp, QWORD PTR [rdi+32]
+	movq	xmm0, rcx
+
+	mov	rcx, QWORD PTR [rbx+88]
+	xor	rcx, QWORD PTR [rbx+72]
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [rbx+80]
+	xor	rax, QWORD PTR [rbx+64]
+	mov	r14, QWORD PTR [rbx+40]
+	mov	r15, QWORD PTR [rdi+40]
+	xor	rsi, QWORD PTR [rbx]
+	xor	rbp, QWORD PTR [rdi]
+	movq	xmm9, rax
+
+	mov	rax, QWORD PTR [rdi+48]
+	xor	rax, QWORD PTR [rdi+16]
+	xor	r14, QWORD PTR [rbx+8]
+	xor	r15, QWORD PTR [rdi+8]
+	movq	xmm8, rax
+	punpcklqdq xmm7, xmm0
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm11, rax
+	punpcklqdq xmm11, xmm11
+
+	mov	rax, QWORD PTR [rdi+80]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdi+56]
+	xor	rcx, QWORD PTR [rdi+24]
+	punpcklqdq xmm9, xmm0
+	mov	QWORD PTR [rsp+8], 16384
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdi+88]
+	xor	rcx, QWORD PTR [rdi+72]
+	xor	rax, QWORD PTR [rdi+64]
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, rcx
+	movq	xmm10, rax
+	mov	rax, 4389456576511
+	mov	QWORD PTR [rsp+16], rax
+	mov	rax, -4389456576512
+	mov	QWORD PTR [rsp+216], rax
+	punpcklqdq xmm10, xmm0
+
+	ALIGN(64)
+upx2_main_loop:
+	mov	rdx, rsi
+	mov	r9, rbp
+	and	edx, 131056
+	and	r9d, 131056
+	movdqu	xmm6, XMMWORD PTR [rdx+r12]
+	lea	r8, QWORD PTR [rdx+r12]
+	movdqu	xmm4, XMMWORD PTR [r9+r13]
+	lea	r10, QWORD PTR [r9+r13]
+	mov	ecx, edx
+	mov	eax, edx
+	xor	rax, 32
+	xor	rcx, 48
+	xor	rdx, 16
+	movq	xmm0, r14
+	movq	xmm3, rsi
+	movq	xmm5, rbp
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, r15
+	movdqu	xmm2, XMMWORD PTR [rax+r12]
+	movdqu	xmm1, XMMWORD PTR [rcx+r12]
+	paddq	xmm2, xmm3
+	punpcklqdq xmm5, xmm0
+	paddq	xmm1, xmm7
+	aesenc	xmm6, xmm3
+	aesenc	xmm4, xmm5
+	movdqa	xmm0, xmm9
+	movq	rdi, xmm4
+	paddq	xmm0, XMMWORD PTR [rdx+r12]
+	movdqu	XMMWORD PTR [rdx+r12], xmm0
+	xor	edx, edx
+	movdqu	XMMWORD PTR [rax+r12], xmm1
+	movdqa	xmm0, xmm6
+	movdqu	XMMWORD PTR [rcx+r12], xmm2
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov	ecx, r9d
+	xor	rcx, 48
+	mov	eax, r9d
+	xor	rax, 32
+	xor	r9, 16
+	movdqa	xmm0, xmm10
+	movdqu	xmm1, XMMWORD PTR [rcx+r13]
+	movdqu	xmm2, XMMWORD PTR [rax+r13]
+	paddq	xmm1, xmm8
+	paddq	xmm0, XMMWORD PTR [r9+r13]
+	paddq	xmm2, xmm5
+	movdqu	XMMWORD PTR [r9+r13], xmm0
+	movq	r9, xmm6
+	movdqu	XMMWORD PTR [rax+r13], xmm1
+	movdqa	xmm0, xmm4
+	movdqu	XMMWORD PTR [rcx+r13], xmm2
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [r10], xmm0
+	movq	rcx, xmm14
+	mov	rax, rcx
+	movq	r10, xmm13
+	shl	rax, 32
+	movdqa	xmm0, xmm6
+	xor	r10, rax
+	psrldq	xmm0, 8
+	lea	r8, QWORD PTR [rcx+rcx]
+	movq	rax, xmm0
+	add	r8d, r9d
+	mov	ecx, -2147483647
+	or	r8, rcx
+	mov	r11, r9
+	div	r8
+	and	r11d, 131056
+	movaps	xmm1, xmm12
+	mov	eax, eax
+	add	r11, r12
+	shl	rdx, 32
+	add	rdx, rax
+	xor	r10, QWORD PTR [r11]
+	mov	rbx, QWORD PTR [r11+8]
+	lea	r8, QWORD PTR [rdx+r9]
+	movq	xmm13, rdx
+	mov	rax, r8
+	shr	rax, 12
+	movq	xmm0, rax
+	paddq	xmm0, xmm11
+	sqrtsd	xmm1, xmm0
+	movq	rdx, xmm1
+	mov	rax, rdx
+	shr	rdx, 19
+	shr	rax, 20
+	mov	rcx, rdx
+	sub	rcx, rax
+	add	rax, QWORD PTR [rsp+216]
+	sub	rcx, QWORD PTR [rsp+16]
+	mov	r13, QWORD PTR [rsp]
+	imul	rcx, rax
+	mov	rax, r10
+	sub	rcx, r8
+	mov	rcx, r9
+	adc	rdx, 0
+	xor	rcx, 32
+	and	ecx, 131056
+	movq	xmm14, rdx
+	movdqu	xmm1, XMMWORD PTR [rcx+r12]
+	mul	r9
+	paddq	xmm1, xmm3
+	mov	r8, rax
+	xor	r8, QWORD PTR [rcx+r12+8]
+	add	r14, r8
+	movq	xmm0, rax
+	movq	xmm2, rdx
+	xor	rdx, QWORD PTR [rcx+r12]
+	mov	rax, r9
+	xor	rax, 48
+	punpcklqdq xmm2, xmm0
+	and	eax, 131056
+	add	rsi, rdx
+	xor	r9, 16
+	xor	edx, edx
+	and	r9d, 131056
+	movdqu	xmm0, XMMWORD PTR [rax+r12]
+	paddq	xmm0, xmm7
+	pxor	xmm2, XMMWORD PTR [r9+r12]
+	paddq	xmm2, xmm9
+	movdqu	XMMWORD PTR [r9+r12], xmm2
+	movq	r9, xmm15
+	movdqu	XMMWORD PTR [rcx+r12], xmm0
+	movdqa	xmm0, xmm4
+	mov	rcx, QWORD PTR [rsp+32]
+	movdqu	XMMWORD PTR [rax+r12], xmm1
+	mov	rax, rcx
+	shl	rax, 32
+	movaps	xmm1, xmm12
+	xor	r9, rax
+	psrldq	xmm0, 8
+	lea	r8, QWORD PTR [rcx+rcx]
+	mov	QWORD PTR [r11], rsi
+	add	r8d, edi
+	mov	QWORD PTR [r11+8], r14
+	movq	rax, xmm0
+	mov	ecx, -2147483647
+	or	r8, rcx
+	xor	rsi, r10
+	div	r8
+	mov	r10, rdi
+	xor	r14, rbx
+	mov	eax, eax
+	and	r10d, 131056
+	shl	rdx, 32
+	add	r10, r13
+	add	rdx, rax
+	xor	r9, QWORD PTR [r10]
+	mov	r11, QWORD PTR [r10+8]
+	lea	r8, QWORD PTR [rdx+rdi]
+	mov	rax, r8
+	movq	xmm15, rdx
+	shr	rax, 12
+	movq	xmm0, rax
+	paddq	xmm0, xmm11
+	sqrtsd	xmm1, xmm0
+	movq	rdx, xmm1
+	mov	rax, rdx
+	shr	rax, 20
+	shr	rdx, 19
+	mov	rcx, rdx
+	sub	rcx, rax
+	sub	rcx, QWORD PTR [rsp+16]
+	movdqa	xmm9, xmm7
+	movdqa	xmm7, xmm6
+	add	rax, QWORD PTR [rsp+216]
+	imul	rcx, rax
+	mov	rax, r9
+	sub	rcx, r8
+	mov	rcx, rdi
+	adc	rdx, 0
+	xor	rcx, 32
+	and	ecx, 131056
+	mov	QWORD PTR [rsp+32], rdx
+	movdqu	xmm1, XMMWORD PTR [rcx+r13]
+	mul	rdi
+	paddq	xmm1, xmm5
+	mov	r8, rax
+	xor	r8, QWORD PTR [rcx+r13+8]
+	add	r15, r8
+	movq	xmm0, rax
+	movq	xmm2, rdx
+	xor	rdx, QWORD PTR [rcx+r13]
+	mov	rax, rdi
+	xor	rdi, 16
+	punpcklqdq xmm2, xmm0
+	xor	rax, 48
+	and	edi, 131056
+	and	eax, 131056
+	add	rbp, rdx
+	pxor	xmm2, XMMWORD PTR [rdi+r13]
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	paddq	xmm2, xmm10
+	movdqu	XMMWORD PTR [rdi+r13], xmm2
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+r13], xmm0
+	movdqa	xmm10, xmm8
+	movdqu	XMMWORD PTR [rax+r13], xmm1
+	movdqa	xmm8, xmm4
+	mov	QWORD PTR [r10], rbp
+	xor	rbp, r9
+	mov	QWORD PTR [r10+8], r15
+	xor	r15, r11
+	sub	QWORD PTR [rsp+8], 1
+	jne	upx2_main_loop
+
+	ldmxcsr DWORD PTR [rsp+24]
+
+	movaps	xmm13, XMMWORD PTR [rsp+80]
+	lea	r11, QWORD PTR [rsp+232]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+64]
+	movaps	xmm15, XMMWORD PTR [rsp+48]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
--- a/src/crypto/cn/asm/cn_main_loop.S
+++ b/src/crypto/cn/asm/cn_main_loop.S
@ -17,6 +17,7 @@
 .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
 .global FN_PREFIX(cnv2_rwz_mainloop_asm)
 .global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
+.global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm)

 ALIGN(64)
 FN_PREFIX(cnv2_mainloop_ivybridge_asm):
@ -72,6 +73,15 @@ FN_PREFIX(cnv2_rwz_double_mainloop_asm):
 	ret 0
 	mov eax, 3735929054

+ALIGN(64)
+FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm): /* entry point for the cn/upx2 double-hash main loop; the body comes from the include below */
+	sub rsp, 48 /* released by the matching add rsp, 48 after the included body */
+	mov rcx, rdi /* the included body loads both ctx pointers through rcx ([rcx] and [rcx+8]) */
+	#include "cn2/cnv2_upx_double_mainloop_zen3.inc"
+	add rsp, 48
+	ret 0
+	mov eax, 3735929054 /* placed after ret, so never executed; 3735929054 == 0xDEADC0DE -- NOTE(review): presumably an end-of-routine marker, confirm */
+
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
--- a/src/crypto/cn/asm/cn_main_loop.asm
+++ b/src/crypto/cn/asm/cn_main_loop.asm
@ -48,5 +48,12 @@ cnv2_rwz_double_mainloop_asm PROC
 	mov eax, 3735929054
 cnv2_rwz_double_mainloop_asm ENDP

+ALIGN(64)
+cnv2_upx_double_mainloop_zen3_asm PROC ; entry point for the cn/upx2 double-hash main loop; the body comes from the INCLUDE below
+	INCLUDE cn2/cnv2_upx_double_mainloop_zen3.inc
+	ret 0
+	mov eax, 3735929054 ; placed after ret, so never executed; 3735929054 == 0xDEADC0DE -- NOTE(review): presumably an end-of-routine marker, confirm
+cnv2_upx_double_mainloop_zen3_asm ENDP
+
 _TEXT_CNV2_MAINLOOP ENDS
 END
--- a/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc
@ -0,0 +1,320 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232
+
+	mov	rdi, QWORD PTR [rcx+8]
+
+	mov	edx, 768
+	mov	rbx, QWORD PTR [rcx]
+	mov	ecx, 256
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	mov	r13, QWORD PTR [rdi+224]
+	movd	xmm0, QWORD PTR [rdi+104]
+	mov	r12, QWORD PTR [rbx+224]
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rax-136], xmm9
+	movaps	XMMWORD PTR [rax-152], xmm10
+	movaps	XMMWORD PTR [rsp+112], xmm11
+	movaps	XMMWORD PTR [rsp+96], xmm12
+	movaps	XMMWORD PTR [rsp+80], xmm13
+	movd	xmm13, QWORD PTR [rbx+96]
+	movaps	XMMWORD PTR [rsp+64], xmm14
+	movd	xmm14, QWORD PTR [rbx+104]
+	movaps	XMMWORD PTR [rsp+48], xmm15
+	movd	xmm15, QWORD PTR [rdi+96]
+	mov	QWORD PTR [rsp], r13
+	movdqa	XMMWORD PTR [rsp+32], xmm0
+
+	stmxcsr DWORD PTR [rsp+24]
+	mov DWORD PTR [rsp+28], 24448
+	ldmxcsr DWORD PTR [rsp+28]
+
+	mov	rcx, QWORD PTR [rbx+56]
+	xorps	xmm12, xmm12
+	xor	rcx, QWORD PTR [rbx+24]
+	mov	rax, QWORD PTR [rbx+48]
+	xor	rax, QWORD PTR [rbx+16]
+	mov	rsi, QWORD PTR [rbx+32]
+	mov	rbp, QWORD PTR [rdi+32]
+	movd	xmm0, rcx
+
+	mov	rcx, QWORD PTR [rbx+88]
+	xor	rcx, QWORD PTR [rbx+72]
+	movd	xmm7, rax
+	mov	rax, QWORD PTR [rbx+80]
+	xor	rax, QWORD PTR [rbx+64]
+	mov	r14, QWORD PTR [rbx+40]
+	mov	r15, QWORD PTR [rdi+40]
+	xor	rsi, QWORD PTR [rbx]
+	xor	rbp, QWORD PTR [rdi]
+	movd	xmm9, rax
+
+	mov	rax, QWORD PTR [rdi+48]
+	xor	rax, QWORD PTR [rdi+16]
+	xor	r14, QWORD PTR [rbx+8]
+	xor	r15, QWORD PTR [rdi+8]
+	movd	xmm8, rax
+	punpcklqdq xmm7, xmm0
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm11, rax
+	punpcklqdq xmm11, xmm11
+
+	mov	rax, QWORD PTR [rdi+80]
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [rdi+56]
+	xor	rcx, QWORD PTR [rdi+24]
+	punpcklqdq xmm9, xmm0
+	mov	QWORD PTR [rsp+8], 16384
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [rdi+88]
+	xor	rcx, QWORD PTR [rdi+72]
+	xor	rax, QWORD PTR [rdi+64]
+	punpcklqdq xmm8, xmm0
+	movd	xmm0, rcx
+	movd	xmm10, rax
+	mov	rax, 4389456576511
+	mov	QWORD PTR [rsp+16], rax
+	mov	rax, -4389456576512
+	mov	QWORD PTR [rsp+216], rax
+	punpcklqdq xmm10, xmm0
+
+	ALIGN(64)
+upx2_main_loop:
+	mov	rdx, rsi
+	mov	r9, rbp
+	and	edx, 131056
+	and	r9d, 131056
+	movdqu	xmm6, XMMWORD PTR [rdx+r12]
+	lea	r8, QWORD PTR [rdx+r12]
+	movdqu	xmm4, XMMWORD PTR [r9+r13]
+	lea	r10, QWORD PTR [r9+r13]
+	mov	ecx, edx
+	mov	eax, edx
+	xor	rax, 32
+	xor	rcx, 48
+	xor	rdx, 16
+	movd	xmm0, r14
+	movd	xmm3, rsi
+	movd	xmm5, rbp
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, r15
+	movdqu	xmm2, XMMWORD PTR [rax+r12]
+	movdqu	xmm1, XMMWORD PTR [rcx+r12]
+	paddq	xmm2, xmm3
+	punpcklqdq xmm5, xmm0
+	paddq	xmm1, xmm7
+	aesenc	xmm6, xmm3
+	aesenc	xmm4, xmm5
+	movdqa	xmm0, xmm9
+	movd	rdi, xmm4
+	paddq	xmm0, XMMWORD PTR [rdx+r12]
+	movdqu	XMMWORD PTR [rdx+r12], xmm0
+	xor	edx, edx
+	movdqu	XMMWORD PTR [rax+r12], xmm1
+	movdqa	xmm0, xmm6
+	movdqu	XMMWORD PTR [rcx+r12], xmm2
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov	ecx, r9d
+	xor	rcx, 48
+	mov	eax, r9d
+	xor	rax, 32
+	xor	r9, 16
+	movdqa	xmm0, xmm10
+	movdqu	xmm1, XMMWORD PTR [rcx+r13]
+	movdqu	xmm2, XMMWORD PTR [rax+r13]
+	paddq	xmm1, xmm8
+	paddq	xmm0, XMMWORD PTR [r9+r13]
+	paddq	xmm2, xmm5
+	movdqu	XMMWORD PTR [r9+r13], xmm0
+	movd	r9, xmm6
+	movdqu	XMMWORD PTR [rax+r13], xmm1
+	movdqa	xmm0, xmm4
+	movdqu	XMMWORD PTR [rcx+r13], xmm2
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [r10], xmm0
+	movd	rcx, xmm14
+	mov	rax, rcx
+	movd	r10, xmm13
+	shl	rax, 32
+	movdqa	xmm0, xmm6
+	xor	r10, rax
+	psrldq	xmm0, 8
+	lea	r8, QWORD PTR [rcx+rcx]
+	movd	rax, xmm0
+	add	r8d, r9d
+	mov	ecx, -2147483647
+	or	r8, rcx
+	mov	r11, r9
+	div	r8
+	and	r11d, 131056
+	movaps	xmm1, xmm12
+	mov	eax, eax
+	add	r11, r12
+	shl	rdx, 32
+	add	rdx, rax
+	xor	r10, QWORD PTR [r11]
+	mov	rbx, QWORD PTR [r11+8]
+	lea	r8, QWORD PTR [rdx+r9]
+	movd	xmm13, rdx
+	mov	rax, r8
+	shr	rax, 12
+	movd	xmm0, rax
+	paddq	xmm0, xmm11
+	sqrtsd	xmm1, xmm0
+	movd	rdx, xmm1
+	mov	rax, rdx
+	shr	rdx, 19
+	shr	rax, 20
+	mov	rcx, rdx
+	sub	rcx, rax
+	add	rax, QWORD PTR [rsp+216]
+	sub	rcx, QWORD PTR [rsp+16]
+	mov	r13, QWORD PTR [rsp]
+	imul	rcx, rax
+	mov	rax, r10
+	sub	rcx, r8
+	mov	rcx, r9
+	adc	rdx, 0
+	xor	rcx, 32
+	and	ecx, 131056
+	movd	xmm14, rdx
+	movdqu	xmm1, XMMWORD PTR [rcx+r12]
+	mul	r9
+	paddq	xmm1, xmm3
+	mov	r8, rax
+	xor	r8, QWORD PTR [rcx+r12+8]
+	add	r14, r8
+	movd	xmm0, rax
+	movd	xmm2, rdx
+	xor	rdx, QWORD PTR [rcx+r12]
+	mov	rax, r9
+	xor	rax, 48
+	punpcklqdq xmm2, xmm0
+	and	eax, 131056
+	add	rsi, rdx
+	xor	r9, 16
+	xor	edx, edx
+	and	r9d, 131056
+	movdqu	xmm0, XMMWORD PTR [rax+r12]
+	paddq	xmm0, xmm7
+	pxor	xmm2, XMMWORD PTR [r9+r12]
+	paddq	xmm2, xmm9
+	movdqu	XMMWORD PTR [r9+r12], xmm2
+	movd	r9, xmm15
+	movdqu	XMMWORD PTR [rcx+r12], xmm0
+	movdqa	xmm0, xmm4
+	mov	rcx, QWORD PTR [rsp+32]
+	movdqu	XMMWORD PTR [rax+r12], xmm1
+	mov	rax, rcx
+	shl	rax, 32
+	movaps	xmm1, xmm12
+	xor	r9, rax
+	psrldq	xmm0, 8
+	lea	r8, QWORD PTR [rcx+rcx]
+	mov	QWORD PTR [r11], rsi
+	add	r8d, edi
+	mov	QWORD PTR [r11+8], r14
+	movd	rax, xmm0
+	mov	ecx, -2147483647
+	or	r8, rcx
+	xor	rsi, r10
+	div	r8
+	mov	r10, rdi
+	xor	r14, rbx
+	mov	eax, eax
+	and	r10d, 131056
+	shl	rdx, 32
+	add	r10, r13
+	add	rdx, rax
+	xor	r9, QWORD PTR [r10]
+	mov	r11, QWORD PTR [r10+8]
+	lea	r8, QWORD PTR [rdx+rdi]
+	mov	rax, r8
+	movd	xmm15, rdx
+	shr	rax, 12
+	movd	xmm0, rax
+	paddq	xmm0, xmm11
+	sqrtsd	xmm1, xmm0
+	movd	rdx, xmm1
+	mov	rax, rdx
+	shr	rax, 20
+	shr	rdx, 19
+	mov	rcx, rdx
+	sub	rcx, rax
+	sub	rcx, QWORD PTR [rsp+16]
+	movdqa	xmm9, xmm7
+	movdqa	xmm7, xmm6
+	add	rax, QWORD PTR [rsp+216]
+	imul	rcx, rax
+	mov	rax, r9
+	sub	rcx, r8
+	mov	rcx, rdi
+	adc	rdx, 0
+	xor	rcx, 32
+	and	ecx, 131056
+	mov	QWORD PTR [rsp+32], rdx
+	movdqu	xmm1, XMMWORD PTR [rcx+r13]
+	mul	rdi
+	paddq	xmm1, xmm5
+	mov	r8, rax
+	xor	r8, QWORD PTR [rcx+r13+8]
+	add	r15, r8
+	movd	xmm0, rax
+	movd	xmm2, rdx
+	xor	rdx, QWORD PTR [rcx+r13]
+	mov	rax, rdi
+	xor	rdi, 16
+	punpcklqdq xmm2, xmm0
+	xor	rax, 48
+	and	edi, 131056
+	and	eax, 131056
+	add	rbp, rdx
+	pxor	xmm2, XMMWORD PTR [rdi+r13]
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	paddq	xmm2, xmm10
+	movdqu	XMMWORD PTR [rdi+r13], xmm2
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+r13], xmm0
+	movdqa	xmm10, xmm8
+	movdqu	XMMWORD PTR [rax+r13], xmm1
+	movdqa	xmm8, xmm4
+	mov	QWORD PTR [r10], rbp
+	xor	rbp, r9
+	mov	QWORD PTR [r10+8], r15
+	xor	r15, r11
+	sub	QWORD PTR [rsp+8], 1
+	jne	upx2_main_loop
+
+	ldmxcsr DWORD PTR [rsp+24]
+
+	movaps	xmm13, XMMWORD PTR [rsp+80]
+	lea	r11, QWORD PTR [rsp+232]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+64]
+	movaps	xmm15, XMMWORD PTR [rsp+48]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
--- a/src/crypto/cn/asm/win64/cn_main_loop.S
+++ b/src/crypto/cn/asm/win64/cn_main_loop.S
@ -7,6 +7,7 @@
 .global cnv2_double_mainloop_sandybridge_asm
 .global cnv2_rwz_mainloop_asm
 .global cnv2_rwz_double_mainloop_asm
+.global cnv2_upx_double_mainloop_zen3_asm

 ALIGN(64)
 cnv2_mainloop_ivybridge_asm:
@ -43,3 +44,9 @@ cnv2_rwz_double_mainloop_asm:
 	#include "cn2/cnv2_rwz_double_main_loop.inc"
 	ret 0
 	mov eax, 3735929054
+
+ALIGN(64)
+cnv2_upx_double_mainloop_zen3_asm: /* entry point for the cn/upx2 double-hash main loop; the included body reads the ctx array through rcx, so no register shuffle is done here */
+	#include "cn2/cnv2_upx_double_mainloop_zen3.inc"
+	ret 0
+	mov eax, 3735929054 /* placed after ret, so never executed; 3735929054 == 0xDEADC0DE -- NOTE(review): presumably an end-of-routine marker, confirm */
--- a/src/crypto/cn/asm/win64/cn_main_loop.asm
+++ b/src/crypto/cn/asm/win64/cn_main_loop.asm
@ -48,5 +48,12 @@ cnv2_rwz_double_mainloop_asm PROC
 	mov eax, 3735929054
 cnv2_rwz_double_mainloop_asm ENDP

+ALIGN(64)
+cnv2_upx_double_mainloop_zen3_asm PROC ; entry point for the cn/upx2 double-hash main loop; the body comes from the INCLUDE below
+	INCLUDE cn2/cnv2_upx_double_mainloop_zen3.inc
+	ret 0
+	mov eax, 3735929054 ; placed after ret, so never executed; 3735929054 == 0xDEADC0DE -- NOTE(review): presumably an end-of-routine marker, confirm
+cnv2_upx_double_mainloop_zen3_asm ENDP
+
 _TEXT_CNV2_MAINLOOP ENDS
 END
--- a/src/crypto/common/HugePagesInfo.h
+++ b/src/crypto/common/HugePagesInfo.h
@ -41,7 +41,7 @@ public:
    size_t size         = 0;

    inline bool isFullyAllocated() const { return allocated == total; }
-    inline double percent() const        { return allocated == 0 ? 0.0 : static_cast<double>(allocated) / total * 100.0; }
+    inline double percent() const        { return total == 0 ? 0.0 : static_cast<double>(allocated) / total * 100.0; } // allocated as a percentage of total; yields 0.0 when total is 0 so the division never sees a zero denominator
    inline void reset()                  { allocated = 0; total = 0; size = 0; }

    inline HugePagesInfo &operator+=(const HugePagesInfo &other)
--- a/src/crypto/randomx/asm/program_imul_rcp_store.inc
+++ b/src/crypto/randomx/asm/program_imul_rcp_store.inc
@ -0,0 +1,17 @@
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
+	add rsp, 128
--- a/src/crypto/randomx/jit_compiler_x86.cpp
+++ b/src/crypto/randomx/jit_compiler_x86.cpp
@ -428,7 +428,10 @@ namespace randomx {
        xmrig::RxFix::setMainLoopBounds(mainLoopBounds);
 #		endif

-		memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
+		imul_rcp_storage = code + (ADDR(randomx_program_imul_rcp_store) - codePrologue) + 2;
+		imul_rcp_storage_used = 0;
+
+		memcpy(imul_rcp_storage - 34, &pcfg.eMask, sizeof(pcfg.eMask));
 		codePos = codePosFirst;
 		prevCFROUND = 0;

@ -1012,13 +1015,24 @@ namespace randomx {
 		
 		uint64_t divisor = instr.getImm32();
 		if (!isZeroOrPowerOf2(divisor)) {
-			*(uint32_t*)(p + pos) = 0xb848;
-			pos += 2;
-
-			emit64(randomx_reciprocal_fast(divisor), p, pos);
-
 			const uint32_t dst = instr.dst % RegistersCount;
-			emit32(0xc0af0f4c + (dst << 27), p, pos);
+
+			const uint64_t reciprocal = randomx_reciprocal_fast(divisor);
+			if (imul_rcp_storage_used < 16) {
+				*(uint64_t*)(imul_rcp_storage) = reciprocal;
+				*(uint64_t*)(p + pos) = 0x2444AF0F4Cull + (dst << 27) + (static_cast<uint64_t>(248 - imul_rcp_storage_used * 8) << 40);
+				++imul_rcp_storage_used;
+				imul_rcp_storage += 11;
+				pos += 6;
+			}
+			else {
+				*(uint32_t*)(p + pos) = 0xb848;
+				pos += 2;
+
+				emit64(reciprocal, p, pos);
+
+				emit32(0xc0af0f4c + (dst << 27), p, pos);
+			}

 			registerUsage[dst] = pos;
 		}
--- a/src/crypto/randomx/jit_compiler_x86.hpp
+++ b/src/crypto/randomx/jit_compiler_x86.hpp
@ -104,6 +104,9 @@ namespace randomx {
 		uint8_t* allocatedCode = nullptr;
 		size_t allocatedSize = 0;

+		uint8_t* imul_rcp_storage = nullptr; // write cursor inside the generated code: starts 2 bytes past randomx_program_imul_rcp_store and advances by 11 for every reciprocal stored there
+		uint32_t imul_rcp_storage_used = 0; // number of reciprocals stored so far; reset to 0 together with imul_rcp_storage, and once it reaches 16 the emitter embeds the reciprocal inline instead
+
 		void generateProgramPrologue(Program&, ProgramConfiguration&);
 		void generateProgramEpilogue(Program&, ProgramConfiguration&);
 		template<bool rax>
--- a/src/crypto/randomx/jit_compiler_x86_static.S
+++ b/src/crypto/randomx/jit_compiler_x86_static.S
@ -41,6 +41,7 @@
 .global DECL(randomx_prefetch_scratchpad_end)
 .global DECL(randomx_program_prologue)
 .global DECL(randomx_program_prologue_first_load)
+.global DECL(randomx_program_imul_rcp_store)
 .global DECL(randomx_program_loop_begin)
 .global DECL(randomx_program_loop_load)
 .global DECL(randomx_program_loop_load_xop)
@ -106,11 +107,15 @@ DECL(randomx_program_prologue_first_load):
 	nop
 	nop
 	nop
-	jmp DECL(randomx_program_loop_begin)
+	jmp DECL(randomx_program_imul_rcp_store)

 .balign 64
 	#include "asm/program_xmm_constants.inc"

+DECL(randomx_program_imul_rcp_store): /* reached by the jmp at the end of randomx_program_prologue_first_load; the JIT writes IMUL_RCP reciprocals into the 8-byte immediates of the included bytes (first one at this label + 2, then every 11 bytes) */
+	#include "asm/program_imul_rcp_store.inc"
+	jmp DECL(randomx_program_loop_begin)
+
 .balign 64
 DECL(randomx_program_loop_begin):
 	nop
--- a/src/crypto/randomx/jit_compiler_x86_static.asm
+++ b/src/crypto/randomx/jit_compiler_x86_static.asm
@ -32,6 +32,7 @@ PUBLIC randomx_prefetch_scratchpad
 PUBLIC randomx_prefetch_scratchpad_end
 PUBLIC randomx_program_prologue
 PUBLIC randomx_program_prologue_first_load
+PUBLIC randomx_program_imul_rcp_store
 PUBLIC randomx_program_loop_begin
 PUBLIC randomx_program_loop_load
 PUBLIC randomx_program_loop_load_xop
@ -94,12 +95,17 @@ randomx_program_prologue_first_load PROC
 	nop
 	nop
 	nop
-	jmp randomx_program_loop_begin
+	jmp randomx_program_imul_rcp_store
 randomx_program_prologue_first_load ENDP

 ALIGN 64
 	include asm/program_xmm_constants.inc

+randomx_program_imul_rcp_store PROC ; reached by the jmp at the end of randomx_program_prologue_first_load; the JIT writes IMUL_RCP reciprocals into the 8-byte immediates of the included bytes (first one at this label + 2, then every 11 bytes)
+	include asm/program_imul_rcp_store.inc
+	jmp randomx_program_loop_begin
+randomx_program_imul_rcp_store ENDP
+
 ALIGN 64
 randomx_program_loop_begin PROC
 	nop
--- a/src/crypto/randomx/jit_compiler_x86_static.hpp
+++ b/src/crypto/randomx/jit_compiler_x86_static.hpp
@ -33,6 +33,7 @@ extern "C" {
 	void randomx_prefetch_scratchpad_end();
 	void randomx_program_prologue();
 	void randomx_program_prologue_first_load();
+	void randomx_program_imul_rcp_store();
 	void randomx_program_loop_begin();
 	void randomx_program_loop_load();
 	void randomx_program_loop_load_xop();
--- a/src/version.h
+++ b/src/version.h
@ -28,15 +28,15 @@
 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig miner"
-#define APP_VERSION   "6.11.2"
+#define APP_VERSION   "6.12.0-dev"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2021 xmrig.com"
 #define APP_KIND      "miner"

 #define APP_VER_MAJOR  6
-#define APP_VER_MINOR  11
-#define APP_VER_PATCH  2
+#define APP_VER_MINOR  12
+#define APP_VER_PATCH  0

 #ifdef _MSC_VER
 #   if (_MSC_VER >= 1920)