diff --git a/src/backend/cuda/CudaConfig_gen.h b/src/backend/cuda/CudaConfig_gen.h index d92b05260..73c77e48d 100644 --- a/src/backend/cuda/CudaConfig_gen.h +++ b/src/backend/cuda/CudaConfig_gen.h @@ -139,7 +139,14 @@ size_t inline generate(Threads &threads, const template<> size_t inline generate(Threads &threads, const std::vector &devices) { - return generate(Algorithm::kASTROBWT, threads, Algorithm::ASTROBWT_DERO, devices); + size_t count = 0; + + if (!threads.isExist(Algorithm::ASTROBWT_DERO_2)) { + count += threads.move(Algorithm::kASTROBWT_DERO_2, CudaThreads(devices, Algorithm::ASTROBWT_DERO_2)); + } + + count += generate(Algorithm::kASTROBWT, threads, Algorithm::ASTROBWT_DERO, devices); + return count; } #endif diff --git a/src/backend/cuda/runners/CudaAstroBWTRunner.cpp b/src/backend/cuda/runners/CudaAstroBWTRunner.cpp index 43431aac4..6b078e4f8 100644 --- a/src/backend/cuda/runners/CudaAstroBWTRunner.cpp +++ b/src/backend/cuda/runners/CudaAstroBWTRunner.cpp @@ -32,11 +32,17 @@ constexpr uint32_t xmrig::CudaAstroBWTRunner::BWT_DATA_STRIDE; -xmrig::CudaAstroBWTRunner::CudaAstroBWTRunner(size_t index, const CudaLaunchData &data) : - CudaBaseRunner(index, data) +xmrig::CudaAstroBWTRunner::CudaAstroBWTRunner(size_t index, const CudaLaunchData &data) + : CudaBaseRunner(index, data) + , m_algorithm(data.algorithm) { m_intensity = m_data.thread.threads() * m_data.thread.blocks(); m_intensity -= m_intensity % 32; + + // Dero HE has very fast blocks, so we can't use high intensity + if ((m_algorithm == Algorithm::ASTROBWT_DERO_2) && (m_intensity > 4096)) { + m_intensity = 4096; + } } @@ -58,10 +64,14 @@ bool xmrig::CudaAstroBWTRunner::set(const Job &job, uint8_t *blob) size_t xmrig::CudaAstroBWTRunner::roundSize() const { + if (m_algorithm == Algorithm::ASTROBWT_DERO_2) { + return m_intensity; + } + constexpr uint32_t STAGE1_SIZE = 147253; constexpr uint32_t STAGE1_DATA_STRIDE = (STAGE1_SIZE + 256 + 255) & ~255U; - const uint32_t BATCH2_SIZE = m_intensity; + const uint32_t BATCH2_SIZE = static_cast(m_intensity); const uint32_t BWT_ALLOCATION_SIZE = BATCH2_SIZE * BWT_DATA_STRIDE; const uint32_t BATCH1_SIZE = (BWT_ALLOCATION_SIZE / STAGE1_DATA_STRIDE) & ~255U; diff --git a/src/backend/cuda/runners/CudaAstroBWTRunner.h b/src/backend/cuda/runners/CudaAstroBWTRunner.h index 0afee8c1c..d217d65a5 100644 --- a/src/backend/cuda/runners/CudaAstroBWTRunner.h +++ b/src/backend/cuda/runners/CudaAstroBWTRunner.h @@ -27,6 +27,7 @@ #include "backend/cuda/runners/CudaBaseRunner.h" +#include "base/crypto/Algorithm.h" namespace xmrig { @@ -50,6 +51,7 @@ protected: private: size_t m_intensity = 0; + Algorithm m_algorithm; };