From 5f0f2506e8d04fc79ba4ff917450d50732dce751 Mon Sep 17 00:00:00 2001 From: cohcho Date: Sun, 4 Oct 2020 14:47:58 +0000 Subject: [PATCH] soft_aes: fix previous optimization Previously removed unrolled variant is faster on some CPUs Some CPUs are faster with added unrolled variant The best variant depends on number of threads on some CPUs --- src/crypto/randomx/aes_hash.cpp | 110 ++++++++++++++++++++++--- src/crypto/randomx/aes_hash.hpp | 13 ++- src/crypto/randomx/soft_aes.cpp | 47 ----------- src/crypto/randomx/soft_aes.h | 3 - src/crypto/randomx/virtual_machine.cpp | 9 +- src/crypto/rx/Rx.cpp | 6 +- 6 files changed, 116 insertions(+), 72 deletions(-) diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index eed829602..bed473b7b 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -26,8 +26,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +#include + +#include "crypto/randomx/aes_hash.hpp" #include "crypto/randomx/soft_aes.h" #include "crypto/randomx/randomx.h" +#include "base/tools/Chrono.h" #include "base/tools/Profiler.h" #define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d @@ -214,7 +219,7 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); -template +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { PROFILE_SCOPE(RandomX_AES); @@ -260,7 +265,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \ rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3); - switch(softAes) { + switch (softAes) { case 0: HASH_STATE(0); HASH_STATE(1); @@ -277,13 +282,51 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi break; default: - HASH_STATE(0); - FILL_STATE(0); - rx_prefetch_t0(prefetchPtr); + switch (unroll) { + case 4: + HASH_STATE(0); + FILL_STATE(0); + rx_prefetch_t0(prefetchPtr); - scratchpadPtr += 64; - prefetchPtr += 64; + HASH_STATE(1); + FILL_STATE(1); + rx_prefetch_t0(prefetchPtr + 64); + HASH_STATE(2); + FILL_STATE(2); + rx_prefetch_t0(prefetchPtr + 64 * 2); + + HASH_STATE(3); + FILL_STATE(3); + rx_prefetch_t0(prefetchPtr + 64 * 3); + + scratchpadPtr += 64 * 4; + prefetchPtr += 64 * 4; + break; + + case 2: + HASH_STATE(0); + FILL_STATE(0); + rx_prefetch_t0(prefetchPtr); + + HASH_STATE(1); + FILL_STATE(1); + rx_prefetch_t0(prefetchPtr + 64); + + scratchpadPtr += 64 * 2; + prefetchPtr += 64 * 2; + break; + + default: + HASH_STATE(0); + FILL_STATE(0); + rx_prefetch_t0(prefetchPtr); + + scratchpadPtr += 64; + prefetchPtr += 64; + + break; + } break; } } @@ -317,6 +360,53 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); } -template void hashAndFillAes1Rx4<0>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); -template void hashAndFillAes1Rx4<1>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); -template void hashAndFillAes1Rx4<2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); +template void hashAndFillAes1Rx4<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); +template void hashAndFillAes1Rx4<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); +template void hashAndFillAes1Rx4<2,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); +template void hashAndFillAes1Rx4<2,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); +template void hashAndFillAes1Rx4<2,4>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); + +hashAndFillAes1Rx4_impl* softAESImpl = &hashAndFillAes1Rx4<1,1>; + +void SelectSoftAESImpl(size_t threadsCount) +{ + constexpr int test_length_ms = 100; + const std::vector impl = { + &hashAndFillAes1Rx4<1,1>, + &hashAndFillAes1Rx4<2,1>, + &hashAndFillAes1Rx4<2,2>, + &hashAndFillAes1Rx4<2,4>, + }; + size_t fast_idx = 0; + double fast_speed = 0.0; + for (size_t run = 0; run < 3; ++run) { + for (size_t i = 0; i < impl.size(); ++i) { + const uint64_t t1 = xmrig::Chrono::highResolutionMSecs(); + std::vector count(threadsCount, 0); + std::vector threads; + for (size_t t = 0; t < threadsCount; ++t) { + threads.emplace_back([&, t]() { + std::vector scratchpad(10 * 1024); + uint8_t hash[64] = {}; + uint8_t state[64] = {}; + do { + (*impl[i])(scratchpad.data(), scratchpad.size(), hash, state); + ++count[t]; + } while (xmrig::Chrono::highResolutionMSecs() - t1 < test_length_ms); + }); + } + uint32_t total = 0; + for (size_t t = 0; t < threadsCount; ++t) { + threads[t].join(); + total += count[t]; + } + const uint64_t t2 = xmrig::Chrono::highResolutionMSecs(); + const double speed = total * 1e3 / (t2 - t1); + if (speed > fast_speed) { + fast_idx = i; + fast_speed = speed; + } + } + } + softAESImpl = impl[fast_idx]; +} diff --git a/src/crypto/randomx/aes_hash.hpp b/src/crypto/randomx/aes_hash.hpp index 345ec8d99..b4b57b17c 100644 --- a/src/crypto/randomx/aes_hash.hpp +++ b/src/crypto/randomx/aes_hash.hpp @@ -30,6 +30,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +typedef void (hashAndFillAes1Rx4_impl)(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); + +extern hashAndFillAes1Rx4_impl* softAESImpl; + +inline hashAndFillAes1Rx4_impl* GetSoftAESImpl() +{ + return softAESImpl; +} + +void SelectSoftAESImpl(size_t threadsCount); + template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); @@ -39,5 +50,5 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); -template +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/crypto/randomx/soft_aes.cpp b/src/crypto/randomx/soft_aes.cpp index ad6f9ffe6..04fb7ac0e 100644 --- a/src/crypto/randomx/soft_aes.cpp +++ b/src/crypto/randomx/soft_aes.cpp @@ -28,9 +28,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "crypto/randomx/soft_aes.h" -#include "crypto/randomx/aes_hash.hpp" -#include "base/tools/Chrono.h" -#include alignas(64) uint32_t lutEnc0[256]; alignas(64) uint32_t lutEnc1[256]; @@ -120,47 +117,3 @@ static struct SAESInitializer } } } aes_initializer; - -static uint32_t softAESImpl = 1; - -uint32_t GetSoftAESImpl() -{ - return softAESImpl; -} - -void SelectSoftAESImpl() -{ - constexpr int test_length_ms = 100; - double speed[2] = {}; - - for (int run = 0; run < 3; ++run) { - for (int i = 0; i < 2; ++i) { - std::vector scratchpad(10 * 1024); - uint8_t hash[64] = {}; - uint8_t state[64] = {}; - - uint64_t t1, t2; - - uint32_t count = 0; - t1 = xmrig::Chrono::highResolutionMSecs(); - do { - if (i == 0) { - hashAndFillAes1Rx4<1>(scratchpad.data(), scratchpad.size(), hash, state); - } - else { - hashAndFillAes1Rx4<2>(scratchpad.data(), scratchpad.size(), hash, state); - } - ++count; - - t2 = xmrig::Chrono::highResolutionMSecs(); - } while (t2 - t1 < test_length_ms); - - const double x = count * 1e3 / (t2 - t1); - if (x > speed[i]) { - speed[i] = x; - } - } - } - - softAESImpl = (speed[0] > speed[1]) ? 1 : 2; -} diff --git a/src/crypto/randomx/soft_aes.h b/src/crypto/randomx/soft_aes.h index d03a1a279..2b7d5a1e9 100644 --- a/src/crypto/randomx/soft_aes.h +++ b/src/crypto/randomx/soft_aes.h @@ -41,9 +41,6 @@ extern uint32_t lutDec1[256]; extern uint32_t lutDec2[256]; extern uint32_t lutDec3[256]; -uint32_t GetSoftAESImpl(); -void SelectSoftAESImpl(); - template rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key); template rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key); diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp index 3a2d675c4..4a6990b23 100644 --- a/src/crypto/randomx/virtual_machine.cpp +++ b/src/crypto/randomx/virtual_machine.cpp @@ -119,15 +119,10 @@ namespace randomx { template void VmBase::hashAndFill(void* out, uint64_t (&fill_state)[8]) { if (!softAes) { - hashAndFillAes1Rx4<0>(scratchpad, ScratchpadSize, ®.a, fill_state); + hashAndFillAes1Rx4<0, 2>(scratchpad, ScratchpadSize, ®.a, fill_state); } else { - if (GetSoftAESImpl() == 1) { - hashAndFillAes1Rx4<1>(scratchpad, ScratchpadSize, ®.a, fill_state); - } - else { - hashAndFillAes1Rx4<2>(scratchpad, ScratchpadSize, ®.a, fill_state); - } + (*GetSoftAESImpl())(scratchpad, ScratchpadSize, ®.a, fill_state); } rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, ®, sizeof(RegisterFile)); diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp index 79354d7e8..c8ecc2b39 100644 --- a/src/crypto/rx/Rx.cpp +++ b/src/crypto/rx/Rx.cpp @@ -26,14 +26,12 @@ #include "crypto/rx/Rx.h" -#include "backend/common/Tags.h" #include "backend/cpu/CpuConfig.h" #include "backend/cpu/CpuThreads.h" -#include "base/io/log/Log.h" #include "crypto/rx/RxConfig.h" #include "crypto/rx/RxQueue.h" #include "crypto/randomx/randomx.h" -#include "crypto/randomx/soft_aes.h" +#include "crypto/randomx/aes_hash.hpp" namespace xmrig { @@ -115,7 +113,7 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu if (!osInitialized) { setupMainLoopExceptionFrame(); if (!cpu.isHwAES()) { - SelectSoftAESImpl(); + SelectSoftAESImpl(cpu.threads().get(seed.algorithm()).count()); } osInitialized = true; }