RandomX: AES improvements

- Slightly faster hardware AES code when compiled with MSVC
- More reliable software AES benchmark
This commit is contained in:
SChernykh 2020-09-21 17:51:08 +02:00
parent db920e8006
commit 891a46382e
2 changed files with 44 additions and 49 deletions

View file

@ -244,38 +244,29 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
//process 64 bytes at a time in 4 lanes //process 64 bytes at a time in 4 lanes
while (scratchpadPtr < scratchpadEnd) { while (scratchpadPtr < scratchpadEnd) {
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0)); #define HASH_STATE(k) \
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1)); hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 0)); \
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2)); hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 1)); \
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3)); hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2)); \
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3));
fill_state0 = aesdec<softAes>(fill_state0, key0); #define FILL_STATE(k) \
fill_state1 = aesenc<softAes>(fill_state1, key1); fill_state0 = aesdec<softAes>(fill_state0, key0); \
fill_state2 = aesdec<softAes>(fill_state2, key2); fill_state1 = aesenc<softAes>(fill_state1, key1); \
fill_state3 = aesenc<softAes>(fill_state3, key3); fill_state2 = aesdec<softAes>(fill_state2, key2); \
fill_state3 = aesenc<softAes>(fill_state3, key3); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 0, fill_state0); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 1, fill_state1); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0); HASH_STATE(0);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1); HASH_STATE(1);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3); FILL_STATE(0);
FILL_STATE(1);
rx_prefetch_t0(prefetchPtr); rx_prefetch_t0(prefetchPtr);
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 4));
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 5));
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 6));
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 7));
fill_state0 = aesdec<softAes>(fill_state0, key0);
fill_state1 = aesenc<softAes>(fill_state1, key1);
fill_state2 = aesdec<softAes>(fill_state2, key2);
fill_state3 = aesenc<softAes>(fill_state3, key3);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 4, fill_state0);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 5, fill_state1);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 6, fill_state2);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 7, fill_state3);
rx_prefetch_t0(prefetchPtr + 64); rx_prefetch_t0(prefetchPtr + 64);
scratchpadPtr += 128; scratchpadPtr += 128;

View file

@ -131,10 +131,10 @@ uint32_t GetSoftAESImpl()
void SelectSoftAESImpl() void SelectSoftAESImpl()
{ {
constexpr int test_length_ms = 100; constexpr int test_length_ms = 100;
double speed[2]; double speed[2] = {};
for (int i = 0; i < 2; ++i) for (int run = 0; run < 3; ++run) {
{ for (int i = 0; i < 2; ++i) {
std::vector<uint8_t> scratchpad(10 * 1024); std::vector<uint8_t> scratchpad(10 * 1024);
uint8_t hash[64] = {}; uint8_t hash[64] = {};
uint8_t state[64] = {}; uint8_t state[64] = {};
@ -155,7 +155,11 @@ void SelectSoftAESImpl()
t2 = xmrig::Chrono::highResolutionMSecs(); t2 = xmrig::Chrono::highResolutionMSecs();
} while (t2 - t1 < test_length_ms); } while (t2 - t1 < test_length_ms);
speed[i] = count * 1e3 / (t2 - t1); const double x = count * 1e3 / (t2 - t1);
if (x > speed[i]) {
speed[i] = x;
}
}
} }
softAESImpl = (speed[0] > speed[1]) ? 1 : 2; softAESImpl = (speed[0] > speed[1]) ? 1 : 2;