Merge pull request #1864 from cohcho/soft_aes_optimization2

soft_aes: fix previous optimization
This commit is contained in:
xmrig 2020-10-05 12:20:41 +07:00 committed by GitHub
commit 116fb3d3f9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 116 additions and 72 deletions

View file

@ -26,8 +26,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <thread>
#include <vector>
#include "crypto/randomx/aes_hash.hpp"
#include "crypto/randomx/soft_aes.h"
#include "crypto/randomx/randomx.h"
#include "base/tools/Chrono.h"
#include "base/tools/Profiler.h"
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
@ -214,7 +219,7 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
template<int softAes>
template<int softAes, int unroll>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
PROFILE_SCOPE(RandomX_AES);
@ -260,7 +265,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3);
switch(softAes) {
switch (softAes) {
case 0:
HASH_STATE(0);
HASH_STATE(1);
@ -277,13 +282,51 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
break;
default:
HASH_STATE(0);
FILL_STATE(0);
rx_prefetch_t0(prefetchPtr);
switch (unroll) {
case 4:
HASH_STATE(0);
FILL_STATE(0);
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
HASH_STATE(1);
FILL_STATE(1);
rx_prefetch_t0(prefetchPtr + 64);
HASH_STATE(2);
FILL_STATE(2);
rx_prefetch_t0(prefetchPtr + 64 * 2);
HASH_STATE(3);
FILL_STATE(3);
rx_prefetch_t0(prefetchPtr + 64 * 3);
scratchpadPtr += 64 * 4;
prefetchPtr += 64 * 4;
break;
case 2:
HASH_STATE(0);
FILL_STATE(0);
rx_prefetch_t0(prefetchPtr);
HASH_STATE(1);
FILL_STATE(1);
rx_prefetch_t0(prefetchPtr + 64);
scratchpadPtr += 64 * 2;
prefetchPtr += 64 * 2;
break;
default:
HASH_STATE(0);
FILL_STATE(0);
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
break;
}
break;
}
}
@ -317,6 +360,53 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
}
template void hashAndFillAes1Rx4<0>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
template void hashAndFillAes1Rx4<1>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
template void hashAndFillAes1Rx4<2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4<2,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4<2,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4<2,4>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
hashAndFillAes1Rx4_impl* softAESImpl = &hashAndFillAes1Rx4<1,1>;
void SelectSoftAESImpl(size_t threadsCount)
{
constexpr int test_length_ms = 100;
const std::vector<hashAndFillAes1Rx4_impl *> impl = {
&hashAndFillAes1Rx4<1,1>,
&hashAndFillAes1Rx4<2,1>,
&hashAndFillAes1Rx4<2,2>,
&hashAndFillAes1Rx4<2,4>,
};
size_t fast_idx = 0;
double fast_speed = 0.0;
for (size_t run = 0; run < 3; ++run) {
for (size_t i = 0; i < impl.size(); ++i) {
const uint64_t t1 = xmrig::Chrono::highResolutionMSecs();
std::vector<uint32_t> count(threadsCount, 0);
std::vector<std::thread> threads;
for (size_t t = 0; t < threadsCount; ++t) {
threads.emplace_back([&, t]() {
std::vector<uint8_t> scratchpad(10 * 1024);
uint8_t hash[64] = {};
uint8_t state[64] = {};
do {
(*impl[i])(scratchpad.data(), scratchpad.size(), hash, state);
++count[t];
} while (xmrig::Chrono::highResolutionMSecs() - t1 < test_length_ms);
});
}
uint32_t total = 0;
for (size_t t = 0; t < threadsCount; ++t) {
threads[t].join();
total += count[t];
}
const uint64_t t2 = xmrig::Chrono::highResolutionMSecs();
const double speed = total * 1e3 / (t2 - t1);
if (speed > fast_speed) {
fast_idx = i;
fast_speed = speed;
}
}
}
softAESImpl = impl[fast_idx];
}

View file

@ -30,6 +30,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cstddef>
typedef void (hashAndFillAes1Rx4_impl)(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
extern hashAndFillAes1Rx4_impl* softAESImpl;
inline hashAndFillAes1Rx4_impl* GetSoftAESImpl()
{
return softAESImpl;
}
void SelectSoftAESImpl(size_t threadsCount);
template<int softAes>
void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
@ -39,5 +50,5 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
template<int softAes>
void fillAes4Rx4(void *state, size_t outputSize, void *buffer);
template<int softAes>
template<int softAes, int unroll>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

View file

@ -28,9 +28,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "crypto/randomx/soft_aes.h"
#include "crypto/randomx/aes_hash.hpp"
#include "base/tools/Chrono.h"
#include <vector>
alignas(64) uint32_t lutEnc0[256];
alignas(64) uint32_t lutEnc1[256];
@ -120,47 +117,3 @@ static struct SAESInitializer
}
}
} aes_initializer;
static uint32_t softAESImpl = 1;
uint32_t GetSoftAESImpl()
{
return softAESImpl;
}
void SelectSoftAESImpl()
{
constexpr int test_length_ms = 100;
double speed[2] = {};
for (int run = 0; run < 3; ++run) {
for (int i = 0; i < 2; ++i) {
std::vector<uint8_t> scratchpad(10 * 1024);
uint8_t hash[64] = {};
uint8_t state[64] = {};
uint64_t t1, t2;
uint32_t count = 0;
t1 = xmrig::Chrono::highResolutionMSecs();
do {
if (i == 0) {
hashAndFillAes1Rx4<1>(scratchpad.data(), scratchpad.size(), hash, state);
}
else {
hashAndFillAes1Rx4<2>(scratchpad.data(), scratchpad.size(), hash, state);
}
++count;
t2 = xmrig::Chrono::highResolutionMSecs();
} while (t2 - t1 < test_length_ms);
const double x = count * 1e3 / (t2 - t1);
if (x > speed[i]) {
speed[i] = x;
}
}
}
softAESImpl = (speed[0] > speed[1]) ? 1 : 2;
}

View file

@ -41,9 +41,6 @@ extern uint32_t lutDec1[256];
extern uint32_t lutDec2[256];
extern uint32_t lutDec3[256];
uint32_t GetSoftAESImpl();
void SelectSoftAESImpl();
template<int soft> rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key);
template<int soft> rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key);

View file

@ -119,15 +119,10 @@ namespace randomx {
template<int softAes>
void VmBase<softAes>::hashAndFill(void* out, uint64_t (&fill_state)[8]) {
if (!softAes) {
hashAndFillAes1Rx4<0>(scratchpad, ScratchpadSize, &reg.a, fill_state);
hashAndFillAes1Rx4<0, 2>(scratchpad, ScratchpadSize, &reg.a, fill_state);
}
else {
if (GetSoftAESImpl() == 1) {
hashAndFillAes1Rx4<1>(scratchpad, ScratchpadSize, &reg.a, fill_state);
}
else {
hashAndFillAes1Rx4<2>(scratchpad, ScratchpadSize, &reg.a, fill_state);
}
(*GetSoftAESImpl())(scratchpad, ScratchpadSize, &reg.a, fill_state);
}
rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));

View file

@ -26,14 +26,12 @@
#include "crypto/rx/Rx.h"
#include "backend/common/Tags.h"
#include "backend/cpu/CpuConfig.h"
#include "backend/cpu/CpuThreads.h"
#include "base/io/log/Log.h"
#include "crypto/rx/RxConfig.h"
#include "crypto/rx/RxQueue.h"
#include "crypto/randomx/randomx.h"
#include "crypto/randomx/soft_aes.h"
#include "crypto/randomx/aes_hash.hpp"
namespace xmrig {
@ -115,7 +113,7 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu
if (!osInitialized) {
setupMainLoopExceptionFrame();
if (!cpu.isHwAES()) {
SelectSoftAESImpl();
SelectSoftAESImpl(cpu.threads().get(seed.algorithm()).count());
}
osInitialized = true;
}