From 5724d8beb6dea68a0276ee3ee505e7ce23d96d84 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 26 Jun 2020 12:31:26 +0200 Subject: [PATCH] KawPow: optimized CPU share verification - 2 times faster CPU share verification (11 -> 5 ms) - 1.5 times faster light cache initialization --- src/3rdparty/libethash/ethash_internal.c | 81 +++++++++++++++--------- src/3rdparty/libethash/ethash_internal.h | 7 ++ src/crypto/kawpow/KPCache.cpp | 6 +- src/crypto/kawpow/KPHash.cpp | 5 +- 4 files changed, 61 insertions(+), 38 deletions(-) diff --git a/src/3rdparty/libethash/ethash_internal.c b/src/3rdparty/libethash/ethash_internal.c index 7c7681e5..6f9aac47 100644 --- a/src/3rdparty/libethash/ethash_internal.c +++ b/src/3rdparty/libethash/ethash_internal.c @@ -33,6 +33,18 @@ #include "data_sizes.h" #include "base/crypto/sha3.h" +#if defined(_M_X64) || defined(__x86_64__) || defined(__SSE2__) + #ifdef __GNUC__ + #include <x86intrin.h> + #else + #include <intrin.h> + #endif + + #define kp_prefetch(x) _mm_prefetch((x), _MM_HINT_T0); +#else + #define kp_prefetch(x) +#endif + #define SHA3_256(a, b, c) sha3_HashBuffer(256, SHA3_FLAGS_KECCAK, b, c, a, 32) #define SHA3_512(a, b, c) sha3_HashBuffer(512, SHA3_FLAGS_KECCAK, b, c, a, 64) @@ -157,46 +169,53 @@ void ethash_calculate_dag_item_opt( memcpy(ret, init, sizeof(node)); ret->words[0] ^= node_index; SHA3_512(ret->bytes, ret->bytes, sizeof(node)); -#if defined(_M_X64) && ENABLE_SSE - __m128i const fnv_prime = _mm_set1_epi32(FNV_PRIME); - __m128i xmm0 = ret->xmm[0]; - __m128i xmm1 = ret->xmm[1]; - __m128i xmm2 = ret->xmm[2]; - __m128i xmm3 = ret->xmm[3]; -#endif for (uint32_t i = 0; i != num_parents; ++i) { uint32_t parent_index = fast_mod(fnv_hash(node_index ^ i, ret->words[i % NODE_WORDS]), light->num_parent_nodes, light->reciprocal, light->increment, light->shift); node const* parent = &cache_nodes[parent_index]; - -#if defined(_M_X64) && ENABLE_SSE - { - xmm0 = _mm_mullo_epi32(xmm0, fnv_prime); - xmm1 = _mm_mullo_epi32(xmm1, fnv_prime); - xmm2 = 
_mm_mullo_epi32(xmm2, fnv_prime); - xmm3 = _mm_mullo_epi32(xmm3, fnv_prime); - xmm0 = _mm_xor_si128(xmm0, parent->xmm[0]); - xmm1 = _mm_xor_si128(xmm1, parent->xmm[1]); - xmm2 = _mm_xor_si128(xmm2, parent->xmm[2]); - xmm3 = _mm_xor_si128(xmm3, parent->xmm[3]); - - // have to write to ret as values are used to compute index - ret->xmm[0] = xmm0; - ret->xmm[1] = xmm1; - ret->xmm[2] = xmm2; - ret->xmm[3] = xmm3; + for (unsigned w = 0; w != NODE_WORDS; ++w) { + ret->words[w] = fnv_hash(ret->words[w], parent->words[w]); } -#else - { - for (unsigned w = 0; w != NODE_WORDS; ++w) { - ret->words[w] = fnv_hash(ret->words[w], parent->words[w]); - } - } -#endif } SHA3_512(ret->bytes, ret->bytes, sizeof(node)); } +void ethash_calculate_dag_item4_opt( + node* ret, + uint32_t node_index, + uint32_t num_parents, + ethash_light_t const light +) +{ + node const* cache_nodes = (node const*)light->cache; + + for (size_t i = 0; i < 4; ++i) { + node const* init = &cache_nodes[fast_mod(node_index + i, light->num_parent_nodes, light->reciprocal, light->increment, light->shift)]; + memcpy(ret + i, init, sizeof(node)); + ret[i].words[0] ^= node_index + i; + SHA3_512(ret[i].bytes, ret[i].bytes, sizeof(node)); + } + + for (uint32_t i = 0; i != num_parents; ++i) { + node* parent[4]; + + for (uint32_t j = 0; j < 4; ++j) { + const uint32_t parent_index = fast_mod(fnv_hash((node_index + j) ^ i, ret[j].words[i % NODE_WORDS]), light->num_parent_nodes, light->reciprocal, light->increment, light->shift); + parent[j] = &cache_nodes[parent_index]; + kp_prefetch(parent[j]); + } + + for (unsigned w = 0; w != NODE_WORDS; ++w) ret[0].words[w] = fnv_hash(ret[0].words[w], parent[0]->words[w]); + for (unsigned w = 0; w != NODE_WORDS; ++w) ret[1].words[w] = fnv_hash(ret[1].words[w], parent[1]->words[w]); + for (unsigned w = 0; w != NODE_WORDS; ++w) ret[2].words[w] = fnv_hash(ret[2].words[w], parent[2]->words[w]); + for (unsigned w = 0; w != NODE_WORDS; ++w) ret[3].words[w] = fnv_hash(ret[3].words[w], 
parent[3]->words[w]); + } + + for (size_t i = 0; i < 4; ++i) { + SHA3_512(ret[i].bytes, ret[i].bytes, sizeof(node)); + } +} + bool ethash_compute_full_data( void* mem, uint64_t full_size, diff --git a/src/3rdparty/libethash/ethash_internal.h b/src/3rdparty/libethash/ethash_internal.h index de97b795..a1f2df21 100644 --- a/src/3rdparty/libethash/ethash_internal.h +++ b/src/3rdparty/libethash/ethash_internal.h @@ -161,6 +161,13 @@ void ethash_calculate_dag_item_opt( ethash_light_t const cache ); +void ethash_calculate_dag_item4_opt( + node* ret, + uint32_t node_index, + uint32_t num_parents, + ethash_light_t const cache +); + void ethash_quick_hash( ethash_h256_t* return_hash, ethash_h256_t const* header_hash, diff --git a/src/crypto/kawpow/KPCache.cpp b/src/crypto/kawpow/KPCache.cpp index b84f73fd..f4634102 100644 --- a/src/crypto/kawpow/KPCache.cpp +++ b/src/crypto/kawpow/KPCache.cpp @@ -92,9 +92,9 @@ bool KPCache::init(uint32_t epoch) const uint32_t b = (cache_nodes * (i + 1)) / n; threads.emplace_back([this, a, b, cache_nodes, &cache]() { - for (uint32_t j = a; j < b; ++j) { - ethash_calculate_dag_item_opt(((node*)m_DAGCache.data()) + j, j, num_dataset_parents, &cache); - } + uint32_t j = a; + for (; j + 4 <= b; j += 4) ethash_calculate_dag_item4_opt(((node*)m_DAGCache.data()) + j, j, num_dataset_parents, &cache); + for (; j < b; ++j) ethash_calculate_dag_item_opt(((node*)m_DAGCache.data()) + j, j, num_dataset_parents, &cache); }); } diff --git a/src/crypto/kawpow/KPHash.cpp b/src/crypto/kawpow/KPHash.cpp index fe5873a6..de0238b4 100644 --- a/src/crypto/kawpow/KPHash.cpp +++ b/src/crypto/kawpow/KPHash.cpp @@ -285,10 +285,7 @@ void KPHash::calculate(const KPCache& light_cache, uint32_t block_height, const uint32_t item_index = (mix[r % LANES][0] % num_items) * 4; node item[4]; - ethash_calculate_dag_item_opt(item + 0, item_index + 0, KPCache::num_dataset_parents, &cache); - ethash_calculate_dag_item_opt(item + 1, item_index + 1, KPCache::num_dataset_parents, 
&cache); - ethash_calculate_dag_item_opt(item + 2, item_index + 2, KPCache::num_dataset_parents, &cache); - ethash_calculate_dag_item_opt(item + 3, item_index + 3, KPCache::num_dataset_parents, &cache); + ethash_calculate_dag_item4_opt(item, item_index, KPCache::num_dataset_parents, &cache); uint32_t dst_counter = 0; uint32_t src_counter = 0;