mirror of
https://github.com/xmrig/xmrig.git
synced 2025-01-10 21:04:37 +00:00
Merge pull request #1830 from SChernykh/dev
RandomX: added performance profiler (for developers)
This commit is contained in:
commit
fa0bb0e1bf
19 changed files with 390 additions and 481 deletions
|
@ -23,6 +23,7 @@ option(WITH_NVML "Enable NVML (NVIDIA Management Library) support (on
|
||||||
option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (only if OpenCL backend enabled)" ON)
|
option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (only if OpenCL backend enabled)" ON)
|
||||||
option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON)
|
option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON)
|
||||||
option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
|
option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
|
||||||
|
option(WITH_PROFILING "Enable profiling for developers" OFF)
|
||||||
|
|
||||||
option(BUILD_STATIC "Build static binary" OFF)
|
option(BUILD_STATIC "Build static binary" OFF)
|
||||||
option(ARM_TARGET "Force use specific ARM target 8 or 7" 0)
|
option(ARM_TARGET "Force use specific ARM target 8 or 7" 0)
|
||||||
|
|
|
@ -222,3 +222,15 @@ if (WITH_KAWPOW)
|
||||||
src/base/net/stratum/EthStratumClient.cpp
|
src/base/net/stratum/EthStratumClient.cpp
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Developer profiler: only compiled in when WITH_PROFILING is ON.
# Defines XMRIG_FEATURE_PROFILING and adds the profiler sources to the base lists.
if (WITH_PROFILING)
    add_definitions(/DXMRIG_FEATURE_PROFILING)

    list(APPEND HEADERS_BASE src/base/tools/Profiler.h)
    list(APPEND SOURCES_BASE src/base/tools/Profiler.cpp)
endif()
|
||||||
|
|
|
@ -101,3 +101,13 @@ const char *xmrig::Tags::opencl()
|
||||||
return tag;
|
return tag;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef XMRIG_FEATURE_PROFILING
|
||||||
|
const char* xmrig::Tags::profiler()
|
||||||
|
{
|
||||||
|
static const char* tag = CYAN_BG_BOLD(WHITE_BOLD_S " profile ");
|
||||||
|
|
||||||
|
return tag;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
|
@ -53,6 +53,10 @@ public:
|
||||||
# ifdef XMRIG_FEATURE_OPENCL
|
# ifdef XMRIG_FEATURE_OPENCL
|
||||||
static const char *opencl();
|
static const char *opencl();
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
|
# ifdef XMRIG_FEATURE_PROFILING
|
||||||
|
static const char* profiler();
|
||||||
|
# endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
100
src/base/tools/Profiler.cpp
Normal file
100
src/base/tools/Profiler.cpp
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
/* XMRig
|
||||||
|
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||||
|
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include "base/tools/Profiler.h"
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"

#include <algorithm>
#include <chrono>
#include <cstring>
#include <sstream>
#include <string>
#include <thread>
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef XMRIG_FEATURE_PROFILING
|
||||||
|
|
||||||
|
|
||||||
|
ProfileScopeData* ProfileScopeData::s_data[MAX_DATA_COUNT] = {};
|
||||||
|
volatile long ProfileScopeData::s_dataCount = 0;
|
||||||
|
double ProfileScopeData::s_tscSpeed = 0.0;
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef NOINLINE
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#define NOINLINE __attribute__ ((noinline))
|
||||||
|
#elif _MSC_VER
|
||||||
|
#define NOINLINE __declspec(noinline)
|
||||||
|
#else
|
||||||
|
#define NOINLINE
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
static std::string get_thread_id()
|
||||||
|
{
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << std::this_thread::get_id();
|
||||||
|
|
||||||
|
std::string s = ss.str();
|
||||||
|
if (s.length() > ProfileScopeData::MAX_THREAD_ID_LENGTH) {
|
||||||
|
s.resize(ProfileScopeData::MAX_THREAD_ID_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Adds a scope record to the global registry (s_data) and stamps it with the
// id of the calling thread.
//
// data - record to register; expected to be the zero-initialised thread_local
//        object created by PROFILE_SCOPE, owned by the calling thread.
//
// The call is idempotent: a record that already carries a thread id is left
// alone, so a scope that is entered again before its first measurement has
// been accumulated does not produce a duplicate registry entry.
//
// If more than MAX_DATA_COUNT records are registered the extra ones are not
// stored; s_dataCount still counts them, so readers must clamp it.
NOINLINE void ProfileScopeData::Register(ProfileScopeData* data)
{
    // A non-empty thread id marks a record that went through here before.
    if (data->m_threadId[0] != '\0') {
        return;
    }

    // Complete the record BEFORE its pointer becomes reachable through s_data,
    // so a reader never sorts/prints a half-initialised thread id.
    // get_thread_id() returns at most MAX_THREAD_ID_LENGTH characters, so the
    // copy (including the terminator) always fits m_threadId.
    const std::string s = get_thread_id();
    memcpy(data->m_threadId, s.c_str(), s.length() + 1);

    // Atomically reserve a slot index; both intrinsics yield the value the
    // counter had before the increment.
#ifdef _MSC_VER
    const long id = _InterlockedIncrement(&s_dataCount) - 1;
#else
    const long id = __sync_fetch_and_add(&s_dataCount, 1);
#endif

    // NOTE(review): the counter is bumped before the slot is written, so a
    // concurrent reader can briefly observe a counted slot that is still null.
    if (static_cast<unsigned long>(id) < MAX_DATA_COUNT) {
        s_data[id] = data;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Calibrates the time stamp counter: spins for a little over one second,
// compares the elapsed TSC ticks with the elapsed clock time and stores the
// result (ticks per second) in s_tscSpeed. Logs the measured speed in GHz.
//
// The reference clock is std::chrono::steady_clock. high_resolution_clock may
// be an alias of the system (wall) clock, which can be stepped while the loop
// runs; a backwards step would underflow the unsigned elapsed time and end the
// calibration immediately with a meaningless speed.
NOINLINE void ProfileScopeData::Init()
{
    using namespace std::chrono;
    using Clock = steady_clock;

    const Clock::time_point start = Clock::now();
    const uint64_t count1 = ReadTSC();

    for (;;)
    {
        // Sample the clock and the TSC back to back so both cover the same interval.
        const Clock::time_point now = Clock::now();
        const uint64_t count2 = ReadTSC();

        const uint64_t elapsedNs = static_cast<uint64_t>(duration_cast<nanoseconds>(now - start).count());

        // Busy-wait until more than one second has passed.
        if (elapsedNs > 1000000000) {
            s_tscSpeed = (count2 - count1) * 1e9 / elapsedNs;
            LOG_INFO("%s TSC speed = %.3f GHz", xmrig::Tags::profiler(), s_tscSpeed / 1e9);
            return;
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* XMRIG_FEATURE_PROFILING */
|
132
src/base/tools/Profiler.h
Normal file
132
src/base/tools/Profiler.h
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
/* XMRig
|
||||||
|
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||||
|
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef XMRIG_PROFILER_H
|
||||||
|
#define XMRIG_PROFILER_H
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef FORCE_INLINE
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#define FORCE_INLINE __forceinline
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
#elif defined(__clang__)
|
||||||
|
#define FORCE_INLINE __inline__
|
||||||
|
#else
|
||||||
|
#define FORCE_INLINE
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef XMRIG_FEATURE_PROFILING
|
||||||
|
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <type_traits>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#include <intrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Reads the CPU time stamp counter and returns it as one 64-bit value.
// MSVC builds use the __rdtsc() intrinsic; other compilers execute the
// "rdtsc" instruction through inline assembly, which delivers the low half
// in EAX and the high half in EDX.
static FORCE_INLINE uint64_t ReadTSC()
{
#ifdef _MSC_VER
    return __rdtsc();
#else
    uint32_t eax, edx;
    __asm__ __volatile__("rdtsc" : "=a"(eax), "=d"(edx));

    const uint64_t high = static_cast<uint64_t>(edx) << 32;
    return high | eax;
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
// Per-scope, per-thread profiling record plus the global registry of all
// records. Instances are created by PROFILE_SCOPE via aggregate
// initialisation with the scope name only, so every other field starts at
// zero. The order of the non-static members is part of the contract: m_name
// must stay first, and the struct must remain trivial and at most 32 bytes.
struct ProfileScopeData
{
    enum
    {
        MAX_THREAD_ID_LENGTH = 11,  // characters kept from the textual thread id
        MAX_SAMPLE_COUNT = 128,     // NOTE(review): not referenced in the code visible here
        MAX_DATA_COUNT = 1024       // capacity of the s_data registry
    };

    // Measures the TSC frequency and stores it in s_tscSpeed.
    static void Init();

    // Appends a record to s_data and fills in its m_threadId.
    static void Register(ProfileScopeData* data);

    static double s_tscSpeed;                           // TSC ticks per second
    static volatile long s_dataCount;                   // number of registrations; may exceed MAX_DATA_COUNT
    static ProfileScopeData* s_data[MAX_DATA_COUNT];    // registered records

    const char* m_name;         // scope name (stringified PROFILE_SCOPE argument)
    uint64_t m_totalCycles;     // sum of TSC ticks spent inside the scope
    uint32_t m_totalSamples;    // number of completed passes through the scope

    char m_threadId[MAX_THREAD_ID_LENGTH + 1];  // NUL-terminated id of the owning thread
};
|
||||||
|
|
||||||
|
static_assert(std::is_trivial<ProfileScopeData>::value, "ProfileScopeData must be a trivial struct");
|
||||||
|
static_assert(sizeof(ProfileScopeData) <= 32, "ProfileScopeData struct is too big");
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileScope
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
FORCE_INLINE ProfileScope(ProfileScopeData& data)
|
||||||
|
: m_data(data)
|
||||||
|
{
|
||||||
|
if (m_data.m_totalCycles == 0) {
|
||||||
|
ProfileScopeData::Register(&data);
|
||||||
|
}
|
||||||
|
|
||||||
|
m_startCounter = ReadTSC();
|
||||||
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE ~ProfileScope()
|
||||||
|
{
|
||||||
|
m_data.m_totalCycles += ReadTSC() - m_startCounter;
|
||||||
|
++m_data.m_totalSamples;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
ProfileScopeData& m_data;
|
||||||
|
uint64_t m_startCounter;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#define PROFILE_SCOPE(x) static thread_local ProfileScopeData x##_data{#x}; ProfileScope x(x##_data);
|
||||||
|
|
||||||
|
|
||||||
|
#else /* XMRIG_FEATURE_PROFILING */
|
||||||
|
#define PROFILE_SCOPE(x)
|
||||||
|
#endif /* XMRIG_FEATURE_PROFILING */
|
||||||
|
|
||||||
|
|
||||||
|
#include "crypto/randomx/blake2/blake2.h"
|
||||||
|
|
||||||
|
|
||||||
|
struct rx_blake2b_wrapper
|
||||||
|
{
|
||||||
|
FORCE_INLINE static void run(void* out, size_t outlen, const void* in, size_t inlen)
|
||||||
|
{
|
||||||
|
PROFILE_SCOPE(RandomX_Blake2b);
|
||||||
|
rx_blake2b(out, outlen, in, inlen);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* XMRIG_PROFILER_H */
|
|
@ -38,6 +38,7 @@
|
||||||
#include "base/kernel/Platform.h"
|
#include "base/kernel/Platform.h"
|
||||||
#include "base/net/stratum/Job.h"
|
#include "base/net/stratum/Job.h"
|
||||||
#include "base/tools/Object.h"
|
#include "base/tools/Object.h"
|
||||||
|
#include "base/tools/Profiler.h"
|
||||||
#include "base/tools/Timer.h"
|
#include "base/tools/Timer.h"
|
||||||
#include "core/config/Config.h"
|
#include "core/config/Config.h"
|
||||||
#include "core/Controller.h"
|
#include "core/Controller.h"
|
||||||
|
@ -267,6 +268,44 @@ public:
|
||||||
h = "MH/s";
|
h = "MH/s";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#   ifdef XMRIG_FEATURE_PROFILING
        // Profiler report: one line per registered scope, grouped by thread and
        // ordered by accumulated cycles within each thread.
        ProfileScopeData* data[ProfileScopeData::MAX_DATA_COUNT];

        // s_dataCount counts every registration attempt, so clamp it to the table size.
        const uint32_t registered = std::min<uint32_t>(ProfileScopeData::s_dataCount, ProfileScopeData::MAX_DATA_COUNT);

        // Take a private snapshot of the registry. A slot can already be counted
        // while its pointer has not been stored yet; such slots are skipped
        // instead of being dereferenced as null below.
        uint32_t n = 0;
        for (uint32_t k = 0; k < registered; ++k) {
            ProfileScopeData* entry = ProfileScopeData::s_data[k];
            if (entry != nullptr) {
                data[n++] = entry;
            }
        }

        // Bring records of the same thread next to each other.
        std::sort(data, data + n, [](ProfileScopeData* a, ProfileScopeData* b) {
            return strcmp(a->m_threadId, b->m_threadId) < 0;
        });

        for (uint32_t i = 0; i < n;)
        {
            // [i, n1) is the run of records that belong to one thread.
            uint32_t n1 = i;
            while ((n1 < n) && (strcmp(data[i]->m_threadId, data[n1]->m_threadId) == 0)) {
                ++n1;
            }

            // Most expensive scope first; it is the 100% reference for the group.
            std::sort(data + i, data + n1, [](ProfileScopeData* a, ProfileScopeData* b) {
                return a->m_totalCycles > b->m_totalCycles;
            });

            const uint64_t topCycles = data[i]->m_totalCycles;

            for (uint32_t j = i; j < n1; ++j) {
                const ProfileScopeData* p = data[j];

                // Read the counters once: the owning thread keeps updating them.
                const uint64_t cycles  = p->m_totalCycles;
                const uint32_t samples = p->m_totalSamples;

                const double percent = (topCycles > 0) ? (cycles * 100.0 / topCycles) : 0.0;

                // A scope is registered on entry but gets its first sample only on
                // exit, so the sample count can still be zero here. Guard the
                // division and report 0 ns instead of dividing by zero.
                const double averageNs = (samples > 0)
                    ? (static_cast<double>(cycles) / samples * 1e9 / ProfileScopeData::s_tscSpeed)
                    : 0.0;

                LOG_INFO("%s Thread %6s | %-30s | %7.3f%% | %9.0f ns",
                    Tags::profiler(),
                    p->m_threadId,
                    p->m_name,
                    percent,
                    averageNs
                );
            }

            LOG_INFO("%s --------------|--------------------------------|----------|-------------", Tags::profiler());

            i = n1;
        }
#   endif
|
||||||
|
|
||||||
LOG_INFO("%s " WHITE_BOLD("speed") " 10s/60s/15m " CYAN_BOLD("%s") CYAN(" %s %s ") CYAN_BOLD("%s") " max " CYAN_BOLD("%s %s"),
|
LOG_INFO("%s " WHITE_BOLD("speed") " 10s/60s/15m " CYAN_BOLD("%s") CYAN(" %s %s ") CYAN_BOLD("%s") " max " CYAN_BOLD("%s %s"),
|
||||||
Tags::miner(),
|
Tags::miner(),
|
||||||
Hashrate::format(speed[0] * scale, num, sizeof(num) / 4),
|
Hashrate::format(speed[0] * scale, num, sizeof(num) / 4),
|
||||||
|
@ -311,6 +350,10 @@ xmrig::Miner::Miner(Controller *controller)
|
||||||
Platform::setThreadPriority(std::min(priority + 1, 5));
|
Platform::setThreadPriority(std::min(priority + 1, 5));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ifdef XMRIG_FEATURE_PROFILING
|
||||||
|
ProfileScopeData::Init();
|
||||||
|
# endif
|
||||||
|
|
||||||
# ifdef XMRIG_ALGO_RANDOMX
|
# ifdef XMRIG_ALGO_RANDOMX
|
||||||
Rx::init(this);
|
Rx::init(this);
|
||||||
# endif
|
# endif
|
||||||
|
|
|
@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "crypto/randomx/soft_aes.h"
|
#include "crypto/randomx/soft_aes.h"
|
||||||
#include "crypto/randomx/randomx.h"
|
#include "crypto/randomx/randomx.h"
|
||||||
|
#include "base/tools/Profiler.h"
|
||||||
|
|
||||||
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
|
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
|
||||||
#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
|
#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
|
||||||
|
@ -215,6 +216,8 @@ template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
||||||
|
PROFILE_SCOPE(RandomX_AES);
|
||||||
|
|
||||||
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
||||||
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
||||||
|
|
||||||
|
|
|
@ -92,7 +92,7 @@ extern "C" {
|
||||||
int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
||||||
|
|
||||||
/* Simple API */
|
/* Simple API */
|
||||||
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen);
|
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen);
|
||||||
|
|
||||||
/* Argon2 Team - Begin Code */
|
/* Argon2 Team - Begin Code */
|
||||||
int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
|
int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
|
||||||
|
|
|
@ -1,402 +0,0 @@
|
||||||
/*
|
|
||||||
BLAKE2 reference source code package - optimized C implementations
|
|
||||||
|
|
||||||
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
|
|
||||||
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
|
|
||||||
your option. The terms of these licenses can be found at:
|
|
||||||
|
|
||||||
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
|
|
||||||
- OpenSSL license : https://www.openssl.org/source/license.html
|
|
||||||
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
More information about the BLAKE2 hash function can be found at
|
|
||||||
https://blake2.net.
|
|
||||||
*/
|
|
||||||
#ifndef BLAKE2B_LOAD_SSE41_H
|
|
||||||
#define BLAKE2B_LOAD_SSE41_H
|
|
||||||
|
|
||||||
#define LOAD_MSG_0_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m0, m1); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m2, m3); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_0_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m0, m1); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m2, m3); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_0_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m4, m5); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m6, m7); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_0_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m4, m5); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m6, m7); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_1_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m7, m2); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m4, m6); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_1_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m5, m4); \
|
|
||||||
b1 = _mm_alignr_epi8(m3, m7, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_1_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m5, m2); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_1_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m6, m1); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m3, m1); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_2_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_alignr_epi8(m6, m5, 8); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m2, m7); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_2_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m4, m0); \
|
|
||||||
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_2_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m3, m4); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_2_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m7, m3); \
|
|
||||||
b1 = _mm_alignr_epi8(m2, m0, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_3_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m3, m1); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m6, m5); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_3_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m4, m0); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m6, m7); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_3_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
|
|
||||||
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_3_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m3, m5); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m0, m4); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_4_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m4, m2); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m1, m5); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_4_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
|
|
||||||
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_4_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
|
|
||||||
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_4_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_alignr_epi8(m6, m0, 8); \
|
|
||||||
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_5_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m1, m3); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m0, m4); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_5_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m6, m5); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m5, m1); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_5_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m7, m0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_5_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m6, m2); \
|
|
||||||
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_6_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m7, m2); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_6_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m2, m7); \
|
|
||||||
b1 = _mm_alignr_epi8(m5, m6, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_6_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m0, m3); \
|
|
||||||
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_6_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m3, m1); \
|
|
||||||
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_7_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m6, m3); \
|
|
||||||
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_7_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_alignr_epi8(m7, m5, 8); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m0, m4); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_7_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m2, m7); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m4, m1); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_7_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m0, m2); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m3, m5); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_8_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m3, m7); \
|
|
||||||
b1 = _mm_alignr_epi8(m0, m5, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_8_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m7, m4); \
|
|
||||||
b1 = _mm_alignr_epi8(m4, m1, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_8_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = m6; \
|
|
||||||
b1 = _mm_alignr_epi8(m5, m0, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_8_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
|
|
||||||
b1 = m2; \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_9_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m5, m4); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m3, m0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_9_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m1, m2); \
|
|
||||||
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_9_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m7, m4); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m1, m6); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_9_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_alignr_epi8(m7, m5, 8); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m6, m0); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_10_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m0, m1); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m2, m3); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_10_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m0, m1); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m2, m3); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_10_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m4, m5); \
|
|
||||||
b1 = _mm_unpacklo_epi64(m6, m7); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_10_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpackhi_epi64(m4, m5); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m6, m7); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_11_1(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m7, m2); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m4, m6); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_11_2(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m5, m4); \
|
|
||||||
b1 = _mm_alignr_epi8(m3, m7, 8); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_11_3(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m5, m2); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_MSG_11_4(b0, b1) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
b0 = _mm_unpacklo_epi64(m6, m1); \
|
|
||||||
b1 = _mm_unpackhi_epi64(m3, m1); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -102,17 +102,21 @@
|
||||||
row4l = t1; \
|
row4l = t1; \
|
||||||
row4h = t0;
|
row4h = t0;
|
||||||
|
|
||||||
#include "blake2b-load-sse41.h"
|
#define LOAD_MSG(r, i, b0, b1) \
|
||||||
|
do { \
|
||||||
|
b0 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 1]], m[blake2b_sigma_sse41[r][i * 4 + 0]]); \
|
||||||
|
b1 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 3]], m[blake2b_sigma_sse41[r][i * 4 + 2]]); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
#define ROUND(r) \
|
#define ROUND(r) \
|
||||||
LOAD_MSG_ ##r ##_1(b0, b1); \
|
LOAD_MSG(r, 0, b0, b1); \
|
||||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||||
LOAD_MSG_ ##r ##_2(b0, b1); \
|
LOAD_MSG(r, 1, b0, b1); \
|
||||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||||
LOAD_MSG_ ##r ##_3(b0, b1); \
|
LOAD_MSG(r, 2, b0, b1); \
|
||||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||||
LOAD_MSG_ ##r ##_4(b0, b1); \
|
LOAD_MSG(r, 3, b0, b1); \
|
||||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||||
|
|
||||||
|
|
|
@ -56,6 +56,23 @@ static const uint64_t blake2b_IV[8] = {
|
||||||
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
|
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
|
||||||
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
|
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
|
||||||
|
|
||||||
|
#if defined(_M_X64) || defined(__x86_64__)
|
||||||
|
/*
 * Message word schedule used by the LOAD_MSG macro of the SSE4.1 compress
 * routine. One row per round (12 rounds); each row is a permutation of the
 * 16 message word indices 0..15, split into four groups of four. For group i,
 * LOAD_MSG builds b0 from entries [4*i + 0] (low half) and [4*i + 1] (high
 * half), and b1 from entries [4*i + 2] (low) and [4*i + 3] (high).
 * Rows 10 and 11 repeat rows 0 and 1.
 */
static const uint8_t blake2b_sigma_sse41[12][16] = {
    /* round  0 */ {  0,  2,  4,  6,   1,  3,  5,  7,   8, 10, 12, 14,   9, 11, 13, 15 },
    /* round  1 */ { 14,  4,  9, 13,  10,  8, 15,  6,   1,  0, 11,  5,  12,  2,  7,  3 },
    /* round  2 */ { 11, 12,  5, 15,   8,  0,  2, 13,  10,  3,  7,  9,  14,  6,  1,  4 },
    /* round  3 */ {  7,  3, 13, 11,   9,  1, 12, 14,   2,  5,  4, 15,   6, 10,  0,  8 },
    /* round  4 */ {  9,  5,  2, 10,   0,  7,  4, 15,  14, 11,  6,  3,   1, 12,  8, 13 },
    /* round  5 */ {  2,  6,  0,  8,  12, 10, 11,  3,   4,  7, 15,  1,  13,  5, 14,  9 },
    /* round  6 */ { 12,  1, 14,  4,   5, 15, 13, 10,   0,  6,  9,  8,   7,  3,  2, 11 },
    /* round  7 */ { 13,  7, 12,  3,  11, 14,  1,  9,   5, 15,  8,  2,   0,  4,  6, 10 },
    /* round  8 */ {  6, 14, 11,  0,  15,  9,  3,  8,  12, 13,  1, 10,   2,  7,  4,  5 },
    /* round  9 */ { 10,  8,  7,  1,   2,  4,  6,  5,  15,  9,  3, 13,  11, 14, 12,  0 },
    /* round 10 */ {  0,  2,  4,  6,   1,  3,  5,  7,   8, 10, 12, 14,   9, 11, 13, 15 },
    /* round 11 */ { 14,  4,  9, 13,  10,  8, 15,  6,   1,  0, 11,  5,  12,  2,  7,  3 },
};
|
||||||
|
#endif
|
||||||
|
|
||||||
static const uint8_t blake2b_sigma[12][16] = {
|
static const uint8_t blake2b_sigma[12][16] = {
|
||||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||||
|
@ -203,15 +220,6 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
|
||||||
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
||||||
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
||||||
|
|
||||||
const __m128i m0 = LOADU(block + 00);
|
|
||||||
const __m128i m1 = LOADU(block + 16);
|
|
||||||
const __m128i m2 = LOADU(block + 32);
|
|
||||||
const __m128i m3 = LOADU(block + 48);
|
|
||||||
const __m128i m4 = LOADU(block + 64);
|
|
||||||
const __m128i m5 = LOADU(block + 80);
|
|
||||||
const __m128i m6 = LOADU(block + 96);
|
|
||||||
const __m128i m7 = LOADU(block + 112);
|
|
||||||
|
|
||||||
row1l = LOADU(&S->h[0]);
|
row1l = LOADU(&S->h[0]);
|
||||||
row1h = LOADU(&S->h[2]);
|
row1h = LOADU(&S->h[2]);
|
||||||
row2l = LOADU(&S->h[4]);
|
row2l = LOADU(&S->h[4]);
|
||||||
|
@ -221,18 +229,11 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
|
||||||
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
|
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
|
||||||
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
|
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
|
||||||
|
|
||||||
ROUND(0);
|
const uint64_t* m = (const uint64_t*)(block);
|
||||||
ROUND(1);
|
|
||||||
ROUND(2);
|
for (uint32_t r = 0; r < 12; ++r) {
|
||||||
ROUND(3);
|
ROUND(r);
|
||||||
ROUND(4);
|
}
|
||||||
ROUND(5);
|
|
||||||
ROUND(6);
|
|
||||||
ROUND(7);
|
|
||||||
ROUND(8);
|
|
||||||
ROUND(9);
|
|
||||||
ROUND(10);
|
|
||||||
ROUND(11);
|
|
||||||
|
|
||||||
row1l = _mm_xor_si128(row3l, row1l);
|
row1l = _mm_xor_si128(row3l, row1l);
|
||||||
row1h = _mm_xor_si128(row3h, row1h);
|
row1h = _mm_xor_si128(row3h, row1h);
|
||||||
|
@ -388,8 +389,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen,
|
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen) {
|
||||||
const void *key, size_t keylen) {
|
|
||||||
blake2b_state S;
|
blake2b_state S;
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
|
|
||||||
|
@ -402,25 +402,14 @@ int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen,
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
|
if (rx_blake2b_init(&S, outlen) < 0) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (keylen > 0) {
|
if (rx_blake2b_update(&S, in, inlen) < 0) {
|
||||||
if (rx_blake2b_init_key(&S, outlen, key, keylen) < 0) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (rx_blake2b_init(&S, outlen) < 0) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (rx_blake2b_update(&S, in, inlen) < 0) {
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
ret = rx_blake2b_final(&S, out, outlen);
|
ret = rx_blake2b_final(&S, out, outlen);
|
||||||
|
|
||||||
fail:
|
fail:
|
||||||
//clear_internal_memory(&S, sizeof(S));
|
//clear_internal_memory(&S, sizeof(S));
|
||||||
|
@ -442,43 +431,42 @@ int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
|
||||||
store32(outlen_bytes, (uint32_t)outlen);
|
store32(outlen_bytes, (uint32_t)outlen);
|
||||||
|
|
||||||
#define TRY(statement) \
|
#define TRY(statement) \
|
||||||
do { \
|
do { \
|
||||||
ret = statement; \
|
ret = statement; \
|
||||||
if (ret < 0) { \
|
if (ret < 0) { \
|
||||||
goto fail; \
|
goto fail; \
|
||||||
} \
|
} \
|
||||||
} while ((void)0, 0)
|
} while ((void)0, 0)
|
||||||
|
|
||||||
if (outlen <= BLAKE2B_OUTBYTES) {
|
if (outlen <= BLAKE2B_OUTBYTES) {
|
||||||
TRY(rx_blake2b_init(&blake_state, outlen));
|
TRY(rx_blake2b_init(&blake_state, outlen));
|
||||||
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
|
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
|
||||||
TRY(rx_blake2b_update(&blake_state, in, inlen));
|
TRY(rx_blake2b_update(&blake_state, in, inlen));
|
||||||
TRY(rx_blake2b_final(&blake_state, out, outlen));
|
TRY(rx_blake2b_final(&blake_state, out, outlen));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
uint32_t toproduce;
|
uint32_t toproduce;
|
||||||
uint8_t out_buffer[BLAKE2B_OUTBYTES];
|
uint8_t out_buffer[BLAKE2B_OUTBYTES];
|
||||||
uint8_t in_buffer[BLAKE2B_OUTBYTES];
|
uint8_t in_buffer[BLAKE2B_OUTBYTES];
|
||||||
TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
|
TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
|
||||||
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
|
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
|
||||||
TRY(rx_blake2b_update(&blake_state, in, inlen));
|
TRY(rx_blake2b_update(&blake_state, in, inlen));
|
||||||
TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
|
TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
|
||||||
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
|
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
|
||||||
out += BLAKE2B_OUTBYTES / 2;
|
out += BLAKE2B_OUTBYTES / 2;
|
||||||
toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
|
toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
|
||||||
|
|
||||||
while (toproduce > BLAKE2B_OUTBYTES) {
|
while (toproduce > BLAKE2B_OUTBYTES) {
|
||||||
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
|
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
|
||||||
TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
|
TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
|
||||||
BLAKE2B_OUTBYTES, NULL, 0));
|
BLAKE2B_OUTBYTES));
|
||||||
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
|
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
|
||||||
out += BLAKE2B_OUTBYTES / 2;
|
out += BLAKE2B_OUTBYTES / 2;
|
||||||
toproduce -= BLAKE2B_OUTBYTES / 2;
|
toproduce -= BLAKE2B_OUTBYTES / 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
|
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
|
||||||
TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
|
TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES));
|
||||||
0));
|
|
||||||
memcpy(out, out_buffer, toproduce);
|
memcpy(out, out_buffer, toproduce);
|
||||||
}
|
}
|
||||||
fail:
|
fail:
|
||||||
|
|
|
@ -55,7 +55,7 @@ namespace randomx {
|
||||||
|
|
||||||
void Blake2Generator::checkData(const size_t bytesNeeded) {
|
void Blake2Generator::checkData(const size_t bytesNeeded) {
|
||||||
if (dataIndex + bytesNeeded > sizeof(data)) {
|
if (dataIndex + bytesNeeded > sizeof(data)) {
|
||||||
rx_blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
|
rx_blake2b(data, sizeof(data), data, sizeof(data));
|
||||||
dataIndex = 0;
|
dataIndex = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "crypto/randomx/program.hpp"
|
#include "crypto/randomx/program.hpp"
|
||||||
#include "crypto/randomx/reciprocal.h"
|
#include "crypto/randomx/reciprocal.h"
|
||||||
#include "crypto/randomx/virtual_memory.hpp"
|
#include "crypto/randomx/virtual_memory.hpp"
|
||||||
|
#include "base/tools/Profiler.h"
|
||||||
|
|
||||||
#ifdef XMRIG_FIX_RYZEN
|
#ifdef XMRIG_FIX_RYZEN
|
||||||
# include "crypto/rx/Rx.h"
|
# include "crypto/rx/Rx.h"
|
||||||
|
@ -255,6 +256,8 @@ namespace randomx {
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
|
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
|
||||||
|
PROFILE_SCOPE(RandomX_JIT_compile);
|
||||||
|
|
||||||
vm_flags = flags;
|
vm_flags = flags;
|
||||||
|
|
||||||
generateProgramPrologue(prog, pcfg);
|
generateProgramPrologue(prog, pcfg);
|
||||||
|
|
|
@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
|
#include "base/tools/Profiler.h"
|
||||||
|
|
||||||
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
|
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
|
||||||
{
|
{
|
||||||
ArgonSalt = "RandomWOW\x01";
|
ArgonSalt = "RandomWOW\x01";
|
||||||
|
@ -574,33 +576,35 @@ extern "C" {
|
||||||
assert(inputSize == 0 || input != nullptr);
|
assert(inputSize == 0 || input != nullptr);
|
||||||
assert(output != nullptr);
|
assert(output != nullptr);
|
||||||
alignas(16) uint64_t tempHash[8];
|
alignas(16) uint64_t tempHash[8];
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
|
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize);
|
||||||
machine->initScratchpad(&tempHash);
|
machine->initScratchpad(&tempHash);
|
||||||
machine->resetRoundingMode();
|
machine->resetRoundingMode();
|
||||||
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
||||||
machine->run(&tempHash);
|
machine->run(&tempHash);
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
|
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile));
|
||||||
}
|
}
|
||||||
machine->run(&tempHash);
|
machine->run(&tempHash);
|
||||||
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
|
machine->getFinalResult(output);
|
||||||
}
|
}
|
||||||
|
|
||||||
void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) {
|
void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) {
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
|
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize);
|
||||||
machine->initScratchpad(tempHash);
|
machine->initScratchpad(tempHash);
|
||||||
}
|
}
|
||||||
|
|
||||||
void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) {
|
void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) {
|
||||||
|
PROFILE_SCOPE(RandomX_hash);
|
||||||
|
|
||||||
machine->resetRoundingMode();
|
machine->resetRoundingMode();
|
||||||
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
||||||
machine->run(&tempHash);
|
machine->run(&tempHash);
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
|
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile));
|
||||||
}
|
}
|
||||||
machine->run(&tempHash);
|
machine->run(&tempHash);
|
||||||
|
|
||||||
// Finish current hash and fill the scratchpad for the next hash at the same time
|
// Finish current hash and fill the scratchpad for the next hash at the same time
|
||||||
rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0);
|
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), nextInput, nextInputSize);
|
||||||
machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash);
|
machine->hashAndFill(output, tempHash);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "crypto/randomx/blake2/blake2.h"
|
#include "crypto/randomx/blake2/blake2.h"
|
||||||
#include "crypto/randomx/intrin_portable.h"
|
#include "crypto/randomx/intrin_portable.h"
|
||||||
#include "crypto/randomx/allocator.hpp"
|
#include "crypto/randomx/allocator.hpp"
|
||||||
|
#include "base/tools/Profiler.h"
|
||||||
|
|
||||||
randomx_vm::~randomx_vm() {
|
randomx_vm::~randomx_vm() {
|
||||||
|
|
||||||
|
@ -109,15 +110,15 @@ namespace randomx {
|
||||||
}
|
}
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void VmBase<softAes>::getFinalResult(void* out, size_t outSize) {
|
void VmBase<softAes>::getFinalResult(void* out) {
|
||||||
hashAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a);
|
hashAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a);
|
||||||
rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
|
rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void VmBase<softAes>::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) {
|
void VmBase<softAes>::hashAndFill(void* out, uint64_t (&fill_state)[8]) {
|
||||||
hashAndFillAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a, fill_state);
|
hashAndFillAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a, fill_state);
|
||||||
rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
|
rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
|
@ -127,6 +128,7 @@ namespace randomx {
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void VmBase<softAes>::generateProgram(void* seed) {
|
void VmBase<softAes>::generateProgram(void* seed) {
|
||||||
|
PROFILE_SCOPE(RandomX_generate_program);
|
||||||
fillAes4Rx4<softAes>(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program);
|
fillAes4Rx4<softAes>(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -38,8 +38,8 @@ class randomx_vm
|
||||||
public:
|
public:
|
||||||
virtual ~randomx_vm() = 0;
|
virtual ~randomx_vm() = 0;
|
||||||
virtual void setScratchpad(uint8_t *scratchpad) = 0;
|
virtual void setScratchpad(uint8_t *scratchpad) = 0;
|
||||||
virtual void getFinalResult(void* out, size_t outSize) = 0;
|
virtual void getFinalResult(void* out) = 0;
|
||||||
virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0;
|
virtual void hashAndFill(void* out, uint64_t (&fill_state)[8]) = 0;
|
||||||
virtual void setDataset(randomx_dataset* dataset) { }
|
virtual void setDataset(randomx_dataset* dataset) { }
|
||||||
virtual void setCache(randomx_cache* cache) { }
|
virtual void setCache(randomx_cache* cache) { }
|
||||||
virtual void initScratchpad(void* seed) = 0;
|
virtual void initScratchpad(void* seed) = 0;
|
||||||
|
@ -86,8 +86,8 @@ namespace randomx {
|
||||||
~VmBase() override;
|
~VmBase() override;
|
||||||
void setScratchpad(uint8_t *scratchpad) override;
|
void setScratchpad(uint8_t *scratchpad) override;
|
||||||
void initScratchpad(void* seed) override;
|
void initScratchpad(void* seed) override;
|
||||||
void getFinalResult(void* out, size_t outSize) override;
|
void getFinalResult(void* out) override;
|
||||||
void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override;
|
void hashAndFill(void* out, uint64_t (&fill_state)[8]) override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void generateProgram(void* seed);
|
void generateProgram(void* seed);
|
||||||
|
|
|
@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "crypto/randomx/vm_compiled.hpp"
|
#include "crypto/randomx/vm_compiled.hpp"
|
||||||
#include "crypto/randomx/common.hpp"
|
#include "crypto/randomx/common.hpp"
|
||||||
|
#include "base/tools/Profiler.h"
|
||||||
|
|
||||||
namespace randomx {
|
namespace randomx {
|
||||||
|
|
||||||
|
@ -41,6 +42,8 @@ namespace randomx {
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void CompiledVm<softAes>::run(void* seed) {
|
void CompiledVm<softAes>::run(void* seed) {
|
||||||
|
PROFILE_SCOPE(RandomX_run);
|
||||||
|
|
||||||
compiler.prepare();
|
compiler.prepare();
|
||||||
VmBase<softAes>::generateProgram(seed);
|
VmBase<softAes>::generateProgram(seed);
|
||||||
randomx_vm::initialize();
|
randomx_vm::initialize();
|
||||||
|
@ -51,6 +54,8 @@ namespace randomx {
|
||||||
|
|
||||||
template<bool softAes>
|
template<bool softAes>
|
||||||
void CompiledVm<softAes>::execute() {
|
void CompiledVm<softAes>::execute() {
|
||||||
|
PROFILE_SCOPE(RandomX_JIT_execute);
|
||||||
|
|
||||||
#ifdef XMRIG_ARM
|
#ifdef XMRIG_ARM
|
||||||
memcpy(reg.f, config.eMask, sizeof(config.eMask));
|
memcpy(reg.f, config.eMask, sizeof(config.eMask));
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -120,8 +120,8 @@ bool xmrig::RxConfig::read(const rapidjson::Value &value)
|
||||||
}
|
}
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
const int mode = Json::getInt(value, kScratchpadPrefetchMode, static_cast<int>(m_scratchpadPrefetchMode));
|
const uint32_t mode = static_cast<uint32_t>(Json::getInt(value, kScratchpadPrefetchMode, static_cast<int>(m_scratchpadPrefetchMode)));
|
||||||
if ((mode >= ScratchpadPrefetchOff) && (mode < ScratchpadPrefetchMax)) {
|
if (mode < ScratchpadPrefetchMax) {
|
||||||
m_scratchpadPrefetchMode = static_cast<ScratchpadPrefetchMode>(mode);
|
m_scratchpadPrefetchMode = static_cast<ScratchpadPrefetchMode>(mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue