mirror of
https://github.com/xmrig/xmrig.git
synced 2024-12-23 12:09:22 +00:00
Merge branch 'dev'
This commit is contained in:
commit
cfe2a098ce
19 changed files with 203 additions and 116 deletions
|
@ -1,3 +1,10 @@
|
|||
# v6.3.5
|
||||
- [#1845](https://github.com/xmrig/xmrig/pull/1845) [#1861](https://github.com/xmrig/xmrig/pull/1861) Fixed ARM build and added CMake option `WITH_SSE4_1`.
|
||||
- [#1846](https://github.com/xmrig/xmrig/pull/1846) KawPow: fixed OpenCL memory leak.
|
||||
- [#1849](https://github.com/xmrig/xmrig/pull/1849) [#1859](https://github.com/xmrig/xmrig/pull/1859) RandomX: optimized soft AES code.
|
||||
- [#1850](https://github.com/xmrig/xmrig/pull/1850) [#1852](https://github.com/xmrig/xmrig/pull/1852) General code improvements.
|
||||
- [#1853](https://github.com/xmrig/xmrig/issues/1853) [#1856](https://github.com/xmrig/xmrig/pull/1856) [#1857](https://github.com/xmrig/xmrig/pull/1857) Fixed crash on old CPUs.
|
||||
|
||||
# v6.3.4
|
||||
- [#1823](https://github.com/xmrig/xmrig/pull/1823) RandomX: added new option `scratchpad_prefetch_mode`.
|
||||
- [#1827](https://github.com/xmrig/xmrig/pull/1827) [#1831](https://github.com/xmrig/xmrig/pull/1831) Improved nonce iteration performance.
|
||||
|
|
|
@ -24,6 +24,7 @@ option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (
|
|||
option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON)
|
||||
option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
|
||||
option(WITH_PROFILING "Enable profiling for developers" OFF)
|
||||
option(WITH_SSE4_1 "Enable SSE 4.1 for Blake2" ON)
|
||||
|
||||
option(BUILD_STATIC "Build static binary" OFF)
|
||||
option(ARM_TARGET "Force use specific ARM target 8 or 7" 0)
|
||||
|
|
|
@ -2,9 +2,10 @@ if (NOT CMAKE_SYSTEM_PROCESSOR)
|
|||
message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined")
|
||||
endif()
|
||||
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
add_definitions(/DRAPIDJSON_SSE2)
|
||||
else()
|
||||
set(WITH_SSE4_1 OFF)
|
||||
endif()
|
||||
|
||||
if (NOT ARM_TARGET)
|
||||
|
@ -41,3 +42,7 @@ if (ARM_TARGET AND ARM_TARGET GREATER 6)
|
|||
add_definitions(/DXMRIG_ARMv7)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (WITH_SSE4_1)
|
||||
add_definitions(/DXMRIG_FEATURE_SSE4_1)
|
||||
endif()
|
||||
|
|
|
@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
|
|||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions")
|
||||
else()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
|
||||
|
||||
add_definitions(/DHAVE_ROTR)
|
||||
endif()
|
||||
|
@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
|
|||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
|
||||
else()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
|
||||
|
||||
check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR)
|
||||
if (HAVE_ROTR)
|
||||
|
|
|
@ -64,6 +64,14 @@ if (WITH_RANDOMX)
|
|||
set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
|
||||
endif()
|
||||
|
||||
if (WITH_SSE4_1)
|
||||
list(APPEND SOURCES_CRYPTO src/crypto/randomx/blake2/blake2b_sse41.c)
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
|
||||
set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES Clang)
|
||||
set_source_files_properties(src/crypto/randomx/jit_compiler_x86.cpp PROPERTIES COMPILE_FLAGS -Wno-unused-const-variable)
|
||||
endif()
|
||||
|
|
1
doc/build/CMAKE_OPTIONS.md
vendored
1
doc/build/CMAKE_OPTIONS.md
vendored
|
@ -22,6 +22,7 @@ This feature add external dependency to libhwloc (1.10.0+) (except MSVC builds).
|
|||
* **`-DWITH_EMBEDDED_CONFIG=ON`** Enable [embedded](https://github.com/xmrig/xmrig/issues/957) config support.
|
||||
* **`-DWITH_OPENCL=OFF`** Disable OpenCL backend.
|
||||
* **`-DWITH_CUDA=OFF`** Disable CUDA backend.
|
||||
* **`-DWITH_SSE4_1=OFF`** Disable SSE 4.1 for Blake2 (useful for arm builds).
|
||||
|
||||
## Debug options
|
||||
|
||||
|
|
|
@ -69,8 +69,6 @@ OclKawPowRunner::~OclKawPowRunner()
|
|||
|
||||
delete m_calculateDagKernel;
|
||||
|
||||
OclLib::release(m_searchKernel);
|
||||
|
||||
OclLib::release(m_controlQueue);
|
||||
OclLib::release(m_stop);
|
||||
|
||||
|
@ -120,8 +118,7 @@ void OclKawPowRunner::run(uint32_t nonce, uint32_t *hashOutput)
|
|||
void OclKawPowRunner::set(const Job &job, uint8_t *blob)
|
||||
{
|
||||
m_blockHeight = static_cast<uint32_t>(job.height());
|
||||
m_searchProgram = OclKawPow::get(*this, m_blockHeight, m_workGroupSize);
|
||||
m_searchKernel = OclLib::createKernel(m_searchProgram, "progpow_search");
|
||||
m_searchKernel = OclKawPow::get(*this, m_blockHeight, m_workGroupSize);
|
||||
|
||||
const uint32_t epoch = m_blockHeight / KPHash::EPOCH_LENGTH;
|
||||
|
||||
|
|
|
@ -69,7 +69,6 @@ private:
|
|||
|
||||
KawPow_CalculateDAGKernel* m_calculateDagKernel = nullptr;
|
||||
|
||||
cl_program m_searchProgram = nullptr;
|
||||
cl_kernel m_searchKernel = nullptr;
|
||||
|
||||
size_t m_workGroupSize = 256;
|
||||
|
|
|
@ -54,8 +54,9 @@ namespace xmrig {
|
|||
class KawPowCacheEntry
|
||||
{
|
||||
public:
|
||||
inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program) :
|
||||
inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel) :
|
||||
program(program),
|
||||
kernel(kernel),
|
||||
m_algo(algo),
|
||||
m_index(index),
|
||||
m_period(period),
|
||||
|
@ -65,9 +66,10 @@ public:
|
|||
inline bool isExpired(uint64_t period) const { return m_period + 1 < period; }
|
||||
inline bool match(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index) const { return m_algo == algo && m_period == period && m_worksize == worksize && m_index == index; }
|
||||
inline bool match(const IOclRunner &runner, uint64_t period, uint32_t worksize) const { return match(runner.algorithm(), period, worksize, runner.deviceIndex()); }
|
||||
inline void release() { OclLib::release(program); }
|
||||
inline void release() { OclLib::release(kernel); OclLib::release(program); }
|
||||
|
||||
cl_program program;
|
||||
cl_kernel kernel;
|
||||
|
||||
private:
|
||||
Algorithm m_algo;
|
||||
|
@ -82,16 +84,16 @@ class KawPowCache
|
|||
public:
|
||||
KawPowCache() = default;
|
||||
|
||||
inline cl_program search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); }
|
||||
inline cl_kernel search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); }
|
||||
|
||||
|
||||
inline cl_program search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index)
|
||||
inline cl_kernel search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
for (const auto &entry : m_data) {
|
||||
if (entry.match(algo, period, worksize, index)) {
|
||||
return entry.program;
|
||||
return entry.kernel;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -99,9 +101,10 @@ public:
|
|||
}
|
||||
|
||||
|
||||
void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program)
|
||||
void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel)
|
||||
{
|
||||
if (search(algo, period, worksize, index)) {
|
||||
OclLib::release(kernel);
|
||||
OclLib::release(program);
|
||||
return;
|
||||
}
|
||||
|
@ -109,7 +112,7 @@ public:
|
|||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
gc(period);
|
||||
m_data.emplace_back(algo, period, worksize, index, program);
|
||||
m_data.emplace_back(algo, period, worksize, index, program, kernel);
|
||||
}
|
||||
|
||||
|
||||
|
@ -159,15 +162,15 @@ static KawPowCache cache;
|
|||
class KawPowBuilder
|
||||
{
|
||||
public:
|
||||
cl_program build(const IOclRunner &runner, uint64_t period, uint32_t worksize)
|
||||
cl_kernel build(const IOclRunner &runner, uint64_t period, uint32_t worksize)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
const uint64_t ts = Chrono::steadyMSecs();
|
||||
|
||||
cl_program program = cache.search(runner, period, worksize);
|
||||
if (program) {
|
||||
return program;
|
||||
cl_kernel kernel = cache.search(runner, period, worksize);
|
||||
if (kernel) {
|
||||
return kernel;
|
||||
}
|
||||
|
||||
cl_int ret;
|
||||
|
@ -175,7 +178,7 @@ public:
|
|||
cl_device_id device = runner.data().device.id();
|
||||
const char *s = source.c_str();
|
||||
|
||||
program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret);
|
||||
cl_program program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret);
|
||||
if (ret != CL_SUCCESS) {
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -199,11 +202,17 @@ public:
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
kernel = OclLib::createKernel(program, "progpow_search", &ret);
|
||||
if (ret != CL_SUCCESS) {
|
||||
OclLib::release(program);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
LOG_INFO("%s " YELLOW("KawPow") " program for period " WHITE_BOLD("%" PRIu64) " compiled " BLACK_BOLD("(%" PRIu64 "ms)"), Tags::opencl(), period, Chrono::steadyMSecs() - ts);
|
||||
|
||||
cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program);
|
||||
cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program, kernel);
|
||||
|
||||
return program;
|
||||
return kernel;
|
||||
}
|
||||
|
||||
|
||||
|
@ -382,7 +391,7 @@ public:
|
|||
static KawPowBuilder builder;
|
||||
|
||||
|
||||
cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize)
|
||||
cl_kernel OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize)
|
||||
{
|
||||
const uint64_t period = height / KPHash::PERIOD_LENGTH;
|
||||
|
||||
|
@ -396,9 +405,9 @@ cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t wo
|
|||
[](uv_work_t *req, int) { delete static_cast<KawPowBaton*>(req->data); }
|
||||
);
|
||||
|
||||
cl_program program = cache.search(runner, period, worksize);
|
||||
if (program) {
|
||||
return program;
|
||||
cl_kernel kernel = cache.search(runner, period, worksize);
|
||||
if (kernel) {
|
||||
return kernel;
|
||||
}
|
||||
|
||||
return builder.build(runner, period, worksize);
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
#include <cstdint>
|
||||
|
||||
|
||||
using cl_program = struct _cl_program *;
|
||||
using cl_kernel = struct _cl_kernel *;
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
@ -42,7 +42,7 @@ class IOclRunner;
|
|||
class OclKawPow
|
||||
{
|
||||
public:
|
||||
static cl_program get(const IOclRunner &runner, uint64_t height, uint32_t worksize);
|
||||
static cl_kernel get(const IOclRunner &runner, uint64_t height, uint32_t worksize);
|
||||
static void clear();
|
||||
};
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@ public:
|
|||
|
||||
inline bool isCN() const { auto f = family(); return f == CN || f == CN_LITE || f == CN_HEAVY || f == CN_PICO; }
|
||||
inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; }
|
||||
inline bool isValid() const { return m_id != INVALID; }
|
||||
inline bool isValid() const { return m_id != INVALID && family() != UNKNOWN; }
|
||||
inline const char *name() const { return name(false); }
|
||||
inline const char *shortName() const { return name(true); }
|
||||
inline Family family() const { return family(m_id); }
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
xmrig::String::String(const char *str) :
|
||||
m_size(str == nullptr ? 0 : strlen(str))
|
||||
{
|
||||
if (m_size == 0) {
|
||||
if (str == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ public:
|
|||
for (int i = 0; i < Algorithm::MAX; ++i) {
|
||||
const Algorithm algo(static_cast<Algorithm::Id>(i));
|
||||
|
||||
if (isEnabled(algo)) {
|
||||
if (algo.isValid() && isEnabled(algo)) {
|
||||
algorithms.push_back(algo);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -260,17 +260,32 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
|
|||
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \
|
||||
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3);
|
||||
|
||||
HASH_STATE(0);
|
||||
HASH_STATE(1);
|
||||
switch(softAes) {
|
||||
case 0:
|
||||
HASH_STATE(0);
|
||||
HASH_STATE(1);
|
||||
|
||||
FILL_STATE(0);
|
||||
FILL_STATE(1);
|
||||
FILL_STATE(0);
|
||||
FILL_STATE(1);
|
||||
|
||||
rx_prefetch_t0(prefetchPtr);
|
||||
rx_prefetch_t0(prefetchPtr + 64);
|
||||
rx_prefetch_t0(prefetchPtr);
|
||||
rx_prefetch_t0(prefetchPtr + 64);
|
||||
|
||||
scratchpadPtr += 128;
|
||||
prefetchPtr += 128;
|
||||
scratchpadPtr += 128;
|
||||
prefetchPtr += 128;
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
HASH_STATE(0);
|
||||
FILL_STATE(0);
|
||||
rx_prefetch_t0(prefetchPtr);
|
||||
|
||||
scratchpadPtr += 64;
|
||||
prefetchPtr += 64;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
prefetchPtr = (const char*) scratchpad;
|
||||
scratchpadEnd += PREFETCH_DISTANCE;
|
||||
|
|
|
@ -39,40 +39,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "crypto/randomx/blake2/blake2.h"
|
||||
#include "crypto/randomx/blake2/blake2-impl.h"
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include "blake2b-round.h"
|
||||
|
||||
#endif
|
||||
|
||||
static const uint64_t blake2b_IV[8] = {
|
||||
const uint64_t blake2b_IV[8] = {
|
||||
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
|
||||
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
|
||||
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
|
||||
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
static const uint8_t blake2b_sigma_sse41[12][16] = {
|
||||
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
|
||||
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
|
||||
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
|
||||
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
|
||||
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
|
||||
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
|
||||
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
|
||||
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
|
||||
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
|
||||
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
|
||||
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
|
||||
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
|
||||
};
|
||||
#endif
|
||||
|
||||
static const uint8_t blake2b_sigma[12][16] = {
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
|
@ -207,46 +179,6 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
|
||||
{
|
||||
__m128i row1l, row1h;
|
||||
__m128i row2l, row2h;
|
||||
__m128i row3l, row3h;
|
||||
__m128i row4l, row4h;
|
||||
__m128i b0, b1;
|
||||
__m128i t0, t1;
|
||||
|
||||
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
||||
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
||||
|
||||
row1l = LOADU(&S->h[0]);
|
||||
row1h = LOADU(&S->h[2]);
|
||||
row2l = LOADU(&S->h[4]);
|
||||
row2h = LOADU(&S->h[6]);
|
||||
row3l = LOADU(&blake2b_IV[0]);
|
||||
row3h = LOADU(&blake2b_IV[2]);
|
||||
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
|
||||
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
|
||||
|
||||
const uint64_t* m = (const uint64_t*)(block);
|
||||
|
||||
for (uint32_t r = 0; r < 12; ++r) {
|
||||
ROUND(r);
|
||||
}
|
||||
|
||||
row1l = _mm_xor_si128(row3l, row1l);
|
||||
row1h = _mm_xor_si128(row3h, row1h);
|
||||
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
|
||||
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
|
||||
row2l = _mm_xor_si128(row4l, row2l);
|
||||
row2h = _mm_xor_si128(row4h, row2h);
|
||||
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
|
||||
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
|
||||
}
|
||||
#undef ROUND
|
||||
#endif
|
||||
|
||||
static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) {
|
||||
uint64_t m[16];
|
||||
uint64_t v[16];
|
||||
|
@ -305,9 +237,10 @@ static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block)
|
|||
#undef ROUND
|
||||
}
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
#if defined(XMRIG_FEATURE_SSE4_1)
|
||||
|
||||
uint32_t rx_blake2b_use_sse41 = 0;
|
||||
void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t* block);
|
||||
|
||||
#define rx_blake2b_compress(S, block) \
|
||||
if (rx_blake2b_use_sse41) \
|
||||
|
|
108
src/crypto/randomx/blake2/blake2b_sse41.c
Normal file
108
src/crypto/randomx/blake2/blake2b_sse41.c
Normal file
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
||||
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Original code from Argon2 reference source code package used under CC0 Licence
|
||||
* https://github.com/P-H-C/phc-winner-argon2
|
||||
* Copyright 2015
|
||||
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
||||
*/
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "crypto/randomx/blake2/blake2.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include "blake2b-round.h"
|
||||
|
||||
|
||||
extern const uint64_t blake2b_IV[8];
|
||||
|
||||
|
||||
static const uint8_t blake2b_sigma_sse41[12][16] = {
|
||||
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
|
||||
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
|
||||
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
|
||||
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
|
||||
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
|
||||
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
|
||||
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
|
||||
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
|
||||
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
|
||||
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
|
||||
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
|
||||
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
|
||||
};
|
||||
|
||||
|
||||
void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
|
||||
{
|
||||
__m128i row1l, row1h;
|
||||
__m128i row2l, row2h;
|
||||
__m128i row3l, row3h;
|
||||
__m128i row4l, row4h;
|
||||
__m128i b0, b1;
|
||||
__m128i t0, t1;
|
||||
|
||||
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
||||
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
||||
|
||||
row1l = LOADU(&S->h[0]);
|
||||
row1h = LOADU(&S->h[2]);
|
||||
row2l = LOADU(&S->h[4]);
|
||||
row2h = LOADU(&S->h[6]);
|
||||
row3l = LOADU(&blake2b_IV[0]);
|
||||
row3h = LOADU(&blake2b_IV[2]);
|
||||
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
|
||||
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
|
||||
|
||||
const uint64_t* m = (const uint64_t*)(block);
|
||||
|
||||
for (uint32_t r = 0; r < 12; ++r) {
|
||||
ROUND(r);
|
||||
}
|
||||
|
||||
row1l = _mm_xor_si128(row3l, row1l);
|
||||
row1h = _mm_xor_si128(row3h, row1h);
|
||||
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
|
||||
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
|
||||
row2l = _mm_xor_si128(row4l, row2l);
|
||||
row2h = _mm_xor_si128(row4h, row2h);
|
||||
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
|
||||
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
|
||||
}
|
||||
#endif
|
|
@ -337,12 +337,16 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
|
|||
INST_HANDLE(FDIV_M, FMUL_R);
|
||||
INST_HANDLE(FSQRT_R, FDIV_M);
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
if (xmrig::Cpu::info()->jccErratum()) {
|
||||
INST_HANDLE2(CBRANCH, CBRANCH<true>, FSQRT_R);
|
||||
}
|
||||
else {
|
||||
INST_HANDLE2(CBRANCH, CBRANCH<false>, FSQRT_R);
|
||||
}
|
||||
#else
|
||||
INST_HANDLE(CBRANCH, FSQRT_R);
|
||||
#endif
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
if (xmrig::Cpu::info()->hasBMI2()) {
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
#include "crypto/rx/RxVm.h"
|
||||
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
#if defined(XMRIG_FEATURE_SSE4_1)
|
||||
extern "C" uint32_t rx_blake2b_use_sse41;
|
||||
#endif
|
||||
|
||||
|
@ -60,7 +60,7 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
|
|||
flags |= RANDOMX_FLAG_AMD;
|
||||
}
|
||||
|
||||
# if defined(_M_X64) || defined(__x86_64__)
|
||||
# if defined(XMRIG_FEATURE_SSE4_1)
|
||||
rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0;
|
||||
# endif
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
#define APP_ID "xmrig"
|
||||
#define APP_NAME "XMRig"
|
||||
#define APP_DESC "XMRig miner"
|
||||
#define APP_VERSION "6.3.4"
|
||||
#define APP_VERSION "6.3.5-dev"
|
||||
#define APP_DOMAIN "xmrig.com"
|
||||
#define APP_SITE "www.xmrig.com"
|
||||
#define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"
|
||||
|
@ -36,7 +36,7 @@
|
|||
|
||||
#define APP_VER_MAJOR 6
|
||||
#define APP_VER_MINOR 3
|
||||
#define APP_VER_PATCH 4
|
||||
#define APP_VER_PATCH 5
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# if (_MSC_VER >= 1920)
|
||||
|
|
Loading…
Reference in a new issue