Merge branch 'dev'

This commit is contained in:
XMRig 2020-10-03 11:47:07 +07:00
commit cfe2a098ce
No known key found for this signature in database
GPG key ID: 446A53638BE94409
19 changed files with 203 additions and 116 deletions

View file

@ -1,3 +1,10 @@
# v6.3.5
- [#1845](https://github.com/xmrig/xmrig/pull/1845) [#1861](https://github.com/xmrig/xmrig/pull/1861) Fixed ARM build and added CMake option `WITH_SSE4_1`.
- [#1846](https://github.com/xmrig/xmrig/pull/1846) KawPow: fixed OpenCL memory leak.
- [#1849](https://github.com/xmrig/xmrig/pull/1849) [#1859](https://github.com/xmrig/xmrig/pull/1859) RandomX: optimized soft AES code.
- [#1850](https://github.com/xmrig/xmrig/pull/1850) [#1852](https://github.com/xmrig/xmrig/pull/1852) General code improvements.
- [#1853](https://github.com/xmrig/xmrig/issues/1853) [#1856](https://github.com/xmrig/xmrig/pull/1856) [#1857](https://github.com/xmrig/xmrig/pull/1857) Fixed crash on old CPUs.
# v6.3.4
- [#1823](https://github.com/xmrig/xmrig/pull/1823) RandomX: added new option `scratchpad_prefetch_mode`.
- [#1827](https://github.com/xmrig/xmrig/pull/1827) [#1831](https://github.com/xmrig/xmrig/pull/1831) Improved nonce iteration performance.

View file

@ -24,6 +24,7 @@ option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (
option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON)
option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
option(WITH_PROFILING "Enable profiling for developers" OFF)
option(WITH_SSE4_1 "Enable SSE 4.1 for Blake2" ON)
option(BUILD_STATIC "Build static binary" OFF)
option(ARM_TARGET "Force use specific ARM target 8 or 7" 0)

View file

@ -2,9 +2,10 @@ if (NOT CMAKE_SYSTEM_PROCESSOR)
message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
add_definitions(/DRAPIDJSON_SSE2)
else()
set(WITH_SSE4_1 OFF)
endif()
if (NOT ARM_TARGET)
@ -41,3 +42,7 @@ if (ARM_TARGET AND ARM_TARGET GREATER 6)
add_definitions(/DXMRIG_ARMv7)
endif()
endif()
if (WITH_SSE4_1)
add_definitions(/DXMRIG_FEATURE_SSE4_1)
endif()

View file

@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
add_definitions(/DHAVE_ROTR)
endif()
@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR)
if (HAVE_ROTR)

View file

@ -64,6 +64,14 @@ if (WITH_RANDOMX)
set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
endif()
if (WITH_SSE4_1)
list(APPEND SOURCES_CRYPTO src/crypto/randomx/blake2/blake2b_sse41.c)
if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1)
endif()
endif()
if (CMAKE_CXX_COMPILER_ID MATCHES Clang)
set_source_files_properties(src/crypto/randomx/jit_compiler_x86.cpp PROPERTIES COMPILE_FLAGS -Wno-unused-const-variable)
endif()

View file

@ -22,6 +22,7 @@ This feature add external dependency to libhwloc (1.10.0+) (except MSVC builds).
* **`-DWITH_EMBEDDED_CONFIG=ON`** Enable [embedded](https://github.com/xmrig/xmrig/issues/957) config support.
* **`-DWITH_OPENCL=OFF`** Disable OpenCL backend.
* **`-DWITH_CUDA=OFF`** Disable CUDA backend.
* **`-DWITH_SSE4_1=OFF`** Disable SSE 4.1 for Blake2 (useful for arm builds).
## Debug options

View file

@ -69,8 +69,6 @@ OclKawPowRunner::~OclKawPowRunner()
delete m_calculateDagKernel;
OclLib::release(m_searchKernel);
OclLib::release(m_controlQueue);
OclLib::release(m_stop);
@ -120,8 +118,7 @@ void OclKawPowRunner::run(uint32_t nonce, uint32_t *hashOutput)
void OclKawPowRunner::set(const Job &job, uint8_t *blob)
{
m_blockHeight = static_cast<uint32_t>(job.height());
m_searchProgram = OclKawPow::get(*this, m_blockHeight, m_workGroupSize);
m_searchKernel = OclLib::createKernel(m_searchProgram, "progpow_search");
m_searchKernel = OclKawPow::get(*this, m_blockHeight, m_workGroupSize);
const uint32_t epoch = m_blockHeight / KPHash::EPOCH_LENGTH;

View file

@ -69,7 +69,6 @@ private:
KawPow_CalculateDAGKernel* m_calculateDagKernel = nullptr;
cl_program m_searchProgram = nullptr;
cl_kernel m_searchKernel = nullptr;
size_t m_workGroupSize = 256;

View file

@ -54,8 +54,9 @@ namespace xmrig {
class KawPowCacheEntry
{
public:
inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program) :
inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel) :
program(program),
kernel(kernel),
m_algo(algo),
m_index(index),
m_period(period),
@ -65,9 +66,10 @@ public:
inline bool isExpired(uint64_t period) const { return m_period + 1 < period; }
inline bool match(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index) const { return m_algo == algo && m_period == period && m_worksize == worksize && m_index == index; }
inline bool match(const IOclRunner &runner, uint64_t period, uint32_t worksize) const { return match(runner.algorithm(), period, worksize, runner.deviceIndex()); }
inline void release() { OclLib::release(program); }
inline void release() { OclLib::release(kernel); OclLib::release(program); }
cl_program program;
cl_kernel kernel;
private:
Algorithm m_algo;
@ -82,16 +84,16 @@ class KawPowCache
public:
KawPowCache() = default;
inline cl_program search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); }
inline cl_kernel search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); }
inline cl_program search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index)
inline cl_kernel search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index)
{
std::lock_guard<std::mutex> lock(m_mutex);
for (const auto &entry : m_data) {
if (entry.match(algo, period, worksize, index)) {
return entry.program;
return entry.kernel;
}
}
@ -99,9 +101,10 @@ public:
}
void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program)
void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel)
{
if (search(algo, period, worksize, index)) {
OclLib::release(kernel);
OclLib::release(program);
return;
}
@ -109,7 +112,7 @@ public:
std::lock_guard<std::mutex> lock(m_mutex);
gc(period);
m_data.emplace_back(algo, period, worksize, index, program);
m_data.emplace_back(algo, period, worksize, index, program, kernel);
}
@ -159,15 +162,15 @@ static KawPowCache cache;
class KawPowBuilder
{
public:
cl_program build(const IOclRunner &runner, uint64_t period, uint32_t worksize)
cl_kernel build(const IOclRunner &runner, uint64_t period, uint32_t worksize)
{
std::lock_guard<std::mutex> lock(m_mutex);
const uint64_t ts = Chrono::steadyMSecs();
cl_program program = cache.search(runner, period, worksize);
if (program) {
return program;
cl_kernel kernel = cache.search(runner, period, worksize);
if (kernel) {
return kernel;
}
cl_int ret;
@ -175,7 +178,7 @@ public:
cl_device_id device = runner.data().device.id();
const char *s = source.c_str();
program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret);
cl_program program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret);
if (ret != CL_SUCCESS) {
return nullptr;
}
@ -199,11 +202,17 @@ public:
return nullptr;
}
kernel = OclLib::createKernel(program, "progpow_search", &ret);
if (ret != CL_SUCCESS) {
OclLib::release(program);
return nullptr;
}
LOG_INFO("%s " YELLOW("KawPow") " program for period " WHITE_BOLD("%" PRIu64) " compiled " BLACK_BOLD("(%" PRIu64 "ms)"), Tags::opencl(), period, Chrono::steadyMSecs() - ts);
cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program);
cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program, kernel);
return program;
return kernel;
}
@ -382,7 +391,7 @@ public:
static KawPowBuilder builder;
cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize)
cl_kernel OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize)
{
const uint64_t period = height / KPHash::PERIOD_LENGTH;
@ -396,9 +405,9 @@ cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t wo
[](uv_work_t *req, int) { delete static_cast<KawPowBaton*>(req->data); }
);
cl_program program = cache.search(runner, period, worksize);
if (program) {
return program;
cl_kernel kernel = cache.search(runner, period, worksize);
if (kernel) {
return kernel;
}
return builder.build(runner, period, worksize);

View file

@ -30,7 +30,7 @@
#include <cstdint>
using cl_program = struct _cl_program *;
using cl_kernel = struct _cl_kernel *;
namespace xmrig {
@ -42,7 +42,7 @@ class IOclRunner;
class OclKawPow
{
public:
static cl_program get(const IOclRunner &runner, uint64_t height, uint32_t worksize);
static cl_kernel get(const IOclRunner &runner, uint64_t height, uint32_t worksize);
static void clear();
};

View file

@ -96,7 +96,7 @@ public:
inline bool isCN() const { auto f = family(); return f == CN || f == CN_LITE || f == CN_HEAVY || f == CN_PICO; }
inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; }
inline bool isValid() const { return m_id != INVALID; }
inline bool isValid() const { return m_id != INVALID && family() != UNKNOWN; }
inline const char *name() const { return name(false); }
inline const char *shortName() const { return name(true); }
inline Family family() const { return family(m_id); }

View file

@ -33,7 +33,7 @@
xmrig::String::String(const char *str) :
m_size(str == nullptr ? 0 : strlen(str))
{
if (m_size == 0) {
if (str == nullptr) {
return;
}

View file

@ -121,7 +121,7 @@ public:
for (int i = 0; i < Algorithm::MAX; ++i) {
const Algorithm algo(static_cast<Algorithm::Id>(i));
if (isEnabled(algo)) {
if (algo.isValid() && isEnabled(algo)) {
algorithms.push_back(algo);
}
}

View file

@ -260,17 +260,32 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3);
HASH_STATE(0);
HASH_STATE(1);
switch(softAes) {
case 0:
HASH_STATE(0);
HASH_STATE(1);
FILL_STATE(0);
FILL_STATE(1);
FILL_STATE(0);
FILL_STATE(1);
rx_prefetch_t0(prefetchPtr);
rx_prefetch_t0(prefetchPtr + 64);
rx_prefetch_t0(prefetchPtr);
rx_prefetch_t0(prefetchPtr + 64);
scratchpadPtr += 128;
prefetchPtr += 128;
scratchpadPtr += 128;
prefetchPtr += 128;
break;
default:
HASH_STATE(0);
FILL_STATE(0);
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
break;
}
}
prefetchPtr = (const char*) scratchpad;
scratchpadEnd += PREFETCH_DISTANCE;

View file

@ -39,40 +39,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/blake2/blake2.h"
#include "crypto/randomx/blake2/blake2-impl.h"
#if defined(_M_X64) || defined(__x86_64__)
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <smmintrin.h>
#include "blake2b-round.h"
#endif
static const uint64_t blake2b_IV[8] = {
const uint64_t blake2b_IV[8] = {
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
#if defined(_M_X64) || defined(__x86_64__)
static const uint8_t blake2b_sigma_sse41[12][16] = {
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
};
#endif
static const uint8_t blake2b_sigma[12][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
@ -207,46 +179,6 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t
return 0;
}
#if defined(_M_X64) || defined(__x86_64__)
static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
row2h = LOADU(&S->h[6]);
row3l = LOADU(&blake2b_IV[0]);
row3h = LOADU(&blake2b_IV[2]);
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
const uint64_t* m = (const uint64_t*)(block);
for (uint32_t r = 0; r < 12; ++r) {
ROUND(r);
}
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
}
#undef ROUND
#endif
static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) {
uint64_t m[16];
uint64_t v[16];
@ -305,9 +237,10 @@ static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block)
#undef ROUND
}
#if defined(_M_X64) || defined(__x86_64__)
#if defined(XMRIG_FEATURE_SSE4_1)
uint32_t rx_blake2b_use_sse41 = 0;
void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t* block);
#define rx_blake2b_compress(S, block) \
if (rx_blake2b_use_sse41) \

View file

@ -0,0 +1,108 @@
/*
* Copyright (c) 2018-2019, tevador <tevador@gmail.com>
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#if defined(_M_X64) || defined(__x86_64__)
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "crypto/randomx/blake2/blake2.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <smmintrin.h>
#include "blake2b-round.h"
extern const uint64_t blake2b_IV[8];
static const uint8_t blake2b_sigma_sse41[12][16] = {
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
};
void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
row2h = LOADU(&S->h[6]);
row3l = LOADU(&blake2b_IV[0]);
row3h = LOADU(&blake2b_IV[2]);
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
const uint64_t* m = (const uint64_t*)(block);
for (uint32_t r = 0; r < 12; ++r) {
ROUND(r);
}
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
}
#endif

View file

@ -337,12 +337,16 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
INST_HANDLE(FDIV_M, FMUL_R);
INST_HANDLE(FSQRT_R, FDIV_M);
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->jccErratum()) {
INST_HANDLE2(CBRANCH, CBRANCH<true>, FSQRT_R);
}
else {
INST_HANDLE2(CBRANCH, CBRANCH<false>, FSQRT_R);
}
#else
INST_HANDLE(CBRANCH, FSQRT_R);
#endif
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) {

View file

@ -31,7 +31,7 @@
#include "crypto/rx/RxVm.h"
#if defined(_M_X64) || defined(__x86_64__)
#if defined(XMRIG_FEATURE_SSE4_1)
extern "C" uint32_t rx_blake2b_use_sse41;
#endif
@ -60,7 +60,7 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
flags |= RANDOMX_FLAG_AMD;
}
# if defined(_M_X64) || defined(__x86_64__)
# if defined(XMRIG_FEATURE_SSE4_1)
rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0;
# endif

View file

@ -28,7 +28,7 @@
#define APP_ID "xmrig"
#define APP_NAME "XMRig"
#define APP_DESC "XMRig miner"
#define APP_VERSION "6.3.4"
#define APP_VERSION "6.3.5-dev"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"
@ -36,7 +36,7 @@
#define APP_VER_MAJOR 6
#define APP_VER_MINOR 3
#define APP_VER_PATCH 4
#define APP_VER_PATCH 5
#ifdef _MSC_VER
# if (_MSC_VER >= 1920)