* Restored all cn/1 based algorithms (cn/1, cn-lite/1, cn/rto, cn-heavy/tube)

XMRig 2019-09-02 18:30:13 +07:00
parent 28d1eaf8da
commit 235cda1051
6 changed files with 2461 additions and 2464 deletions

View file

@@ -139,8 +139,6 @@ public:
algo.l3() / 1024
);
workers.stop();
status.start(threads, algo.l3());
workers.start(threads);
}
@@ -310,6 +308,8 @@ void xmrig::CpuBackend::setJob(const Job &job)
return stop();
}
stop();
d_ptr->threads = std::move(threads);
d_ptr->start();
}

View file

@@ -143,8 +143,6 @@ public:
algo.l3() / 1024
);
workers.stop();
status.start(threads.size());
workers.start(threads);
}
@@ -277,6 +275,8 @@ void xmrig::OclBackend::setJob(const Job &job)
return stop();
}
stop();
d_ptr->threads = std::move(threads);
d_ptr->start();
}
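Both hunks above, in CpuBackend and OclBackend, make the same control-flow change to setJob(): when the generated thread configuration differs from the current one, the old workers are stopped first, and only then is the new configuration moved in and started. A schematic sketch of the pattern; the thread type is a placeholder for the real CpuLaunchData/OclLaunchData lists:

#include <utility>
#include <vector>

class Backend {
public:
    void setJob(std::vector<int> threads)    // e.g. one affinity per worker
    {
        if (threads == m_threads) {
            return stop();                   // config unchanged: just halt
        }

        stop();                              // join the old workers first,
        m_threads = std::move(threads);      // then swap in the new config,
        start();                             // and spawn workers against it
    }

private:
    void stop()  {}                          // join m_threads' workers
    void start() {}                          // spawn one worker per entry
    std::vector<int> m_threads;
};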

View file

@@ -67,9 +67,6 @@ inline ulong getIdx()
}
//#include "opencl/cryptonight_gpu.cl"
//XMRIG_INCLUDE_CN_GPU
#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
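mix_and_propagate() is the cross-lane diffusion step used by the memory explode/implode phases: each of the eight AES states is XORed with its ring neighbour, so a few applications spread every state's bits to all others. An illustrative host-side reading; the [8][4] uint32 layout here collapses the kernel's per-work-item __local uint4 xin[8][8] view, where get_local_id(1)/get_local_id(0) select row and lane:

#include <stdint.h>

void mix_and_propagate_ref(uint32_t xin[8][4], uint32_t mixed[8][4])
{
    for (int row = 0; row < 8; ++row)
        for (int lane = 0; lane < 4; ++lane)
            mixed[row][lane] = xin[row][lane] ^ xin[(row + 1) % 8][lane];
}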
@@ -212,269 +209,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
#define VARIANT1_1(p) \
uint table = 0x75310U; \
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0
#define VARIANT1_INIT() \
tweak1_2 = as_uint2(input[4]); \
tweak1_2.s0 >>= 24; \
tweak1_2.s0 |= tweak1_2.s1 << 8; \
tweak1_2.s1 = (uint) get_global_id(0); \
tweak1_2 ^= as_uint2(states[24])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
barrier(CLK_LOCAL_MEM_FENCE);
uint2 tweak1_2;
uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
# else
Scratchpad += gIdx;
# endif
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
#pragma unroll CN_UNROLL
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x);
Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
uint2 tweak1_2_0 = tweak1_2;
# if ALGO == ALGO_CN_RTO
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
# endif
VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
b_x = ((uint4 *)c)[0];
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
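Throughout these kernels, out-of-range work-items are fenced with an if (gIdx < Threads) block instead of an early return, as the repeated "do not use early return here" comments insist: barrier() has undefined behaviour unless every work-item in the work-group reaches it. A schematic kernel (not part of this diff) showing the rule:

__kernel void guarded(__global uint *out, uint Threads)
{
    const size_t gIdx = get_global_id(0);

    // if (gIdx >= Threads) return;   // WRONG: some items would skip the barrier

    if (gIdx < Threads) {
        out[gIdx] = (uint)gIdx;       // real work only for in-range items
    }

    barrier(CLK_GLOBAL_MEM_FENCE);    // must be reached by all work-items
}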
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v2(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
# if (ALGO == CRYPTONIGHT || ALGO == CRYPTONIGHT_PICO)
ulong a[2], b[4];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}
barrier(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if defined(__NV_CL_C_VERSION)
Scratchpad += gIdx * (ITERATIONS >> 2);
# else
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
# endif
a[0] = states[0] ^ states[4];
a[1] = states[1] ^ states[5];
b[0] = states[2] ^ states[6];
b[1] = states[3] ^ states[7];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
}
ulong2 bx0 = ((ulong2 *)b)[0];
ulong2 bx1 = ((ulong2 *)b)[1];
mem_fence(CLK_LOCAL_MEM_FENCE);
# ifdef __NV_CL_C_VERSION
__local uint16 scratchpad_line_buf[WORKSIZE];
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# else
# if (STRIDED_INDEX == 0)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
# elif (STRIDED_INDEX == 1)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
# elif (STRIDED_INDEX == 2)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
# endif
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
uint2 division_result = as_uint2(states[12]);
uint sqrt_result = as_uint2(states[13]).s0;
#pragma unroll CN_UNROLL
for(int i = 0; i < ITERATIONS; ++i)
{
# ifdef __NV_CL_C_VERSION
uint idx = a[0] & 0x1FFFC0;
uint idx1 = a[0] & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
uint idx = a[0] & MASK;
# endif
uint4 c = SCRATCHPAD_CHUNK(0);
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
idx = as_ulong2(c).s0 & 0x1FFFC0;
idx1 = as_ulong2(c).s0 & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
idx = as_ulong2(c).s0 & MASK;
# endif
uint4 tmp = SCRATCHPAD_CHUNK(0);
{
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
}
ulong2 t;
t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
t ^= chunk2;
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
a[1] += t.s1;
a[0] += t.s0;
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
# endif
((uint4 *)a)[0] ^= tmp;
bx1 = bx0;
bx0 = as_ulong2(c);
}
# undef SCRATCHPAD_CHUNK
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
# endif
}
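The two chunk1/chunk2/chunk3 blocks inside the loop above are the CNv2 shuffle: besides the 16 bytes being read, the other three 16-byte chunks of the same 64-byte line are permuted and offset. A host-side sketch that treats each chunk as a pair of independent 64-bit lanes, matching the ulong2 adds in the kernel:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } chunk_t;

static chunk_t add2(chunk_t x, chunk_t y)
{
    x.lo += y.lo;                          // two independent 64-bit adds,
    x.hi += y.hi;                          // no carry between the lanes
    return x;
}

void cnv2_shuffle(chunk_t line[4], chunk_t bx0, chunk_t bx1, chunk_t a)
{
    const chunk_t c1 = line[1], c2 = line[2], c3 = line[3];
    line[1] = add2(c3, bx1);               // chunk1 <- chunk3 + b1
    line[2] = add2(c1, bx0);               // chunk2 <- chunk1 + b0
    line[3] = add2(c2, a);                 // chunk3 <- chunk2 + a
}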
#if (ALGO_BASE == ALGO_CN_0)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
@@ -560,6 +295,280 @@ __kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ul
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#elif (ALGO_BASE == ALGO_CN_1)
#define VARIANT1_1(p) \
uint table = 0x75310U; \
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0
#define VARIANT1_INIT() \
tweak1_2 = as_uint2(input[4]); \
tweak1_2.s0 >>= 24; \
tweak1_2.s0 |= tweak1_2.s1 << 8; \
tweak1_2.s1 = (uint) get_global_id(0); \
tweak1_2 ^= as_uint2(states[24])
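The macros above carry the variant-1 tweaks: VARIANT1_INIT derives the 64-bit tweak1_2 from input word 4 and Keccak state word 24, and VARIANT1_1 is the table trick applied to the third 32-bit lane of b_x before it is stored back to the scratchpad. Transcribed to plain C for clarity; this is a direct reading of the macro, not new logic:

static inline uint32_t variant1_1(uint32_t s2)
{
    const uint32_t table = 0x75310U;                      // packed nibble table
    const uint32_t index = ((s2 >> 26) & 12) | ((s2 >> 23) & 2);
    return s2 ^ (((table >> index) & 0x30U) << 24);       // flips bits 28/29
}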
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
barrier(CLK_LOCAL_MEM_FENCE);
uint2 tweak1_2;
uint4 b_x;
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
# else
Scratchpad += gIdx;
# endif
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
{
# if (ALGO == ALGO_CN_HEAVY_TUBE)
uint idx0 = a[0];
# define IDX_0 idx0
# else
# define IDX_0 as_uint2(a[0]).s0
# endif
#pragma unroll CN_UNROLL
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((IDX_0 & MASK) >> 4)];
# if (ALGO == ALGO_CN_HEAVY_TUBE)
((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
# else
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
# endif
b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x);
Scratchpad[IDX((IDX_0 & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
uint2 tweak1_2_0 = tweak1_2;
# if (ALGO == ALGO_CN_RTO || ALGO == ALGO_CN_HEAVY_TUBE)
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
# endif
VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
# if (ALGO == ALGO_CN_HEAVY_TUBE)
idx0 = a[0];
# endif
b_x = ((uint4 *)c)[0];
# if (ALGO == ALGO_CN_HEAVY_TUBE)
{
const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q;
idx0 = as_int4(n).s2 ^ q;
}
# endif
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#undef IDX_0
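For ALGO_CN_HEAVY_TUBE the loop gains cn-heavy's extra scratchpad pass: a signed 64-by-32-bit division whose quotient is folded back into the scratchpad and into the next index. fast_div_heavy is the kernel's optimized form of this division; a host-side reading, with cell0/cell2 standing in for n.s0 and as_int4(n).s2:

#include <stdint.h>

uint32_t heavy_tweak(int64_t *cell0, int32_t cell2)
{
    const int64_t n = *cell0;
    const int32_t d = cell2 | 0x5;     // low bits forced: divisor never zero
    const int64_t q = n / d;           // what fast_div_heavy computes
    *cell0 = n ^ q;                    // quotient folded into memory
    return (uint32_t)(cell2 ^ q);      // feeds the next scratchpad index
}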
#elif (ALGO_BASE == ALGO_CN_2)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
ulong a[2], b[4];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}
barrier(CLK_LOCAL_MEM_FENCE);
{
states += 25 * gIdx;
# if defined(__NV_CL_C_VERSION)
Scratchpad += gIdx * (ITERATIONS >> 2);
# else
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
# endif
a[0] = states[0] ^ states[4];
a[1] = states[1] ^ states[5];
b[0] = states[2] ^ states[6];
b[1] = states[3] ^ states[7];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
}
ulong2 bx0 = ((ulong2 *)b)[0];
ulong2 bx1 = ((ulong2 *)b)[1];
mem_fence(CLK_LOCAL_MEM_FENCE);
# ifdef __NV_CL_C_VERSION
__local uint16 scratchpad_line_buf[WORKSIZE];
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# else
# if (STRIDED_INDEX == 0)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
# elif (STRIDED_INDEX == 1)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
# elif (STRIDED_INDEX == 2)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
# endif
# endif
{
uint2 division_result = as_uint2(states[12]);
uint sqrt_result = as_uint2(states[13]).s0;
#pragma unroll CN_UNROLL
for(int i = 0; i < ITERATIONS; ++i)
{
# ifdef __NV_CL_C_VERSION
uint idx = a[0] & 0x1FFFC0;
uint idx1 = a[0] & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
uint idx = a[0] & MASK;
# endif
uint4 c = SCRATCHPAD_CHUNK(0);
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
idx = as_ulong2(c).s0 & 0x1FFFC0;
idx1 = as_ulong2(c).s0 & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
idx = as_ulong2(c).s0 & MASK;
# endif
uint4 tmp = SCRATCHPAD_CHUNK(0);
{
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
}
ulong2 t;
t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
t ^= chunk2;
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
a[1] += t.s1;
a[0] += t.s0;
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
# endif
((uint4 *)a)[0] ^= tmp;
bx1 = bx0;
bx0 = as_ulong2(c);
}
# undef SCRATCHPAD_CHUNK
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#endif
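The division_result/sqrt_result pair in the CN_2 kernel implements CNv2's integer-math injection; fast_div_v2 and fast_sqrt_v2 are GPU-friendly approximations with exactness fixups. A slow but exact host-side reference, assuming the standard variant-2 definition (sqrt_result = floor(2*sqrt(2^64 + sqrt_input)) - 2^33):

#include <stdint.h>

typedef unsigned __int128 u128;

static uint64_t isqrt128(u128 x)       // exact floor(sqrt(x)), bit by bit
{
    u128 r = 0, bit = (u128)1 << 126;
    while (bit > x) bit >>= 2;
    while (bit) {
        if (x >= r + bit) { x -= r + bit; r = (r >> 1) + bit; }
        else              { r >>= 1; }
        bit >>= 2;
    }
    return (uint64_t)r;
}

// One integer-math step; c0/c1 are the low/high 64-bit halves of the AES
// output, division_result/sqrt_result persist across loop iterations.
void cnv2_math(uint64_t c0, uint64_t c1,
               uint64_t *division_result, uint32_t *sqrt_result)
{
    const uint32_t divisor = ((uint32_t)c0 + (*sqrt_result << 1)) | 0x80000001U;
    *division_result = (uint32_t)(c1 / divisor)
                     + ((uint64_t)(c1 % divisor) << 32);

    const uint64_t sqrt_input = c0 + *division_result;
    // floor(2*sqrt(x)) == floor(sqrt(4*x)), so one 128-bit isqrt suffices
    *sqrt_result = (uint32_t)(isqrt128((((u128)1 << 64) + sqrt_input) << 2)
                              - ((uint64_t)1 << 33));
}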
__attribute__((reqd_work_group_size(8, 8, 1)))
@@ -581,10 +590,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
@@ -626,10 +631,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
*xin2_store = (uint4)(0, 0, 0, 0);
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
#pragma unroll 2
@@ -691,10 +692,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
}
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
vstore2(as_ulong2(text), get_local_id(1) + 4, states);
}
@@ -703,10 +700,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
__local ulong State_buf[8 * 25];
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
if(!get_local_id(1))
{

File diff suppressed because it is too large.

View file

@@ -114,6 +114,7 @@ static const __constant uint AES0_C[256] =
#define BYTE(x, y) (xmrig_amd_bfe((x), (y) << 3U, 8U))
#if (ALGO == ALGO_CN_HEAVY_TUBE)
inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 x, uint4 k)
{
x = ~x;
@@ -126,6 +127,7 @@ inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AE
k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ rotate(AES0[BYTE(x.s1, 2)] ^ AES1[BYTE(x.s2, 3)], 16U);
return k;
}
#endif
uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{

View file

@@ -76,6 +76,7 @@ xmrig::OclCnRunner::OclCnRunner(size_t index, const OclLaunchData &data) : OclBa
m_options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(1u << data.thread.memChunk()) + "U";
m_options += " -DMEMORY=" + std::to_string(m_algorithm.l3()) + "LU";
m_options += " -DALGO=" + std::to_string(m_algorithm.id());
m_options += " -DALGO_BASE=" + std::to_string(CnAlgo<>::base(m_algorithm));
m_options += " -DALGO_FAMILY=" + std::to_string(m_algorithm.family());
m_options += " -DCN_UNROLL=" + std::to_string(data.thread.unrollFactor());