* Restored all cn/1 based algorithms (cn/1, cn-lite/1, cn/rto, cn-heavy/tube)

XMRig 2019-09-02 18:30:13 +07:00
parent 28d1eaf8da
commit 235cda1051
6 changed files with 2461 additions and 2464 deletions

View file

@@ -139,8 +139,6 @@ public:
algo.l3() / 1024
);
workers.stop();
status.start(threads, algo.l3());
workers.start(threads);
}
@@ -310,6 +308,8 @@ void xmrig::CpuBackend::setJob(const Job &job)
return stop();
}
stop();
d_ptr->threads = std::move(threads);
d_ptr->start();
}

View file

@@ -143,8 +143,6 @@ public:
algo.l3() / 1024
);
workers.stop();
status.start(threads.size());
workers.start(threads);
}
@@ -277,6 +275,8 @@ void xmrig::OclBackend::setJob(const Job &job)
return stop();
}
stop();
d_ptr->threads = std::move(threads);
d_ptr->start();
}
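Both hunks above, in CpuBackend and OclBackend, make the same control-flow change to setJob(): when the generated thread configuration differs from the current one, the old workers are stopped first, and only then is the new configuration moved in and started. A schematic sketch of the pattern; the thread type is a placeholder for the real CpuLaunchData/OclLaunchData lists:

#include <utility>
#include <vector>

class Backend {
public:
    void setJob(std::vector<int> threads)    // e.g. one affinity per worker
    {
        if (threads == m_threads) {
            return stop();                   // config unchanged: just halt
        }

        stop();                              // join the old workers first,
        m_threads = std::move(threads);      // then swap in the new config,
        start();                             // and spawn workers against it
    }

private:
    void stop()  {}                          // join m_threads' workers
    void start() {}                          // spawn one worker per entry
    std::vector<int> m_threads;
};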

View file

@@ -67,9 +67,6 @@ inline ulong getIdx()
}
//#include "opencl/cryptonight_gpu.cl"
//XMRIG_INCLUDE_CN_GPU
#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
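mix_and_propagate() is the cross-lane diffusion step used by the memory explode/implode phases: each of the eight AES states is XORed with its ring neighbour, so a few applications spread every state's bits to all others. An illustrative host-side reading; the [8][4] uint32 layout here collapses the kernel's per-work-item __local uint4 xin[8][8] view, where get_local_id(1)/get_local_id(0) select row and lane:

#include <stdint.h>

void mix_and_propagate_ref(uint32_t xin[8][4], uint32_t mixed[8][4])
{
    for (int row = 0; row < 8; ++row)
        for (int lane = 0; lane < 4; ++lane)
            mixed[row][lane] = xin[row][lane] ^ xin[(row + 1) % 8][lane];
}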
@@ -212,269 +209,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
#define VARIANT1_1(p) \
uint table = 0x75310U; \
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0
#define VARIANT1_INIT() \
tweak1_2 = as_uint2(input[4]); \
tweak1_2.s0 >>= 24; \
tweak1_2.s0 |= tweak1_2.s1 << 8; \
tweak1_2.s1 = (uint) get_global_id(0); \
tweak1_2 ^= as_uint2(states[24])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
barrier(CLK_LOCAL_MEM_FENCE);
uint2 tweak1_2;
uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
# else
Scratchpad += gIdx;
# endif
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
#pragma unroll CN_UNROLL
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x);
Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
uint2 tweak1_2_0 = tweak1_2;
# if ALGO == ALGO_CN_RTO
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
# endif
VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
b_x = ((uint4 *)c)[0];
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
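Throughout these kernels, out-of-range work-items are fenced with an if (gIdx < Threads) block instead of an early return, as the repeated "do not use early return here" comments insist: barrier() has undefined behaviour unless every work-item in the work-group reaches it. A schematic kernel (not part of this diff) showing the rule:

__kernel void guarded(__global uint *out, uint Threads)
{
    const size_t gIdx = get_global_id(0);

    // if (gIdx >= Threads) return;   // WRONG: some items would skip the barrier

    if (gIdx < Threads) {
        out[gIdx] = (uint)gIdx;       // real work only for in-range items
    }

    barrier(CLK_GLOBAL_MEM_FENCE);    // must be reached by all work-items
}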
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v2(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
# if (ALGO == CRYPTONIGHT || ALGO == CRYPTONIGHT_PICO)
ulong a[2], b[4];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}
barrier(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if defined(__NV_CL_C_VERSION)
Scratchpad += gIdx * (ITERATIONS >> 2);
# else
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
# endif
a[0] = states[0] ^ states[4];
a[1] = states[1] ^ states[5];
b[0] = states[2] ^ states[6];
b[1] = states[3] ^ states[7];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
}
ulong2 bx0 = ((ulong2 *)b)[0];
ulong2 bx1 = ((ulong2 *)b)[1];
mem_fence(CLK_LOCAL_MEM_FENCE);
# ifdef __NV_CL_C_VERSION
__local uint16 scratchpad_line_buf[WORKSIZE];
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# else
# if (STRIDED_INDEX == 0)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
# elif (STRIDED_INDEX == 1)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
# elif (STRIDED_INDEX == 2)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
# endif
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
uint2 division_result = as_uint2(states[12]);
uint sqrt_result = as_uint2(states[13]).s0;
#pragma unroll CN_UNROLL
for(int i = 0; i < ITERATIONS; ++i)
{
# ifdef __NV_CL_C_VERSION
uint idx = a[0] & 0x1FFFC0;
uint idx1 = a[0] & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
uint idx = a[0] & MASK;
# endif
uint4 c = SCRATCHPAD_CHUNK(0);
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
idx = as_ulong2(c).s0 & 0x1FFFC0;
idx1 = as_ulong2(c).s0 & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
idx = as_ulong2(c).s0 & MASK;
# endif
uint4 tmp = SCRATCHPAD_CHUNK(0);
{
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
}
ulong2 t;
t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
t ^= chunk2;
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
a[1] += t.s1;
a[0] += t.s0;
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
# endif
((uint4 *)a)[0] ^= tmp;
bx1 = bx0;
bx0 = as_ulong2(c);
}
# undef SCRATCHPAD_CHUNK
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
# endif
}
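The two chunk1/chunk2/chunk3 blocks inside the loop above are the CNv2 shuffle: besides the 16 bytes being read, the other three 16-byte chunks of the same 64-byte line are permuted and offset. A host-side sketch that treats each chunk as a pair of independent 64-bit lanes, matching the ulong2 adds in the kernel:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } chunk_t;

static chunk_t add2(chunk_t x, chunk_t y)
{
    x.lo += y.lo;                          // two independent 64-bit adds,
    x.hi += y.hi;                          // no carry between the lanes
    return x;
}

void cnv2_shuffle(chunk_t line[4], chunk_t bx0, chunk_t bx1, chunk_t a)
{
    const chunk_t c1 = line[1], c2 = line[2], c3 = line[3];
    line[1] = add2(c3, bx1);               // chunk1 <- chunk3 + b1
    line[2] = add2(c1, bx0);               // chunk2 <- chunk1 + b0
    line[3] = add2(c2, a);                 // chunk3 <- chunk2 + a
}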
#if (ALGO_BASE == ALGO_CN_0)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
@@ -560,6 +295,280 @@ __kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ul
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#elif (ALGO_BASE == ALGO_CN_1)
#define VARIANT1_1(p) \
uint table = 0x75310U; \
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0
#define VARIANT1_INIT() \
tweak1_2 = as_uint2(input[4]); \
tweak1_2.s0 >>= 24; \
tweak1_2.s0 |= tweak1_2.s1 << 8; \
tweak1_2.s1 = (uint) get_global_id(0); \
tweak1_2 ^= as_uint2(states[24])
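The macros above carry the variant-1 tweaks: VARIANT1_INIT derives the 64-bit tweak1_2 from input word 4 and Keccak state word 24, and VARIANT1_1 is the table trick applied to the third 32-bit lane of b_x before it is stored back to the scratchpad. Transcribed to plain C for clarity; this is a direct reading of the macro, not new logic:

static inline uint32_t variant1_1(uint32_t s2)
{
    const uint32_t table = 0x75310U;                      // packed nibble table
    const uint32_t index = ((s2 >> 26) & 12) | ((s2 >> 23) & 2);
    return s2 ^ (((table >> index) & 0x30U) << 24);       // flips bits 28/29
}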
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
barrier(CLK_LOCAL_MEM_FENCE);
uint2 tweak1_2;
uint4 b_x;
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
# else
Scratchpad += gIdx;
# endif
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
{
# if (ALGO == ALGO_CN_HEAVY_TUBE)
uint idx0 = a[0];
# define IDX_0 idx0
# else
# define IDX_0 as_uint2(a[0]).s0
# endif
#pragma unroll CN_UNROLL
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((IDX_0 & MASK) >> 4)];
# if (ALGO == ALGO_CN_HEAVY_TUBE)
((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
# else
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
# endif
b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x);
Scratchpad[IDX((IDX_0 & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
uint2 tweak1_2_0 = tweak1_2;
# if (ALGO == ALGO_CN_RTO || ALGO == ALGO_CN_HEAVY_TUBE)
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
# endif
VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
# if (ALGO == ALGO_CN_HEAVY_TUBE)
idx0 = a[0];
# endif
b_x = ((uint4 *)c)[0];
# if (ALGO == ALGO_CN_HEAVY_TUBE)
{
const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q;
idx0 = as_int4(n).s2 ^ q;
}
# endif
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#undef IDX_0
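For ALGO_CN_HEAVY_TUBE the loop gains cn-heavy's extra scratchpad pass: a signed 64-by-32-bit division whose quotient is folded back into the scratchpad and into the next index. fast_div_heavy is the kernel's optimized form of this division; a host-side reading, with cell0/cell2 standing in for n.s0 and as_int4(n).s2:

#include <stdint.h>

uint32_t heavy_tweak(int64_t *cell0, int32_t cell2)
{
    const int64_t n = *cell0;
    const int32_t d = cell2 | 0x5;     // low bits forced: divisor never zero
    const int64_t q = n / d;           // what fast_div_heavy computes
    *cell0 = n ^ q;                    // quotient folded into memory
    return (uint32_t)(cell2 ^ q);      // feeds the next scratchpad index
}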
#elif (ALGO_BASE == ALGO_CN_2)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{
ulong a[2], b[4];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}
barrier(CLK_LOCAL_MEM_FENCE);
{
states += 25 * gIdx;
# if defined(__NV_CL_C_VERSION)
Scratchpad += gIdx * (ITERATIONS >> 2);
# else
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
# endif
a[0] = states[0] ^ states[4];
a[1] = states[1] ^ states[5];
b[0] = states[2] ^ states[6];
b[1] = states[3] ^ states[7];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
}
ulong2 bx0 = ((ulong2 *)b)[0];
ulong2 bx1 = ((ulong2 *)b)[1];
mem_fence(CLK_LOCAL_MEM_FENCE);
# ifdef __NV_CL_C_VERSION
__local uint16 scratchpad_line_buf[WORKSIZE];
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# else
# if (STRIDED_INDEX == 0)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
# elif (STRIDED_INDEX == 1)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
# elif (STRIDED_INDEX == 2)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
# endif
# endif
{
uint2 division_result = as_uint2(states[12]);
uint sqrt_result = as_uint2(states[13]).s0;
#pragma unroll CN_UNROLL
for(int i = 0; i < ITERATIONS; ++i)
{
# ifdef __NV_CL_C_VERSION
uint idx = a[0] & 0x1FFFC0;
uint idx1 = a[0] & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
uint idx = a[0] & MASK;
# endif
uint4 c = SCRATCHPAD_CHUNK(0);
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
idx = as_ulong2(c).s0 & 0x1FFFC0;
idx1 = as_ulong2(c).s0 & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
idx = as_ulong2(c).s0 & MASK;
# endif
uint4 tmp = SCRATCHPAD_CHUNK(0);
{
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
}
ulong2 t;
t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
t ^= chunk2;
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
a[1] += t.s1;
a[0] += t.s0;
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
# endif
((uint4 *)a)[0] ^= tmp;
bx1 = bx0;
bx0 = as_ulong2(c);
}
# undef SCRATCHPAD_CHUNK
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#endif
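The division_result/sqrt_result pair in the CN_2 kernel implements CNv2's integer-math injection; fast_div_v2 and fast_sqrt_v2 are GPU-friendly approximations with exactness fixups. A slow but exact host-side reference, assuming the standard variant-2 definition (sqrt_result = floor(2*sqrt(2^64 + sqrt_input)) - 2^33):

#include <stdint.h>

typedef unsigned __int128 u128;

static uint64_t isqrt128(u128 x)       // exact floor(sqrt(x)), bit by bit
{
    u128 r = 0, bit = (u128)1 << 126;
    while (bit > x) bit >>= 2;
    while (bit) {
        if (x >= r + bit) { x -= r + bit; r = (r >> 1) + bit; }
        else              { r >>= 1; }
        bit >>= 2;
    }
    return (uint64_t)r;
}

// One integer-math step; c0/c1 are the low/high 64-bit halves of the AES
// output, division_result/sqrt_result persist across loop iterations.
void cnv2_math(uint64_t c0, uint64_t c1,
               uint64_t *division_result, uint32_t *sqrt_result)
{
    const uint32_t divisor = ((uint32_t)c0 + (*sqrt_result << 1)) | 0x80000001U;
    *division_result = (uint32_t)(c1 / divisor)
                     + ((uint64_t)(c1 % divisor) << 32);

    const uint64_t sqrt_input = c0 + *division_result;
    // floor(2*sqrt(x)) == floor(sqrt(4*x)), so one 128-bit isqrt suffices
    *sqrt_result = (uint32_t)(isqrt128((((u128)1 << 64) + sqrt_input) << 2)
                              - ((uint64_t)1 << 33));
}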
__attribute__((reqd_work_group_size(8, 8, 1)))
@@ -581,10 +590,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
@@ -626,10 +631,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
*xin2_store = (uint4)(0, 0, 0, 0);
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
#pragma unroll 2
@@ -691,10 +692,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
}
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
vstore2(as_ulong2(text), get_local_id(1) + 4, states);
}
@@ -703,10 +700,6 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
__local ulong State_buf[8 * 25];
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
if(!get_local_id(1))
{

File diff suppressed because it is too large.

View file

@@ -114,6 +114,7 @@ static const __constant uint AES0_C[256] =
#define BYTE(x, y) (xmrig_amd_bfe((x), (y) << 3U, 8U))
#if (ALGO == ALGO_CN_HEAVY_TUBE)
inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 x, uint4 k)
{
x = ~x;
@@ -126,6 +127,7 @@ inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AE
k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ rotate(AES0[BYTE(x.s1, 2)] ^ AES1[BYTE(x.s2, 3)], 16U);
return k;
}
#endif
uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{

View file

@@ -76,6 +76,7 @@ xmrig::OclCnRunner::OclCnRunner(size_t index, const OclLaunchData &data) : OclBa
m_options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(1u << data.thread.memChunk()) + "U";
m_options += " -DMEMORY=" + std::to_string(m_algorithm.l3()) + "LU";
m_options += " -DALGO=" + std::to_string(m_algorithm.id());
m_options += " -DALGO_BASE=" + std::to_string(CnAlgo<>::base(m_algorithm));
m_options += " -DALGO_FAMILY=" + std::to_string(m_algorithm.family());
m_options += " -DCN_UNROLL=" + std::to_string(data.thread.unrollFactor());