Merge pull request #1780 from SChernykh/dev

Cryptonight OpenCL: fix for long input data
2024-10-30 13:07:46 +00:00 · 2020-07-16 19:56:20 +07:00 · 2020-07-16 19:56:20 +07:00 · ba80e27349
commit ba80e27349
parent e0eed7d5d6 bd8cf54a0b
6 changed files with 700 additions and 704 deletions
--- a/src/backend/opencl/cl/cn/cryptonight.cl
+++ b/src/backend/opencl/cl/cn/cryptonight.cl
@ -71,7 +71,7 @@ inline ulong getIdx()
 __attribute__((reqd_work_group_size(8, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
+__kernel void cn0(__global ulong *input, int inlen, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
 {
    uint ExpandedKey1[40];
    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
@ -109,34 +109,25 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
        if (get_local_id(1) == 0) {
            __local ulong* State = State_buf + get_local_id(0) * 25;
-            ((__local ulong8 *)State)[0] = vload8(0, input);
+            #pragma unroll
-            State[8]  = input[8];
+            for (int i = 0; i < 25; ++i) {
-            State[9]  = input[9];
+                State[i] = 0;
-            State[10] = input[10];
+            }
            State[11] = input[11];
            State[12] = input[12];
            State[13] = input[13];
            State[14] = input[14];
            State[15] = input[15];
            // Input length must be a multiple of 136 and padded on the host side
            for (int i = 0; inlen > 0; i += 17, inlen -= 136) {
                #pragma unroll
                for (int j = 0; j < 17; ++j) {
                    State[j] ^= input[i + j];
                }
                if (i == 0) {
                    ((__local uint *)State)[9]  &= 0x00FFFFFFU;
                    ((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
                    ((__local uint *)State)[10] &= 0xFF000000U;
            /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
             * handle get_global_id and get_global_offset as signed long long int and add
             * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
             * (even if it is correct casted to unsigned on the host)
             */
                    ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
            // Last bit of padding
            State[16] = 0x8000000000000000UL;
            for (int i = 17; i < 25; ++i) {
                State[i] = 0x00UL;
                }
                keccakf1600_2(State);
            }
            #pragma unroll 1
            for (int i = 0; i < 25; ++i) {
--- a/src/backend/opencl/cl/cn/cryptonight_cl.h
+++ b/src/backend/opencl/cl/cn/cryptonight_cl.h
--- a/src/backend/opencl/kernels/Cn0Kernel.cpp
+++ b/src/backend/opencl/kernels/Cn0Kernel.cpp
@ -38,10 +38,11 @@ void xmrig::Cn0Kernel::enqueue(cl_command_queue queue, uint32_t nonce, size_t th
 // __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
-void xmrig::Cn0Kernel::setArgs(cl_mem input, cl_mem scratchpads, cl_mem states, uint32_t threads)
+void xmrig::Cn0Kernel::setArgs(cl_mem input, int inlen, cl_mem scratchpads, cl_mem states, uint32_t threads)
 {
    setArg(0, sizeof(cl_mem), &input);
-    setArg(1, sizeof(cl_mem), &scratchpads);
+    setArg(1, sizeof(int), &inlen);
-    setArg(2, sizeof(cl_mem), &states);
+    setArg(2, sizeof(cl_mem), &scratchpads);
-    setArg(3, sizeof(uint32_t), &threads);
+    setArg(3, sizeof(cl_mem), &states);
    setArg(4, sizeof(uint32_t), &threads);
 }
--- a/src/backend/opencl/kernels/Cn0Kernel.h
+++ b/src/backend/opencl/kernels/Cn0Kernel.h
@ -38,7 +38,7 @@ public:
    inline Cn0Kernel(cl_program program) : OclKernel(program, "cn0") {}
    void enqueue(cl_command_queue queue, uint32_t nonce, size_t threads);
-    void setArgs(cl_mem input, cl_mem scratchpads, cl_mem states, uint32_t threads);
+    void setArgs(cl_mem input, int inlen, cl_mem scratchpads, cl_mem states, uint32_t threads);
 };
--- a/src/backend/opencl/runners/OclCnRunner.cpp
+++ b/src/backend/opencl/runners/OclCnRunner.cpp
@ -122,10 +122,16 @@ void xmrig::OclCnRunner::set(const Job &job, uint8_t *blob)
        throw std::length_error("job size too big");
    }
-    blob[job.size()] = 0x01;
+    const int inlen = static_cast<int>(job.size() + 136 - (job.size() % 136));
    memset(blob + job.size() + 1, 0, Job::kMaxBlobSize - job.size() - 1);
-    enqueueWriteBuffer(m_input, CL_TRUE, 0, Job::kMaxBlobSize, blob);
+    blob[job.size()] = 0x01;
    memset(blob + job.size() + 1, 0, inlen - job.size() - 1);
    blob[inlen - 1] |= 0x80;
    enqueueWriteBuffer(m_input, CL_TRUE, 0, inlen, blob);
    m_cn0->setArg(1, sizeof(int), &inlen);
    if (m_algorithm == Algorithm::CN_R && m_height != job.height()) {
        delete m_cn1;
@ -152,7 +158,7 @@ void xmrig::OclCnRunner::build()
    OclBaseRunner::build();
    m_cn0 = new Cn0Kernel(m_program);
-    m_cn0->setArgs(m_input, m_scratchpads, m_states, m_intensity);
+    m_cn0->setArgs(m_input, 0, m_scratchpads, m_states, m_intensity);
    m_cn2 = new Cn2Kernel(m_program);
    m_cn2->setArgs(m_scratchpads, m_states, m_branches, m_intensity);
--- a/src/base/net/stratum/Job.h
+++ b/src/base/net/stratum/Job.h
@ -46,7 +46,8 @@ public:
    // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk.
    // SECOR increase requirements for blob size: https://github.com/xmrig/xmrig/issues/913
    // Haven (XHV) offshore increases requirements by adding pricing_record struct (192 bytes) to block_header.
-    static constexpr const size_t kMaxBlobSize = 384;
+    // Round it up to 408 (136*3) for a convenient keccak calculation in OpenCL
    static constexpr const size_t kMaxBlobSize = 408;
    static constexpr const size_t kMaxSeedSize = 32;
    Job() = default;