mirror of
https://github.com/xmrig/xmrig.git
synced 2024-12-22 03:29:32 +00:00
Use external script to prepare OpenCL source.
This commit is contained in:
parent
82696000e4
commit
fcfb738ded
17 changed files with 2948 additions and 693 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,2 +1,4 @@
|
|||
/build
|
||||
/CMakeLists.txt.user
|
||||
/.idea
|
||||
/src/backend/opencl/cl/cn/cryptonight_gen.cl
|
||||
|
|
23
package.json
Normal file
23
package.json
Normal file
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"name": "xmrig",
|
||||
"version": "3.0.0",
|
||||
"description": "RandomX, CryptoNight and Argon2 miner",
|
||||
"main": "index.js",
|
||||
"directories": {
|
||||
"doc": "doc"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "node scripts/generate_cl.js"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/xmrig/xmrig.git"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "GPLv3",
|
||||
"bugs": {
|
||||
"url": "https://github.com/xmrig/xmrig/issues"
|
||||
},
|
||||
"homepage": "https://github.com/xmrig/xmrig#readme"
|
||||
}
|
32
scripts/generate_cl.js
Normal file
32
scripts/generate_cl.js
Normal file
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
'use strict';
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { text2h, addIncludes } = require('./js/opencl');
|
||||
const cwd = process.cwd();
|
||||
|
||||
|
||||
/**
 * Build the embedded CryptoNight OpenCL header.
 *
 * Reads `cryptonight.cl` from `src/backend/opencl/cl/cn` (relative to the
 * directory the script was started in), splices the listed `.cl` files in
 * place of their `#include` directives and writes the merged text as a C++
 * character array to `cryptonight_cl.h` in that same directory.
 *
 * Side effect: the process working directory is left pointing at the
 * OpenCL source directory.
 */
function cn()
{
    const includes = [
        'algorithm.cl',
        'wolf-aes.cl',
        'wolf-skein.cl',
        'jh.cl',
        'blake256.cl',
        'groestl256.cl',
        'fast_int_math_v2.cl',
        'fast_div_heavy.cl'
    ];

    // Go back to the launch directory first so the relative path below
    // resolves the same way no matter where a previous step left us.
    process.chdir(cwd);
    process.chdir(path.resolve('src/backend/opencl/cl/cn'));

    const source = addIncludes('cryptonight.cl', includes);
    const header = text2h(source, 'xmrig', 'cryptonight_cl');

    // Debugging aid: write `source` to 'cryptonight_gen.cl' to inspect the merged kernel text.
    fs.writeFileSync('cryptonight_cl.h', header);
}
|
||||
|
||||
|
||||
cn();
|
72
scripts/js/opencl.js
Normal file
72
scripts/js/opencl.js
Normal file
|
@ -0,0 +1,72 @@
|
|||
'use strict';
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
|
||||
/**
 * Render raw bytes as a C++ header that defines
 * `static unsigned char <name>[<size>]` inside `namespace <namespace>`.
 *
 * Bytes are written as two-digit lowercase hex literals, 32 per row.
 *
 * @param {Uint8Array} buf - Bytes to embed (a Node.js Buffer or any other Uint8Array).
 * @param {string} namespace - C++ namespace that wraps the array.
 * @param {string} name - Identifier of the generated array.
 * @returns {string} Complete header text, starting with `#pragma once`.
 */
function bin2h(buf, namespace, name)
{
    const BYTES_PER_ROW = 32;
    const size = buf.byteLength;
    const rows = [];

    for (let offset = 0; offset < size; offset += BYTES_PER_ROW) {
        const literals = Array.from(
            buf.subarray(offset, offset + BYTES_PER_ROW),
            (byte) => `0x${byte.toString(16).padStart(2, '0')}`
        );

        rows.push(literals.join(','));
    }

    // Rows are joined rather than terminated, so a size that is an exact
    // multiple of 32 no longer leaves a whitespace-only line before `};`.
    const body = rows.join(',\n ');

    return `#pragma once\n\nnamespace ${namespace} {\n\nstatic unsigned char ${name}[${size}] = {\n ${body}\n};\n\n} // namespace ${namespace}\n`;
}
|
||||
|
||||
|
||||
/**
 * Render text as a C++ header that defines a NUL-terminated
 * `static char <name>[<byteLength + 1>]` inside `namespace <namespace>`.
 *
 * The text is encoded with `Buffer.from` (UTF-8 for strings) and written as
 * two-digit lowercase hex literals, 32 per row, followed by a final `0x00`.
 *
 * @param {string|Uint8Array} text - Text (or already encoded bytes) to embed.
 * @param {string} namespace - C++ namespace that wraps the array.
 * @param {string} name - Identifier of the generated array.
 * @returns {string} Complete header text, starting with `#pragma once`.
 */
function text2h(text, namespace, name)
{
    const BYTES_PER_ROW = 32;
    const buf = Buffer.from(text);
    const size = buf.byteLength;

    let out = `#pragma once\n\nnamespace ${namespace} {\n\nstatic char ${name}[${size + 1}] = {\n `;

    buf.forEach((byte, index) => {
        const hex = `0x${byte.toString(16).padStart(2, '0')}`;

        // A bare literal above 0x7f does not fit a signed `char`, which makes
        // the brace initializer an ill-formed narrowing conversion in C++11.
        // Non-ASCII bytes therefore go through an explicit cast.
        out += byte > 0x7f ? `static_cast<char>(${hex}),` : `${hex},`;

        if ((index + 1) % BYTES_PER_ROW === 0) {
            out += '\n ';
        }
    });

    // Terminator, so the array can be used directly as a C string.
    out += '0x00';
    out += `\n};\n\n} // namespace ${namespace}\n`;

    return out;
}
|
||||
|
||||
|
||||
/**
 * Replace the first `#include "<name>"` directive in `input` with the
 * contents of the file `name`.
 *
 * The file is read (as UTF-8) relative to the current working directory,
 * whether or not the directive is present. If it is absent, `input` is
 * returned unchanged.
 *
 * @param {string} input - Source text that may contain the directive.
 * @param {string} name - File name exactly as written between the quotes.
 * @returns {string} `input` with the directive replaced by the file contents.
 * @throws {Error} If the file cannot be read.
 */
function addInclude(input, name)
{
    const contents = fs.readFileSync(name, 'utf8');

    // A replacer function inserts the text verbatim. A plain string
    // replacement would expand `$&`, `$$`, `$1`, ... found in the file.
    return input.replace(`#include "${name}"`, () => contents);
}
|
||||
|
||||
|
||||
/**
 * Read `inputFileName` and inline every listed include, in order.
 *
 * Each name is handed to `addInclude`, so a later file can resolve a
 * directive that an earlier file brought in.
 *
 * @param {string} inputFileName - File to start from (read as UTF-8).
 * @param {Iterable<string>} names - Include file names, processed in order.
 * @returns {string} The source text with the includes spliced in.
 */
function addIncludes(inputFileName, names)
{
    const source = fs.readFileSync(inputFileName, 'utf8');

    return [...names].reduce((merged, includeName) => addInclude(merged, includeName), source);
}
|
||||
|
||||
|
||||
// Public helpers of this module; attached to the existing exports object.
Object.assign(module.exports, { bin2h, text2h, addInclude, addIncludes });
|
|
@ -29,7 +29,6 @@
|
|||
#include "backend/common/Hashrate.h"
|
||||
#include "backend/common/interfaces/IWorker.h"
|
||||
#include "backend/common/Workers.h"
|
||||
#include "backend/opencl/cl/OclSource.h"
|
||||
#include "backend/opencl/OclBackend.h"
|
||||
#include "backend/opencl/OclConfig.h"
|
||||
#include "backend/opencl/OclLaunchData.h"
|
||||
|
@ -96,8 +95,6 @@ public:
|
|||
inline OclBackendPrivate(Controller *controller) :
|
||||
controller(controller)
|
||||
{
|
||||
OclSource::init();
|
||||
|
||||
init(controller->config()->cl());
|
||||
}
|
||||
|
||||
|
|
|
@ -23,75 +23,16 @@
|
|||
*/
|
||||
|
||||
|
||||
#include <string>
|
||||
#include <regex>
|
||||
|
||||
|
||||
#include "backend/opencl/cl/cn/cryptonight_cl.h"
|
||||
#include "backend/opencl/cl/OclSource.h"
|
||||
#include "crypto/common/Algorithm.h"
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
static std::string cn_source;
|
||||
|
||||
|
||||
} // namespace xmrig
|
||||
|
||||
|
||||
|
||||
const char *xmrig::OclSource::get(const Algorithm &algorithm)
|
||||
{
|
||||
if (algorithm.family() == Algorithm::RANDOM_X) {
|
||||
return nullptr; // FIXME
|
||||
}
|
||||
|
||||
return cn_source.c_str();
|
||||
}
|
||||
|
||||
|
||||
void xmrig::OclSource::init()
|
||||
{
|
||||
const char *cryptonightCL =
|
||||
#include "./cn/cryptonight.cl"
|
||||
;
|
||||
const char *cryptonightCL2 =
|
||||
#include "./cn/cryptonight2.cl"
|
||||
;
|
||||
const char *blake256CL =
|
||||
#include "./cn/blake256.cl"
|
||||
;
|
||||
const char *groestl256CL =
|
||||
#include "./cn/groestl256.cl"
|
||||
;
|
||||
const char *jhCL =
|
||||
#include "./cn/jh.cl"
|
||||
;
|
||||
const char *wolfAesCL =
|
||||
#include "./cn/wolf-aes.cl"
|
||||
;
|
||||
const char *wolfSkeinCL =
|
||||
#include "./cn/wolf-skein.cl"
|
||||
;
|
||||
const char *fastIntMathV2CL =
|
||||
#include "./cn/fast_int_math_v2.cl"
|
||||
;
|
||||
const char *fastDivHeavyCL =
|
||||
#include "./cn/fast_div_heavy.cl"
|
||||
;
|
||||
const char *cryptonight_gpu =
|
||||
#include "./cn/cryptonight_gpu.cl"
|
||||
;
|
||||
|
||||
cn_source.append(cryptonightCL);
|
||||
cn_source.append(cryptonightCL2);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_JH"), jhCL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
|
||||
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_CN_GPU"), cryptonight_gpu);
|
||||
return cryptonight_cl;
|
||||
}
|
||||
|
|
27
src/backend/opencl/cl/cn/algorithm.cl
Normal file
27
src/backend/opencl/cl/cn/algorithm.cl
Normal file
|
@ -0,0 +1,27 @@
|
|||
/*
 * Algorithm identifiers for the CryptoNight OpenCL sources; cryptonight.cl
 * includes this file ahead of its other helper sources.
 *
 * ALGO_INVALID is pinned to -1, so the enumerators that follow count up from
 * 0 and ALGO_MAX ends up equal to the number of algorithms listed.
 *
 * NOTE(review): cryptonight.cl compares ALGO against these names inside
 * preprocessor conditionals (e.g. "# if ALGO == ALGO_CN_RTO"). Enumerators are
 * not visible to the preprocessor, which evaluates unknown identifiers as 0 -
 * confirm whether these values must also be supplied as macros.
 * NOTE(review): presumably mirrors the host-side algorithm ids - verify.
 */
enum Algorithm {
|
||||
ALGO_INVALID = -1,
|
||||
ALGO_CN_0, // "cn/0" CryptoNight (original).
|
||||
ALGO_CN_1, // "cn/1" CryptoNight variant 1 also known as Monero7 and CryptoNightV7.
|
||||
ALGO_CN_2, // "cn/2" CryptoNight variant 2.
|
||||
ALGO_CN_R, // "cn/r" CryptoNightR (Monero's variant 4).
|
||||
ALGO_CN_FAST, // "cn/fast" CryptoNight variant 1 with half iterations.
|
||||
ALGO_CN_HALF, // "cn/half" CryptoNight variant 2 with half iterations (Masari/Torque).
|
||||
ALGO_CN_XAO, // "cn/xao" CryptoNight variant 0 (modified, Alloy only).
|
||||
ALGO_CN_RTO, // "cn/rto" CryptoNight variant 1 (modified, Arto only).
|
||||
ALGO_CN_RWZ, // "cn/rwz" CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft).
|
||||
ALGO_CN_ZLS, // "cn/zls" CryptoNight variant 2 with 3/4 iterations (Zelerius).
|
||||
ALGO_CN_DOUBLE, // "cn/double" CryptoNight variant 2 with double iterations (X-CASH).
|
||||
ALGO_CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo).
|
||||
ALGO_CN_LITE_0, // "cn-lite/0" CryptoNight-Lite variant 0.
|
||||
ALGO_CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1.
|
||||
ALGO_CN_HEAVY_0, // "cn-heavy/0" CryptoNight-Heavy (4 MB).
|
||||
ALGO_CN_HEAVY_TUBE, // "cn-heavy/tube" CryptoNight-Heavy (modified, TUBE only).
|
||||
ALGO_CN_HEAVY_XHV, // "cn-heavy/xhv" CryptoNight-Heavy (modified, Haven Protocol only).
|
||||
ALGO_CN_PICO_0, // "cn-pico" CryptoNight Turtle (TRTL)
|
||||
ALGO_RX_0, // "rx/0" RandomX (reference configuration).
|
||||
ALGO_RX_WOW, // "rx/wow" RandomWOW (Wownero).
|
||||
ALGO_RX_LOKI, // "rx/loki" RandomXL (Loki).
|
||||
ALGO_AR2_CHUKWA, // "argon2/chukwa" Argon2id (Chukwa).
|
||||
ALGO_AR2_WRKZ, // "argon2/wrkz" Argon2id (WRKZ)
|
||||
ALGO_MAX
|
||||
};
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
/*
|
||||
* blake256 kernel implementation.
|
||||
*
|
||||
|
@ -90,4 +89,3 @@ __constant static const sph_u32 c_u256[16] = {
|
|||
v[b] ^= v[c]; \
|
||||
v[b] = rotate(v[b], 25U); \
|
||||
}
|
||||
)==="
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
/*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -19,39 +18,16 @@ R"===(
|
|||
# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
|
||||
#endif
|
||||
|
||||
//#include "opencl/wolf-aes.cl"
|
||||
XMRIG_INCLUDE_WOLF_AES
|
||||
//#include "opencl/wolf-skein.cl"
|
||||
XMRIG_INCLUDE_WOLF_SKEIN
|
||||
//#include "opencl/jh.cl"
|
||||
XMRIG_INCLUDE_JH
|
||||
//#include "opencl/blake256.cl"
|
||||
XMRIG_INCLUDE_BLAKE256
|
||||
//#include "opencl/groestl256.cl"
|
||||
XMRIG_INCLUDE_GROESTL256
|
||||
//#include "fast_int_math_v2.cl"
|
||||
XMRIG_INCLUDE_FAST_INT_MATH_V2
|
||||
//#include "fast_div_heavy.cl"
|
||||
XMRIG_INCLUDE_FAST_DIV_HEAVY
|
||||
|
||||
#include "algorithm.cl"
|
||||
#include "wolf-aes.cl"
|
||||
#include "wolf-skein.cl"
|
||||
#include "jh.cl"
|
||||
#include "blake256.cl"
|
||||
#include "groestl256.cl"
|
||||
#include "fast_int_math_v2.cl"
|
||||
#include "fast_div_heavy.cl"
|
||||
|
||||
#define VARIANT_0 0 // Original CryptoNight or CryptoNight-Heavy
|
||||
#define VARIANT_1 1 // CryptoNight variant 1 also known as Monero7 and CryptoNightV7
|
||||
#define VARIANT_TUBE 2 // Modified CryptoNight Lite variant 1 with XOR (IPBC/TUBE only)
|
||||
#define VARIANT_XTL 3 // Modified CryptoNight variant 1 (Stellite only)
|
||||
#define VARIANT_MSR 4 // Modified CryptoNight variant 1 (Masari only)
|
||||
#define VARIANT_XHV 5 // Modified CryptoNight-Heavy (Haven Protocol only)
|
||||
#define VARIANT_XAO 6 // Modified CryptoNight variant 0 (Alloy only)
|
||||
#define VARIANT_RTO 7 // Modified CryptoNight variant 1 (Arto only)
|
||||
#define VARIANT_2 8 // CryptoNight variant 2
|
||||
#define VARIANT_HALF 9 // CryptoNight variant 2 with half iterations (Masari/Stellite)
|
||||
#define VARIANT_TRTL 10 // CryptoNight Turtle (TRTL)
|
||||
#define VARIANT_GPU 11 // CryptoNight-GPU (Ryo)
|
||||
|
||||
#define CRYPTONIGHT 0 /* CryptoNight (2 MB) */
|
||||
#define CRYPTONIGHT_LITE 1 /* CryptoNight (1 MB) */
|
||||
#define CRYPTONIGHT_HEAVY 2 /* CryptoNight (4 MB) */
|
||||
#define CRYPTONIGHT_PICO 3 /* CryptoNight (256 KB) */
|
||||
|
||||
#if defined(__NV_CL_C_VERSION) && STRIDED_INDEX != 0
|
||||
# undef STRIDED_INDEX
|
||||
|
@ -71,6 +47,7 @@ static const __constant ulong keccakf_rndc[24] =
|
|||
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
|
||||
};
|
||||
|
||||
|
||||
static const __constant uchar sbox[256] =
|
||||
{
|
||||
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
|
||||
|
@ -92,75 +69,27 @@ static const __constant uchar sbox[256] =
|
|||
};
|
||||
|
||||
|
||||
void keccakf1600(ulong *s)
|
||||
{
|
||||
for(int i = 0; i < 24; ++i)
|
||||
{
|
||||
ulong bc[5], tmp1, tmp2;
|
||||
bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
|
||||
bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL);
|
||||
bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
|
||||
bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
|
||||
bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
|
||||
|
||||
tmp1 = s[1] ^ bc[0];
|
||||
|
||||
s[0] ^= bc[4];
|
||||
s[1] = rotate(s[6] ^ bc[0], 44UL);
|
||||
s[6] = rotate(s[9] ^ bc[3], 20UL);
|
||||
s[9] = rotate(s[22] ^ bc[1], 61UL);
|
||||
s[22] = rotate(s[14] ^ bc[3], 39UL);
|
||||
s[14] = rotate(s[20] ^ bc[4], 18UL);
|
||||
s[20] = rotate(s[2] ^ bc[1], 62UL);
|
||||
s[2] = rotate(s[12] ^ bc[1], 43UL);
|
||||
s[12] = rotate(s[13] ^ bc[2], 25UL);
|
||||
s[13] = rotate(s[19] ^ bc[3], 8UL);
|
||||
s[19] = rotate(s[23] ^ bc[2], 56UL);
|
||||
s[23] = rotate(s[15] ^ bc[4], 41UL);
|
||||
s[15] = rotate(s[4] ^ bc[3], 27UL);
|
||||
s[4] = rotate(s[24] ^ bc[3], 14UL);
|
||||
s[24] = rotate(s[21] ^ bc[0], 2UL);
|
||||
s[21] = rotate(s[8] ^ bc[2], 55UL);
|
||||
s[8] = rotate(s[16] ^ bc[0], 35UL);
|
||||
s[16] = rotate(s[5] ^ bc[4], 36UL);
|
||||
s[5] = rotate(s[3] ^ bc[2], 28UL);
|
||||
s[3] = rotate(s[18] ^ bc[2], 21UL);
|
||||
s[18] = rotate(s[17] ^ bc[1], 15UL);
|
||||
s[17] = rotate(s[11] ^ bc[0], 10UL);
|
||||
s[11] = rotate(s[7] ^ bc[1], 6UL);
|
||||
s[7] = rotate(s[10] ^ bc[4], 3UL);
|
||||
s[10] = rotate(tmp1, 1UL);
|
||||
|
||||
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
|
||||
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
|
||||
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
|
||||
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
|
||||
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
|
||||
s[0] ^= keccakf_rndc[i];
|
||||
}
|
||||
}
|
||||
|
||||
static const __constant uint keccakf_rotc[24] =
|
||||
{
|
||||
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
|
||||
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
|
||||
};
|
||||
|
||||
|
||||
static const __constant uint keccakf_piln[24] =
|
||||
{
|
||||
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
|
||||
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
|
||||
};
|
||||
|
||||
|
||||
void keccakf1600_1(ulong *st)
|
||||
{
|
||||
int i, round;
|
||||
ulong t, bc[5];
|
||||
|
||||
#pragma unroll 1
|
||||
for(round = 0; round < 24; ++round)
|
||||
{
|
||||
|
||||
for (round = 0; round < 24; ++round) {
|
||||
// Theta
|
||||
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
|
||||
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
|
||||
|
@ -180,7 +109,7 @@ void keccakf1600_1(ulong *st)
|
|||
|
||||
// Rho Pi
|
||||
t = st[1];
|
||||
#pragma unroll
|
||||
#pragma unroll 1
|
||||
for (i = 0; i < 24; ++i) {
|
||||
bc[0] = st[keccakf_piln[i]];
|
||||
st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
|
||||
|
@ -188,16 +117,18 @@ void keccakf1600_1(ulong *st)
|
|||
}
|
||||
|
||||
#pragma unroll 1
|
||||
for(int i = 0; i < 25; i += 5)
|
||||
{
|
||||
for (int i = 0; i < 25; i += 5) {
|
||||
ulong tmp[5];
|
||||
|
||||
#pragma unroll 1
|
||||
for(int x = 0; x < 5; ++x)
|
||||
for (int x = 0; x < 5; ++x) {
|
||||
tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
|
||||
}
|
||||
|
||||
#pragma unroll 1
|
||||
for(int x = 0; x < 5; ++x) st[i + x] = tmp[x];
|
||||
for (int x = 0; x < 5; ++x) {
|
||||
st[i + x] = tmp[x];
|
||||
}
|
||||
}
|
||||
|
||||
// Iota
|
||||
|
@ -205,8 +136,6 @@ void keccakf1600_1(ulong *st)
|
|||
}
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
void keccakf1600_2(__local ulong *st)
|
||||
{
|
||||
|
@ -214,56 +143,54 @@ void keccakf1600_2(__local ulong *st)
|
|||
ulong t, bc[5];
|
||||
|
||||
#pragma unroll 1
|
||||
for (round = 0; round < 24; ++round)
|
||||
{
|
||||
for (round = 0; round < 24; ++round) {
|
||||
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
|
||||
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
|
||||
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
|
||||
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
|
||||
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
|
||||
|
||||
st[0] ^= bc[4];
|
||||
st[5] ^= bc[4];
|
||||
st[0] ^= bc[4];
|
||||
st[5] ^= bc[4];
|
||||
st[10] ^= bc[4];
|
||||
st[15] ^= bc[4];
|
||||
st[20] ^= bc[4];
|
||||
|
||||
st[1] ^= bc[0];
|
||||
st[6] ^= bc[0];
|
||||
st[1] ^= bc[0];
|
||||
st[6] ^= bc[0];
|
||||
st[11] ^= bc[0];
|
||||
st[16] ^= bc[0];
|
||||
st[21] ^= bc[0];
|
||||
|
||||
st[2] ^= bc[1];
|
||||
st[7] ^= bc[1];
|
||||
st[2] ^= bc[1];
|
||||
st[7] ^= bc[1];
|
||||
st[12] ^= bc[1];
|
||||
st[17] ^= bc[1];
|
||||
st[22] ^= bc[1];
|
||||
|
||||
st[3] ^= bc[2];
|
||||
st[8] ^= bc[2];
|
||||
st[3] ^= bc[2];
|
||||
st[8] ^= bc[2];
|
||||
st[13] ^= bc[2];
|
||||
st[18] ^= bc[2];
|
||||
st[23] ^= bc[2];
|
||||
|
||||
st[4] ^= bc[3];
|
||||
st[9] ^= bc[3];
|
||||
st[4] ^= bc[3];
|
||||
st[9] ^= bc[3];
|
||||
st[14] ^= bc[3];
|
||||
st[19] ^= bc[3];
|
||||
st[24] ^= bc[3];
|
||||
|
||||
// Rho Pi
|
||||
t = st[1];
|
||||
#pragma unroll
|
||||
#pragma unroll 1
|
||||
for (i = 0; i < 24; ++i) {
|
||||
bc[0] = st[keccakf_piln[i]];
|
||||
st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
|
||||
t = bc[0];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for(int i = 0; i < 25; i += 5)
|
||||
{
|
||||
#pragma unroll 1
|
||||
for (int i = 0; i < 25; i += 5) {
|
||||
ulong tmp1 = st[i], tmp2 = st[i + 1];
|
||||
|
||||
st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
|
||||
|
@ -278,40 +205,17 @@ void keccakf1600_2(__local ulong *st)
|
|||
}
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
void CNKeccak(ulong *output, ulong *input)
|
||||
{
|
||||
ulong st[25];
|
||||
|
||||
// Copy 72 bytes
|
||||
for(int i = 0; i < 9; ++i) st[i] = input[i];
|
||||
|
||||
// Last four and '1' bit for padding
|
||||
//st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
|
||||
|
||||
st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
|
||||
|
||||
for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
|
||||
|
||||
// Last bit of padding
|
||||
st[16] = 0x8000000000000000UL;
|
||||
|
||||
keccakf1600_1(st);
|
||||
|
||||
for(int i = 0; i < 25; ++i) output[i] = st[i];
|
||||
}
|
||||
|
||||
static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
|
||||
|
||||
#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
|
||||
|
||||
#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
|
||||
|
||||
|
||||
void AESExpandKey256(uint *keybuf)
|
||||
{
|
||||
//#pragma unroll 4
|
||||
for(uint c = 8, i = 1; c < 40; ++c)
|
||||
{
|
||||
for (uint c = 8, i = 1; c < 40; ++c) {
|
||||
// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
|
||||
uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
|
||||
|
||||
|
@ -322,8 +226,10 @@ void AESExpandKey256(uint *keybuf)
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
|
||||
|
||||
|
||||
#if (STRIDED_INDEX == 0)
|
||||
# define IDX(x) (x)
|
||||
#elif (STRIDED_INDEX == 1)
|
||||
|
@ -336,6 +242,7 @@ void AESExpandKey256(uint *keybuf)
|
|||
# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
|
||||
#endif
|
||||
|
||||
|
||||
inline ulong getIdx()
|
||||
{
|
||||
# if (STRIDED_INDEX == 0 || STRIDED_INDEX == 1 || STRIDED_INDEX == 2)
|
||||
|
@ -343,11 +250,13 @@ inline ulong getIdx()
|
|||
# endif
|
||||
}
|
||||
|
||||
|
||||
//#include "opencl/cryptonight_gpu.cl"
|
||||
XMRIG_INCLUDE_CN_GPU
|
||||
//XMRIG_INCLUDE_CN_GPU
|
||||
|
||||
#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(8, 8, 1)))
|
||||
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
|
||||
{
|
||||
|
@ -388,8 +297,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
|
|||
Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
|
||||
# endif
|
||||
|
||||
if (get_local_id(1) == 0)
|
||||
{
|
||||
if (get_local_id(1) == 0) {
|
||||
__local ulong* State = State_buf + get_local_id(0) * 25;
|
||||
|
||||
((__local ulong8 *)State)[0] = vload8(0, input);
|
||||
|
@ -421,7 +329,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
|
|||
|
||||
keccakf1600_2(State);
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll 1
|
||||
for (int i = 0; i < 25; ++i) {
|
||||
states[i] = State[i];
|
||||
}
|
||||
|
@ -495,25 +403,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
|
|||
Scratchpad[IDX(i + local_id1)] = text;
|
||||
}
|
||||
}
|
||||
|
||||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
#define VARIANT1_1(p) \
|
||||
uint table = 0x75310U; \
|
||||
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
|
||||
(p).s2 ^= ((table >> index) & 0x30U) << 24
|
||||
|
||||
#define VARIANT1_1_XTL(p) \
|
||||
uint table = 0x75310U; \
|
||||
uint offset = variant == VARIANT_XTL ? 27 : 26; \
|
||||
uint index = (((p).s2 >> offset) & 12) | (((p).s2 >> 23) & 2); \
|
||||
(p).s2 ^= ((table >> index) & 0x30U) << 24
|
||||
|
||||
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0
|
||||
|
||||
|
||||
#define VARIANT1_INIT() \
|
||||
tweak1_2 = as_uint2(input[4]); \
|
||||
tweak1_2.s0 >>= 24; \
|
||||
|
@ -521,8 +424,9 @@ R"===(
|
|||
tweak1_2.s1 = (uint) get_global_id(0); \
|
||||
tweak1_2 ^= as_uint2(states[24])
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
__kernel void cn1_v1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
{
|
||||
ulong a[2], b[2];
|
||||
__local uint AES0[256], AES1[256];
|
||||
|
@ -581,7 +485,7 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uin
|
|||
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
|
||||
|
||||
b_x ^= ((uint4 *)c)[0];
|
||||
VARIANT1_1_XTL(b_x);
|
||||
VARIANT1_1(b_x);
|
||||
Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x;
|
||||
|
||||
uint4 tmp;
|
||||
|
@ -591,9 +495,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uin
|
|||
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
|
||||
|
||||
uint2 tweak1_2_0 = tweak1_2;
|
||||
if (variant == VARIANT_RTO) {
|
||||
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
|
||||
}
|
||||
# if ALGO == ALGO_CN_RTO
|
||||
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
|
||||
# endif
|
||||
|
||||
VARIANT1_2(a[1]);
|
||||
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
|
||||
|
@ -604,15 +508,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uin
|
|||
b_x = ((uint4 *)c)[0];
|
||||
}
|
||||
}
|
||||
|
||||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1_v2_monero(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
__kernel void cn1_v2(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
{
|
||||
# if (ALGO == CRYPTONIGHT || ALGO == CRYPTONIGHT_PICO)
|
||||
ulong a[2], b[4];
|
||||
|
@ -768,356 +670,6 @@ __kernel void cn1_v2_monero(__global uint4 *Scratchpad, __global ulong *states,
|
|||
# endif
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1_v2_half(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
{
|
||||
# if (ALGO == CRYPTONIGHT)
|
||||
ulong a[2], b[4];
|
||||
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
|
||||
|
||||
const ulong gIdx = getIdx();
|
||||
|
||||
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
|
||||
{
|
||||
const uint tmp = AES0_C[i];
|
||||
AES0[i] = tmp;
|
||||
AES1[i] = rotate(tmp, 8U);
|
||||
AES2[i] = rotate(tmp, 16U);
|
||||
AES3[i] = rotate(tmp, 24U);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
states += 25 * gIdx;
|
||||
|
||||
# if defined(__NV_CL_C_VERSION)
|
||||
Scratchpad += gIdx * (0x40000 >> 2);
|
||||
# else
|
||||
# if (STRIDED_INDEX == 0)
|
||||
Scratchpad += gIdx * (MEMORY >> 4);
|
||||
# elif (STRIDED_INDEX == 1)
|
||||
Scratchpad += gIdx;
|
||||
# elif (STRIDED_INDEX == 2)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
|
||||
# endif
|
||||
# endif
|
||||
|
||||
a[0] = states[0] ^ states[4];
|
||||
a[1] = states[1] ^ states[5];
|
||||
|
||||
b[0] = states[2] ^ states[6];
|
||||
b[1] = states[3] ^ states[7];
|
||||
b[2] = states[8] ^ states[10];
|
||||
b[3] = states[9] ^ states[11];
|
||||
}
|
||||
|
||||
ulong2 bx0 = ((ulong2 *)b)[0];
|
||||
ulong2 bx1 = ((ulong2 *)b)[1];
|
||||
|
||||
mem_fence(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
# ifdef __NV_CL_C_VERSION
|
||||
__local uint16 scratchpad_line_buf[WORKSIZE];
|
||||
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
|
||||
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
|
||||
# else
|
||||
# if (STRIDED_INDEX == 0)
|
||||
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
|
||||
# elif (STRIDED_INDEX == 1)
|
||||
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
|
||||
# elif (STRIDED_INDEX == 2)
|
||||
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
uint2 division_result = as_uint2(states[12]);
|
||||
uint sqrt_result = as_uint2(states[13]).s0;
|
||||
|
||||
#pragma unroll CN_UNROLL
|
||||
for(int i = 0; i < 0x40000; ++i)
|
||||
{
|
||||
# ifdef __NV_CL_C_VERSION
|
||||
uint idx = a[0] & 0x1FFFC0;
|
||||
uint idx1 = a[0] & 0x30;
|
||||
|
||||
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
|
||||
# else
|
||||
uint idx = a[0] & MASK;
|
||||
# endif
|
||||
|
||||
uint4 c = SCRATCHPAD_CHUNK(0);
|
||||
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
|
||||
|
||||
{
|
||||
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
|
||||
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
|
||||
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
|
||||
|
||||
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
|
||||
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
|
||||
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
|
||||
}
|
||||
|
||||
SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
|
||||
|
||||
# ifdef __NV_CL_C_VERSION
|
||||
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
|
||||
|
||||
idx = as_ulong2(c).s0 & 0x1FFFC0;
|
||||
idx1 = as_ulong2(c).s0 & 0x30;
|
||||
|
||||
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
|
||||
# else
|
||||
idx = as_ulong2(c).s0 & MASK;
|
||||
# endif
|
||||
|
||||
uint4 tmp = SCRATCHPAD_CHUNK(0);
|
||||
|
||||
{
|
||||
tmp.s0 ^= division_result.s0;
|
||||
tmp.s1 ^= division_result.s1 ^ sqrt_result;
|
||||
|
||||
division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
|
||||
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
|
||||
}
|
||||
|
||||
ulong2 t;
|
||||
t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
|
||||
t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
|
||||
{
|
||||
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
|
||||
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
|
||||
t ^= chunk2;
|
||||
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
|
||||
|
||||
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
|
||||
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
|
||||
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
|
||||
}
|
||||
|
||||
a[1] += t.s1;
|
||||
a[0] += t.s0;
|
||||
|
||||
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
|
||||
|
||||
# ifdef __NV_CL_C_VERSION
|
||||
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
|
||||
# endif
|
||||
|
||||
((uint4 *)a)[0] ^= tmp;
|
||||
bx1 = bx0;
|
||||
bx0 = as_ulong2(c);
|
||||
}
|
||||
|
||||
# undef SCRATCHPAD_CHUNK
|
||||
}
|
||||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
# endif
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
// cn1_msr: main scratchpad-mixing loop kernel. The whole body is compiled only
// when ALGO == CRYPTONIGHT; for any other ALGO the kernel is empty.
//   Scratchpad - scratchpad memory, accessed as 16-byte uint4 entries
//   states     - hash state, 25 ulong words per work item (offset by 25 * gIdx)
//   variant, input - not referenced by name in this body; presumably consumed
//                by the VARIANT1_INIT() macro - confirm against its definition
//   Threads    - number of valid work items; only checked when COMP_MODE == 1
// The loop below runs a hard-coded 0x40000 iterations.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1_msr(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
{
|
||||
# if (ALGO == CRYPTONIGHT)
|
||||
ulong a[2], b[2];
|
||||
__local uint AES0[256], AES1[256];
|
||||
|
||||
const ulong gIdx = getIdx();
|
||||
|
||||
// Work items of the group cooperatively copy the AES0_C table into local
// memory; AES1 holds the same entries rotated left by 8 bits.
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
|
||||
const uint tmp = AES0_C[i];
|
||||
AES0[i] = tmp;
|
||||
AES1[i] = rotate(tmp, 8U);
|
||||
}
|
||||
|
||||
// All work items must reach this barrier before the local tables are read,
// which is why the COMP_MODE guards below wrap blocks instead of returning early.
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint2 tweak1_2;
|
||||
uint4 b_x;
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
states += 25 * gIdx;
|
||||
// Position Scratchpad at this work item's region; layout depends on STRIDED_INDEX.
# if (STRIDED_INDEX == 0)
|
||||
Scratchpad += gIdx * (MEMORY >> 4);
|
||||
# elif (STRIDED_INDEX == 1)
|
||||
// NOTE(review): this kernel body is only compiled for ALGO == CRYPTONIGHT, so the
// CRYPTONIGHT_HEAVY branch below looks unreachable here - confirm the two
// macros have distinct values.
# if (ALGO == CRYPTONIGHT_HEAVY)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
|
||||
# else
|
||||
Scratchpad += gIdx;
|
||||
# endif
|
||||
# elif (STRIDED_INDEX == 2)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
|
||||
# endif
|
||||
|
||||
// a = state words 0..1 XOR words 4..5, b = state words 2..3 XOR words 6..7.
a[0] = states[0] ^ states[4];
|
||||
b[0] = states[2] ^ states[6];
|
||||
a[1] = states[1] ^ states[5];
|
||||
b[1] = states[3] ^ states[7];
|
||||
|
||||
b_x = ((uint4 *)b)[0];
|
||||
// Macro defined elsewhere; presumably initializes tweak1_2 - TODO confirm.
VARIANT1_INIT();
|
||||
}
|
||||
|
||||
mem_fence(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i < 0x40000; ++i) {
|
||||
ulong c[2];
|
||||
|
||||
// First half: load the entry addressed by the low 32 bits of a[0] and run
// one AES round on it with a as the round key.
((uint4 *)c)[0] = Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)];
|
||||
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
|
||||
|
||||
// Write back (previous b_x XOR c), after the VARIANT1_1 tweak macro, to the same entry.
b_x ^= ((uint4 *)c)[0];
|
||||
VARIANT1_1(b_x);
|
||||
Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x;
|
||||
|
||||
// Second half: load the entry addressed by the low 32 bits of c[0].
uint4 tmp;
|
||||
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
|
||||
|
||||
// 64x64-bit multiply of c[0] by the low 64 bits of tmp: the low half of the
// product is added to a[1], the high half (mul_hi) to a[0].
a[1] += c[0] * as_ulong2(tmp).s0;
|
||||
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
|
||||
|
||||
// tweak1_2_0 is not referenced by name below; presumably VARIANT1_2 reads it - confirm.
uint2 tweak1_2_0 = tweak1_2;
|
||||
|
||||
// VARIANT1_2 is applied to a[1] right before and right after the store;
// presumably it is self-inverse so only the stored copy carries the tweak - TODO confirm.
VARIANT1_2(a[1]);
|
||||
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
|
||||
VARIANT1_2(a[1]);
|
||||
|
||||
((uint4 *)a)[0] ^= tmp;
|
||||
|
||||
// c becomes the b_x of the next iteration.
b_x = ((uint4 *)c)[0];
|
||||
}
|
||||
}
|
||||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
# endif
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
// cn1_tube: main scratchpad-mixing loop kernel. The whole body is compiled only
// when ALGO == CRYPTONIGHT_HEAVY; for any other ALGO the kernel is empty.
//   Scratchpad - scratchpad memory, accessed as 16-byte uint4 entries
//   states     - hash state, 25 ulong words per work item (offset by 25 * gIdx)
//   variant, input - not referenced by name in this body; presumably consumed
//                by the VARIANT1_INIT() macro - confirm against its definition
//   Threads    - number of valid work items; only checked when COMP_MODE == 1
// Each of the ITERATIONS rounds uses AES_Round_bittube2 and ends with an
// extra fast_div_heavy step.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1_tube(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
{
|
||||
# if (ALGO == CRYPTONIGHT_HEAVY)
|
||||
ulong a[2], b[2];
|
||||
__local uint AES0[256], AES1[256];
|
||||
|
||||
const ulong gIdx = getIdx();
|
||||
|
||||
// Work items of the group cooperatively copy the AES0_C table into local
// memory; AES1 holds the same entries rotated left by 8 bits.
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
|
||||
const uint tmp = AES0_C[i];
|
||||
AES0[i] = tmp;
|
||||
AES1[i] = rotate(tmp, 8U);
|
||||
}
|
||||
|
||||
// All work items must reach this barrier before the local tables are read,
// which is why the COMP_MODE guards below wrap blocks instead of returning early.
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint2 tweak1_2;
|
||||
uint4 b_x;
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
states += 25 * gIdx;
|
||||
// Position Scratchpad at this work item's region; layout depends on STRIDED_INDEX.
# if (STRIDED_INDEX == 0)
|
||||
Scratchpad += gIdx * (MEMORY >> 4);
|
||||
# elif (STRIDED_INDEX == 1)
|
||||
// The enclosing body is already guarded by ALGO == CRYPTONIGHT_HEAVY, so the
// first branch is the one taken in this kernel.
# if (ALGO == CRYPTONIGHT_HEAVY)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
|
||||
# else
|
||||
Scratchpad += gIdx;
|
||||
# endif
|
||||
# elif (STRIDED_INDEX == 2)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
|
||||
# endif
|
||||
|
||||
// a = state words 0..1 XOR words 4..5, b = state words 2..3 XOR words 6..7.
a[0] = states[0] ^ states[4];
|
||||
b[0] = states[2] ^ states[6];
|
||||
a[1] = states[1] ^ states[5];
|
||||
b[1] = states[3] ^ states[7];
|
||||
|
||||
b_x = ((uint4 *)b)[0];
|
||||
// Macro defined elsewhere; presumably initializes tweak1_2 - TODO confirm.
VARIANT1_INIT();
|
||||
}
|
||||
|
||||
mem_fence(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
// Scratchpad index for the first half of each round: the low 32 bits of a[0]
// (ulong truncated to uint).
uint idx0 = a[0];
|
||||
|
||||
#pragma unroll CN_UNROLL
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
ulong c[2];
|
||||
|
||||
// First half: load the entry addressed by idx0 and run the bittube2 AES
// round on it with a as the round key.
((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
|
||||
((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
|
||||
|
||||
// Write back (previous b_x XOR c), after the VARIANT1_1 tweak macro, to the same entry.
b_x ^= ((uint4 *)c)[0];
|
||||
VARIANT1_1(b_x);
|
||||
Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
|
||||
|
||||
// Second half: load the entry addressed by the low 32 bits of c[0].
uint4 tmp;
|
||||
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
|
||||
|
||||
// 64x64-bit multiply of c[0] by the low 64 bits of tmp: the low half of the
// product is added to a[1], the high half (mul_hi) to a[0].
a[1] += c[0] * as_ulong2(tmp).s0;
|
||||
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
|
||||
|
||||
// Per-iteration copy of the tweak, additionally XORed with a[0] viewed as two
// 32-bit words. It is not referenced by name below; presumably VARIANT1_2
// reads it - confirm.
uint2 tweak1_2_0 = tweak1_2;
|
||||
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
|
||||
|
||||
// VARIANT1_2 is applied to a[1] right before and right after the store;
// presumably it is self-inverse so only the stored copy carries the tweak - TODO confirm.
VARIANT1_2(a[1]);
|
||||
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
|
||||
VARIANT1_2(a[1]);
|
||||
|
||||
((uint4 *)a)[0] ^= tmp;
|
||||
idx0 = a[0];
|
||||
|
||||
// c becomes the b_x of the next iteration.
b_x = ((uint4 *)c)[0];
|
||||
|
||||
// Extra division step on the entry addressed by the new idx0: n is its first
// 8 bytes, d the 32-bit word at byte offset 8. `d | 0x5` guarantees a non-zero
// divisor. The quotient is XORed back into the first 8 bytes, and the next
// idx0 is d ^ q truncated to 32 bits.
{
|
||||
long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
|
||||
int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
|
||||
long q = fast_div_heavy(n, d | 0x5);
|
||||
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
|
||||
idx0 = d ^ q;
|
||||
}
|
||||
}
|
||||
}
|
||||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
# endif
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
|
@ -1211,91 +763,6 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint varia
|
|||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
// cn1_xao: main scratchpad-mixing loop kernel without the VARIANT1 tweak
// macros. The whole body is compiled only when ALGO == CRYPTONIGHT; for any
// other ALGO the kernel is empty.
//   Scratchpad - scratchpad memory, accessed as 16-byte uint4 entries
//   states     - hash state, 25 ulong words per work item (offset by 25 * gIdx)
//   variant, input - not referenced in this body
//   Threads    - number of valid work items; only checked when COMP_MODE == 1
// The loop below runs a hard-coded 0x100000 iterations.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||
__kernel void cn1_xao(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
|
||||
{
|
||||
# if (ALGO == CRYPTONIGHT)
|
||||
ulong a[2], b[2];
|
||||
__local uint AES0[256], AES1[256];
|
||||
|
||||
const ulong gIdx = getIdx();
|
||||
|
||||
// Work items of the group cooperatively copy the AES0_C table into local
// memory; AES1 holds the same entries rotated left by 8 bits.
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
|
||||
const uint tmp = AES0_C[i];
|
||||
AES0[i] = tmp;
|
||||
AES1[i] = rotate(tmp, 8U);
|
||||
}
|
||||
|
||||
// All work items must reach this barrier before the local tables are read,
// which is why the COMP_MODE guards below wrap blocks instead of returning early.
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint4 b_x;
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
states += 25 * gIdx;
|
||||
// Position Scratchpad at this work item's region; layout depends on STRIDED_INDEX.
# if (STRIDED_INDEX == 0)
|
||||
Scratchpad += gIdx * (MEMORY >> 4);
|
||||
# elif (STRIDED_INDEX == 1)
|
||||
// NOTE(review): this kernel body is only compiled for ALGO == CRYPTONIGHT, so the
// CRYPTONIGHT_HEAVY branch below looks unreachable here - confirm the two
// macros have distinct values.
# if (ALGO == CRYPTONIGHT_HEAVY)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
|
||||
# else
|
||||
Scratchpad += gIdx;
|
||||
# endif
|
||||
# elif(STRIDED_INDEX == 2)
|
||||
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
|
||||
# endif
|
||||
|
||||
// a = state words 0..1 XOR words 4..5, b = state words 2..3 XOR words 6..7.
a[0] = states[0] ^ states[4];
|
||||
b[0] = states[2] ^ states[6];
|
||||
a[1] = states[1] ^ states[5];
|
||||
b[1] = states[3] ^ states[7];
|
||||
|
||||
b_x = ((uint4 *)b)[0];
|
||||
}
|
||||
|
||||
mem_fence(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
# if (COMP_MODE == 1)
|
||||
// do not use early return here
|
||||
if (gIdx < Threads)
|
||||
# endif
|
||||
{
|
||||
// Scratchpad index for the first half of each round: the low 32 bits of a[0]
// (ulong truncated to uint).
uint idx0 = a[0];
|
||||
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i < 0x100000; ++i) {
|
||||
ulong c[2];
|
||||
|
||||
// First half: load the entry addressed by idx0 and run one AES round on it
// with a as the round key.
((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
|
||||
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
|
||||
|
||||
// Write back (previous b_x XOR c) to the same entry; b_x itself is not modified here.
Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];
|
||||
|
||||
// Second half: load the entry addressed by the low 32 bits of c[0].
uint4 tmp;
|
||||
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
|
||||
|
||||
// 64x64-bit multiply of c[0] by the low 64 bits of tmp: the low half of the
// product is added to a[1], the high half (mul_hi) to a[0].
a[1] += c[0] * as_ulong2(tmp).s0;
|
||||
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
|
||||
|
||||
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
|
||||
|
||||
((uint4 *)a)[0] ^= tmp;
|
||||
idx0 = a[0];
|
||||
|
||||
// c becomes the b_x of the next iteration.
b_x = ((uint4 *)c)[0];
|
||||
}
|
||||
}
|
||||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
# endif
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
__attribute__((reqd_work_group_size(8, 8, 1)))
|
||||
__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
|
||||
|
@ -1463,15 +930,15 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
|
|||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
// VSWAP8(x): reverses the byte order of a 64-bit value using shifts and masks.
// The top-byte term ((x) >> 56) is not masked, so this assumes x has an
// unsigned 64-bit type (a signed x would sign-extend) - confirm at call sites.
// x is evaluated several times; avoid arguments with side effects.
#define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \
|
||||
| (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \
|
||||
| (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL))
|
||||
|
||||
|
||||
// VSWAP4(x): reverses the byte order of a 32-bit value; every term is masked.
// x is evaluated several times; avoid arguments with side effects.
#define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U))
|
||||
|
||||
|
||||
__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
|
||||
{
|
||||
const uint idx = get_global_id(0) - get_global_offset(0);
|
||||
|
@ -1529,7 +996,9 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
|
|||
mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
|
||||
|
||||
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
|
||||
|
||||
|
||||
#define JHXOR \
|
||||
h0h ^= input[0]; \
|
||||
|
@ -1552,6 +1021,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
|
|||
h7h ^= input[6]; \
|
||||
h7l ^= input[7]
|
||||
|
||||
|
||||
__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
|
||||
{
|
||||
const uint idx = get_global_id(0) - get_global_offset(0);
|
||||
|
@ -1597,8 +1067,10 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
#define SWAP4(x) as_uint(as_uchar4(x).s3210)
|
||||
|
||||
|
||||
__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
|
||||
{
|
||||
const uint idx = get_global_id(0) - get_global_offset(0);
|
||||
|
@ -1697,6 +1169,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
#undef SWAP4
|
||||
|
||||
|
||||
|
@ -1796,5 +1269,3 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
)==="
|
||||
|
|
2724
src/backend/opencl/cl/cn/cryptonight_cl.h
Normal file
2724
src/backend/opencl/cl/cn/cryptonight_cl.h
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
#ifndef FAST_DIV_HEAVY_CL
|
||||
#define FAST_DIV_HEAVY_CL
|
||||
|
||||
|
@ -26,4 +25,3 @@ inline long fast_div_heavy(long _a, int _b)
|
|||
}
|
||||
|
||||
#endif
|
||||
)==="
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
/*
|
||||
* @author SChernykh
|
||||
*/
|
||||
|
@ -53,5 +52,3 @@ inline uint fast_sqrt_v2(const ulong n1)
|
|||
|
||||
return result;
|
||||
}
|
||||
|
||||
)==="
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */
|
||||
/*
|
||||
* Groestl256
|
||||
|
@ -124,9 +123,6 @@ static const __constant ulong T0_G[] =
|
|||
0x7bcbf646cb463d7bUL, 0xa8fc4b1ffc1fb7a8UL, 0x6dd6da61d6610c6dUL, 0x2c3a584e3a4e622cUL
|
||||
};
|
||||
|
||||
)==="
|
||||
R"===(
|
||||
|
||||
static const __constant ulong T4_G[] =
|
||||
{
|
||||
0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL,
|
||||
|
@ -291,5 +287,3 @@ static const __constant ulong T4_G[] =
|
|||
ROUND_SMALL_Q(a, r); \
|
||||
} while (0)
|
||||
|
||||
)==="
|
||||
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
|
||||
/*
|
||||
* JH implementation.
|
||||
|
@ -270,5 +269,3 @@ static const __constant ulong C[] =
|
|||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
)==="
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
#ifndef WOLF_AES_CL
|
||||
#define WOLF_AES_CL
|
||||
|
||||
|
@ -149,5 +148,3 @@ uint4 AES_Round_Two_Tables(const __local uint *AES0, const __local uint *AES1, c
|
|||
}
|
||||
|
||||
#endif
|
||||
|
||||
)==="
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
R"===(
|
||||
#ifndef WOLF_SKEIN_CL
|
||||
#define WOLF_SKEIN_CL
|
||||
|
||||
|
@ -137,5 +136,3 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
|
|||
}
|
||||
|
||||
#endif
|
||||
|
||||
)==="
|
||||
|
|
|
@ -52,30 +52,18 @@ public:
|
|||
CN_RWZ, // "cn/rwz" CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft).
|
||||
CN_ZLS, // "cn/zls" CryptoNight variant 2 with 3/4 iterations (Zelerius).
|
||||
CN_DOUBLE, // "cn/double" CryptoNight variant 2 with double iterations (X-CASH).
|
||||
# ifdef XMRIG_ALGO_CN_GPU
|
||||
CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo).
|
||||
# endif
|
||||
# ifdef XMRIG_ALGO_CN_LITE
|
||||
CN_LITE_0, // "cn-lite/0" CryptoNight-Lite variant 0.
|
||||
CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1.
|
||||
# endif
|
||||
# ifdef XMRIG_ALGO_CN_HEAVY
|
||||
CN_HEAVY_0, // "cn-heavy/0" CryptoNight-Heavy (4 MB).
|
||||
CN_HEAVY_TUBE, // "cn-heavy/tube" CryptoNight-Heavy (modified, TUBE only).
|
||||
CN_HEAVY_XHV, // "cn-heavy/xhv" CryptoNight-Heavy (modified, Haven Protocol only).
|
||||
# endif
|
||||
# ifdef XMRIG_ALGO_CN_PICO
|
||||
CN_PICO_0, // "cn-pico" CryptoNight Turtle (TRTL)
|
||||
# endif
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
RX_0, // "rx/0" RandomX (reference configuration).
|
||||
RX_WOW, // "rx/wow" RandomWOW (Wownero).
|
||||
RX_LOKI, // "rx/loki" RandomXL (Loki).
|
||||
# endif
|
||||
# ifdef XMRIG_ALGO_ARGON2
|
||||
AR2_CHUKWA, // "argon2/chukwa"
|
||||
AR2_WRKZ, // "argon2/wrkz"
|
||||
# endif
|
||||
AR2_CHUKWA, // "argon2/chukwa" Argon2id (Chukwa).
|
||||
AR2_WRKZ, // "argon2/wrkz" Argon2id (WRKZ)
|
||||
MAX
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in a new issue