Use external script to prepare OpenCL source.

This commit is contained in:
XMRig 2019-08-28 00:33:49 +07:00
parent 82696000e4
commit fcfb738ded
17 changed files with 2948 additions and 693 deletions

2
.gitignore vendored
View file

@ -1,2 +1,4 @@
/build /build
/CMakeLists.txt.user /CMakeLists.txt.user
/.idea
/src/backend/opencl/cl/cn/cryptonight_gen.cl

23
package.json Normal file
View file

@ -0,0 +1,23 @@
{
"name": "xmrig",
"version": "3.0.0",
"description": "RandomX, CryptoNight and Argon2 miner",
"main": "index.js",
"directories": {
"doc": "doc"
},
"scripts": {
"build": "node scripts/generate_cl.js"
},
"repository": {
"type": "git",
"url": "git+https://github.com/xmrig/xmrig.git"
},
"keywords": [],
"author": "",
"license": "GPLv3",
"bugs": {
"url": "https://github.com/xmrig/xmrig/issues"
},
"homepage": "https://github.com/xmrig/xmrig#readme"
}

32
scripts/generate_cl.js Normal file
View file

@ -0,0 +1,32 @@
#!/usr/bin/env node
'use strict';
const fs = require('fs');
const path = require('path');
const { text2h, addIncludes } = require('./js/opencl');
const cwd = process.cwd();
/**
 * Generates the embedded CryptoNight OpenCL header.
 *
 * Switches into `src/backend/opencl/cl/cn` (resolved against the directory the
 * script was started from), inlines the listed `#include "..."` files into
 * `cryptonight.cl`, and writes the result to `cryptonight_cl.h` as a
 * NUL-terminated `char` array named `cryptonight_cl` in namespace `xmrig`.
 */
function cn()
{
    const includedFiles = [
        'algorithm.cl',
        'wolf-aes.cl',
        'wolf-skein.cl',
        'jh.cl',
        'blake256.cl',
        'groestl256.cl',
        'fast_int_math_v2.cl',
        'fast_div_heavy.cl'
    ];

    // Go back to the starting directory first so the relative path below
    // resolves the same way no matter where a previous step left us.
    process.chdir(cwd);
    process.chdir(path.resolve('src/backend/opencl/cl/cn'));

    const mergedSource = addIncludes('cryptonight.cl', includedFiles);
    const header = text2h(mergedSource, 'xmrig', 'cryptonight_cl');

    // Debug aid: uncomment to also dump the flattened OpenCL source.
    //fs.writeFileSync('cryptonight_gen.cl', mergedSource);
    fs.writeFileSync('cryptonight_cl.h', header);
}
// Script entry point: build the CryptoNight header as soon as the file is run.
cn();

72
scripts/js/opencl.js Normal file
View file

@ -0,0 +1,72 @@
'use strict';
const fs = require('fs');
/**
 * Renders a binary buffer as a C++ header that defines an
 * `unsigned char` array inside a namespace.
 *
 * Bytes are written as two-digit lowercase hex literals separated by commas,
 * with a line break after every 32nd byte.
 *
 * @param {Buffer} buf - Bytes to embed.
 * @param {string} namespace - C++ namespace wrapping the array.
 * @param {string} name - Identifier of the generated array.
 * @returns {string} Complete header text, starting with `#pragma once`.
 */
function bin2h(buf, namespace, name)
{
    const BYTES_PER_ROW = 32;
    const total = buf.byteLength;

    const cells = Array.from({ length: total }, (_, pos) => {
        const hex = buf.readUInt8(pos).toString(16).padStart(2, '0');
        const separator = pos === total - 1 ? '' : ',';
        const rowBreak = (pos + 1) % BYTES_PER_ROW === 0 ? '\n ' : '';

        return `0x${hex}${separator}${rowBreak}`;
    });

    const prologue = `#pragma once\n\nnamespace ${namespace} {\n\nstatic unsigned char ${name}[${total}] = {\n `;
    const epilogue = `\n};\n\n} // namespace ${namespace}\n`;

    return prologue + cells.join('') + epilogue;
}
/**
 * Renders text as a C++ header that defines a NUL-terminated `char` array
 * inside a namespace.
 *
 * The text is encoded with `Buffer.from(text)` and each byte is written as a
 * two-digit lowercase hex literal, with a line break after every 32nd byte.
 * Bytes >= 0x80 are wrapped in `static_cast<char>(...)`: a bare `0xNN`
 * literal above 0x7f is a narrowing conversion in a braced `char`
 * initializer when `char` is signed, so the header would not compile if the
 * source contained any non-ASCII character. Pure-ASCII input produces exactly
 * the same output as before.
 *
 * @param {string} text - Source text to embed.
 * @param {string} namespace - C++ namespace wrapping the array.
 * @param {string} name - Identifier of the generated array.
 * @returns {string} Complete header text, starting with `#pragma once`.
 */
function text2h(text, namespace, name)
{
    const BYTES_PER_ROW = 32;
    const buf = Buffer.from(text);
    const size = buf.byteLength;

    // One extra element is declared for the trailing 0x00 terminator.
    let out = `#pragma once\n\nnamespace ${namespace} {\n\nstatic char ${name}[${size + 1}] = {\n `;

    for (let i = 0; i < size; i++) {
        const byte = buf.readUInt8(i);
        const hex = `0x${byte.toString(16).padStart(2, '0')}`;

        out += byte < 0x80 ? `${hex},` : `static_cast<char>(${hex}),`;

        if ((i + 1) % BYTES_PER_ROW === 0) {
            out += '\n ';
        }
    }

    out += '0x00';
    out += `\n};\n\n} // namespace ${namespace}\n`;

    return out;
}
/**
 * Replaces the first `#include "<name>"` directive in `input` with the
 * contents of the file `name`.
 *
 * The file is read (as UTF-8, relative to the current working directory)
 * even when the directive is absent, so a missing file always throws.
 *
 * @param {string} input - Source text that may contain the directive.
 * @param {string} name - File name exactly as written inside the quotes.
 * @returns {string} `input` with the directive replaced, or unchanged if the
 *     directive does not occur.
 */
function addInclude(input, name)
{
    const contents = fs.readFileSync(name, 'utf8');

    // A replacer function inserts `contents` verbatim. A plain string
    // replacement would expand `$&`, `$$`, "$`" and `$'` sequences found in
    // the included file and silently corrupt the merged source.
    return input.replace(`#include "${name}"`, () => contents);
}
/**
 * Reads `inputFileName` and inlines each file listed in `names`, in order,
 * by passing the accumulated text through `addInclude` once per name.
 *
 * @param {string} inputFileName - File containing the `#include` directives.
 * @param {Iterable<string>} names - Include file names to inline.
 * @returns {string} The source text after all includes have been applied.
 */
function addIncludes(inputFileName, names)
{
    const rootSource = fs.readFileSync(inputFileName, 'utf8');

    return [...names].reduce(
        (merged, includeName) => addInclude(merged, includeName),
        rootSource
    );
}
// Public API of this module: header generators and include-inlining helpers.
Object.assign(module.exports, {
    bin2h,
    text2h,
    addInclude,
    addIncludes
});

View file

@ -29,7 +29,6 @@
#include "backend/common/Hashrate.h" #include "backend/common/Hashrate.h"
#include "backend/common/interfaces/IWorker.h" #include "backend/common/interfaces/IWorker.h"
#include "backend/common/Workers.h" #include "backend/common/Workers.h"
#include "backend/opencl/cl/OclSource.h"
#include "backend/opencl/OclBackend.h" #include "backend/opencl/OclBackend.h"
#include "backend/opencl/OclConfig.h" #include "backend/opencl/OclConfig.h"
#include "backend/opencl/OclLaunchData.h" #include "backend/opencl/OclLaunchData.h"
@ -96,8 +95,6 @@ public:
inline OclBackendPrivate(Controller *controller) : inline OclBackendPrivate(Controller *controller) :
controller(controller) controller(controller)
{ {
OclSource::init();
init(controller->config()->cl()); init(controller->config()->cl());
} }

View file

@ -23,75 +23,16 @@
*/ */
#include <string> #include "backend/opencl/cl/cn/cryptonight_cl.h"
#include <regex>
#include "backend/opencl/cl/OclSource.h" #include "backend/opencl/cl/OclSource.h"
#include "crypto/common/Algorithm.h" #include "crypto/common/Algorithm.h"
namespace xmrig {
static std::string cn_source;
} // namespace xmrig
const char *xmrig::OclSource::get(const Algorithm &algorithm) const char *xmrig::OclSource::get(const Algorithm &algorithm)
{ {
if (algorithm.family() == Algorithm::RANDOM_X) { if (algorithm.family() == Algorithm::RANDOM_X) {
return nullptr; // FIXME return nullptr; // FIXME
} }
return cn_source.c_str(); return cryptonight_cl;
}
void xmrig::OclSource::init()
{
const char *cryptonightCL =
#include "./cn/cryptonight.cl"
;
const char *cryptonightCL2 =
#include "./cn/cryptonight2.cl"
;
const char *blake256CL =
#include "./cn/blake256.cl"
;
const char *groestl256CL =
#include "./cn/groestl256.cl"
;
const char *jhCL =
#include "./cn/jh.cl"
;
const char *wolfAesCL =
#include "./cn/wolf-aes.cl"
;
const char *wolfSkeinCL =
#include "./cn/wolf-skein.cl"
;
const char *fastIntMathV2CL =
#include "./cn/fast_int_math_v2.cl"
;
const char *fastDivHeavyCL =
#include "./cn/fast_div_heavy.cl"
;
const char *cryptonight_gpu =
#include "./cn/cryptonight_gpu.cl"
;
cn_source.append(cryptonightCL);
cn_source.append(cryptonightCL2);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_JH"), jhCL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
cn_source = std::regex_replace(cn_source, std::regex("XMRIG_INCLUDE_CN_GPU"), cryptonight_gpu);
} }

View file

@ -0,0 +1,27 @@
/*
 * Numeric identifiers for the mining algorithms.
 *
 * ALGO_INVALID is -1; every following enumerator takes the next value
 * starting at 0, so ALGO_MAX is one past the last valid identifier.
 * The quoted string in each comment is the algorithm's short name.
 *
 * NOTE(review): presumably these values must stay in sync with the ids the
 * host passes in the ALGO build define - confirm against the host code.
 */
enum Algorithm {
ALGO_INVALID = -1,
ALGO_CN_0, // "cn/0" CryptoNight (original).
ALGO_CN_1, // "cn/1" CryptoNight variant 1 also known as Monero7 and CryptoNightV7.
ALGO_CN_2, // "cn/2" CryptoNight variant 2.
ALGO_CN_R, // "cn/r" CryptoNightR (Monero's variant 4).
ALGO_CN_FAST, // "cn/fast" CryptoNight variant 1 with half iterations.
ALGO_CN_HALF, // "cn/half" CryptoNight variant 2 with half iterations (Masari/Torque).
ALGO_CN_XAO, // "cn/xao" CryptoNight variant 0 (modified, Alloy only).
ALGO_CN_RTO, // "cn/rto" CryptoNight variant 1 (modified, Arto only).
ALGO_CN_RWZ, // "cn/rwz" CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft).
ALGO_CN_ZLS, // "cn/zls" CryptoNight variant 2 with 3/4 iterations (Zelerius).
ALGO_CN_DOUBLE, // "cn/double" CryptoNight variant 2 with double iterations (X-CASH).
ALGO_CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo).
ALGO_CN_LITE_0, // "cn-lite/0" CryptoNight-Lite variant 0.
ALGO_CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1.
ALGO_CN_HEAVY_0, // "cn-heavy/0" CryptoNight-Heavy (4 MB).
ALGO_CN_HEAVY_TUBE, // "cn-heavy/tube" CryptoNight-Heavy (modified, TUBE only).
ALGO_CN_HEAVY_XHV, // "cn-heavy/xhv" CryptoNight-Heavy (modified, Haven Protocol only).
ALGO_CN_PICO_0, // "cn-pico" CryptoNight Turtle (TRTL).
ALGO_RX_0, // "rx/0" RandomX (reference configuration).
ALGO_RX_WOW, // "rx/wow" RandomWOW (Wownero).
ALGO_RX_LOKI, // "rx/loki" RandomXL (Loki).
ALGO_AR2_CHUKWA, // "argon2/chukwa" Argon2id (Chukwa).
ALGO_AR2_WRKZ, // "argon2/wrkz" Argon2id (WRKZ).
ALGO_MAX
};

View file

@ -1,4 +1,3 @@
R"===(
/* /*
* blake256 kernel implementation. * blake256 kernel implementation.
* *
@ -90,4 +89,3 @@ __constant static const sph_u32 c_u256[16] = {
v[b] ^= v[c]; \ v[b] ^= v[c]; \
v[b] = rotate(v[b], 25U); \ v[b] = rotate(v[b], 25U); \
} }
)==="

View file

@ -1,4 +1,3 @@
R"===(
/* /*
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -19,39 +18,16 @@ R"===(
# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable # pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif #endif
//#include "opencl/wolf-aes.cl"
XMRIG_INCLUDE_WOLF_AES
//#include "opencl/wolf-skein.cl"
XMRIG_INCLUDE_WOLF_SKEIN
//#include "opencl/jh.cl"
XMRIG_INCLUDE_JH
//#include "opencl/blake256.cl"
XMRIG_INCLUDE_BLAKE256
//#include "opencl/groestl256.cl"
XMRIG_INCLUDE_GROESTL256
//#include "fast_int_math_v2.cl"
XMRIG_INCLUDE_FAST_INT_MATH_V2
//#include "fast_div_heavy.cl"
XMRIG_INCLUDE_FAST_DIV_HEAVY
#include "algorithm.cl"
#include "wolf-aes.cl"
#include "wolf-skein.cl"
#include "jh.cl"
#include "blake256.cl"
#include "groestl256.cl"
#include "fast_int_math_v2.cl"
#include "fast_div_heavy.cl"
#define VARIANT_0 0 // Original CryptoNight or CryptoNight-Heavy
#define VARIANT_1 1 // CryptoNight variant 1 also known as Monero7 and CryptoNightV7
#define VARIANT_TUBE 2 // Modified CryptoNight Lite variant 1 with XOR (IPBC/TUBE only)
#define VARIANT_XTL 3 // Modified CryptoNight variant 1 (Stellite only)
#define VARIANT_MSR 4 // Modified CryptoNight variant 1 (Masari only)
#define VARIANT_XHV 5 // Modified CryptoNight-Heavy (Haven Protocol only)
#define VARIANT_XAO 6 // Modified CryptoNight variant 0 (Alloy only)
#define VARIANT_RTO 7 // Modified CryptoNight variant 1 (Arto only)
#define VARIANT_2 8 // CryptoNight variant 2
#define VARIANT_HALF 9 // CryptoNight variant 2 with half iterations (Masari/Stellite)
#define VARIANT_TRTL 10 // CryptoNight Turtle (TRTL)
#define VARIANT_GPU 11 // CryptoNight-GPU (Ryo)
#define CRYPTONIGHT 0 /* CryptoNight (2 MB) */
#define CRYPTONIGHT_LITE 1 /* CryptoNight (1 MB) */
#define CRYPTONIGHT_HEAVY 2 /* CryptoNight (4 MB) */
#define CRYPTONIGHT_PICO 3 /* CryptoNight (256 KB) */
#if defined(__NV_CL_C_VERSION) && STRIDED_INDEX != 0 #if defined(__NV_CL_C_VERSION) && STRIDED_INDEX != 0
# undef STRIDED_INDEX # undef STRIDED_INDEX
@ -71,6 +47,7 @@ static const __constant ulong keccakf_rndc[24] =
0x8000000000008080, 0x0000000080000001, 0x8000000080008008 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
}; };
static const __constant uchar sbox[256] = static const __constant uchar sbox[256] =
{ {
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
@ -92,75 +69,27 @@ static const __constant uchar sbox[256] =
}; };
void keccakf1600(ulong *s)
{
for(int i = 0; i < 24; ++i)
{
ulong bc[5], tmp1, tmp2;
bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL);
bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
tmp1 = s[1] ^ bc[0];
s[0] ^= bc[4];
s[1] = rotate(s[6] ^ bc[0], 44UL);
s[6] = rotate(s[9] ^ bc[3], 20UL);
s[9] = rotate(s[22] ^ bc[1], 61UL);
s[22] = rotate(s[14] ^ bc[3], 39UL);
s[14] = rotate(s[20] ^ bc[4], 18UL);
s[20] = rotate(s[2] ^ bc[1], 62UL);
s[2] = rotate(s[12] ^ bc[1], 43UL);
s[12] = rotate(s[13] ^ bc[2], 25UL);
s[13] = rotate(s[19] ^ bc[3], 8UL);
s[19] = rotate(s[23] ^ bc[2], 56UL);
s[23] = rotate(s[15] ^ bc[4], 41UL);
s[15] = rotate(s[4] ^ bc[3], 27UL);
s[4] = rotate(s[24] ^ bc[3], 14UL);
s[24] = rotate(s[21] ^ bc[0], 2UL);
s[21] = rotate(s[8] ^ bc[2], 55UL);
s[8] = rotate(s[16] ^ bc[0], 35UL);
s[16] = rotate(s[5] ^ bc[4], 36UL);
s[5] = rotate(s[3] ^ bc[2], 28UL);
s[3] = rotate(s[18] ^ bc[2], 21UL);
s[18] = rotate(s[17] ^ bc[1], 15UL);
s[17] = rotate(s[11] ^ bc[0], 10UL);
s[11] = rotate(s[7] ^ bc[1], 6UL);
s[7] = rotate(s[10] ^ bc[4], 3UL);
s[10] = rotate(tmp1, 1UL);
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= keccakf_rndc[i];
}
}
static const __constant uint keccakf_rotc[24] = static const __constant uint keccakf_rotc[24] =
{ {
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
}; };
static const __constant uint keccakf_piln[24] = static const __constant uint keccakf_piln[24] =
{ {
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
}; };
void keccakf1600_1(ulong *st) void keccakf1600_1(ulong *st)
{ {
int i, round; int i, round;
ulong t, bc[5]; ulong t, bc[5];
#pragma unroll 1 #pragma unroll 1
for(round = 0; round < 24; ++round) for (round = 0; round < 24; ++round) {
{
// Theta // Theta
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
@ -180,7 +109,7 @@ void keccakf1600_1(ulong *st)
// Rho Pi // Rho Pi
t = st[1]; t = st[1];
#pragma unroll #pragma unroll 1
for (i = 0; i < 24; ++i) { for (i = 0; i < 24; ++i) {
bc[0] = st[keccakf_piln[i]]; bc[0] = st[keccakf_piln[i]];
st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
@ -188,16 +117,18 @@ void keccakf1600_1(ulong *st)
} }
#pragma unroll 1 #pragma unroll 1
for(int i = 0; i < 25; i += 5) for (int i = 0; i < 25; i += 5) {
{
ulong tmp[5]; ulong tmp[5];
#pragma unroll 1 #pragma unroll 1
for(int x = 0; x < 5; ++x) for (int x = 0; x < 5; ++x) {
tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]); tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
}
#pragma unroll 1 #pragma unroll 1
for(int x = 0; x < 5; ++x) st[i + x] = tmp[x]; for (int x = 0; x < 5; ++x) {
st[i + x] = tmp[x];
}
} }
// Iota // Iota
@ -205,8 +136,6 @@ void keccakf1600_1(ulong *st)
} }
} }
)==="
R"===(
void keccakf1600_2(__local ulong *st) void keccakf1600_2(__local ulong *st)
{ {
@ -214,56 +143,54 @@ void keccakf1600_2(__local ulong *st)
ulong t, bc[5]; ulong t, bc[5];
#pragma unroll 1 #pragma unroll 1
for (round = 0; round < 24; ++round) for (round = 0; round < 24; ++round) {
{
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
st[0] ^= bc[4]; st[0] ^= bc[4];
st[5] ^= bc[4]; st[5] ^= bc[4];
st[10] ^= bc[4]; st[10] ^= bc[4];
st[15] ^= bc[4]; st[15] ^= bc[4];
st[20] ^= bc[4]; st[20] ^= bc[4];
st[1] ^= bc[0]; st[1] ^= bc[0];
st[6] ^= bc[0]; st[6] ^= bc[0];
st[11] ^= bc[0]; st[11] ^= bc[0];
st[16] ^= bc[0]; st[16] ^= bc[0];
st[21] ^= bc[0]; st[21] ^= bc[0];
st[2] ^= bc[1]; st[2] ^= bc[1];
st[7] ^= bc[1]; st[7] ^= bc[1];
st[12] ^= bc[1]; st[12] ^= bc[1];
st[17] ^= bc[1]; st[17] ^= bc[1];
st[22] ^= bc[1]; st[22] ^= bc[1];
st[3] ^= bc[2]; st[3] ^= bc[2];
st[8] ^= bc[2]; st[8] ^= bc[2];
st[13] ^= bc[2]; st[13] ^= bc[2];
st[18] ^= bc[2]; st[18] ^= bc[2];
st[23] ^= bc[2]; st[23] ^= bc[2];
st[4] ^= bc[3]; st[4] ^= bc[3];
st[9] ^= bc[3]; st[9] ^= bc[3];
st[14] ^= bc[3]; st[14] ^= bc[3];
st[19] ^= bc[3]; st[19] ^= bc[3];
st[24] ^= bc[3]; st[24] ^= bc[3];
// Rho Pi // Rho Pi
t = st[1]; t = st[1];
#pragma unroll #pragma unroll 1
for (i = 0; i < 24; ++i) { for (i = 0; i < 24; ++i) {
bc[0] = st[keccakf_piln[i]]; bc[0] = st[keccakf_piln[i]];
st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
t = bc[0]; t = bc[0];
} }
#pragma unroll #pragma unroll 1
for(int i = 0; i < 25; i += 5) for (int i = 0; i < 25; i += 5) {
{
ulong tmp1 = st[i], tmp2 = st[i + 1]; ulong tmp1 = st[i], tmp2 = st[i + 1];
st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
@ -278,40 +205,17 @@ void keccakf1600_2(__local ulong *st)
} }
} }
)==="
R"===(
void CNKeccak(ulong *output, ulong *input)
{
ulong st[25];
// Copy 72 bytes
for(int i = 0; i < 9; ++i) st[i] = input[i];
// Last four and '1' bit for padding
//st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
// Last bit of padding
st[16] = 0x8000000000000000UL;
keccakf1600_1(st);
for(int i = 0; i < 25; ++i) output[i] = st[i];
}
static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 }; static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
void AESExpandKey256(uint *keybuf) void AESExpandKey256(uint *keybuf)
{ {
//#pragma unroll 4 //#pragma unroll 4
for(uint c = 8, i = 1; c < 40; ++c) for (uint c = 8, i = 1; c < 40; ++c) {
{
// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
@ -322,8 +226,10 @@ void AESExpandKey256(uint *keybuf)
} }
} }
#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) #define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
#if (STRIDED_INDEX == 0) #if (STRIDED_INDEX == 0)
# define IDX(x) (x) # define IDX(x) (x)
#elif (STRIDED_INDEX == 1) #elif (STRIDED_INDEX == 1)
@ -336,6 +242,7 @@ void AESExpandKey256(uint *keybuf)
# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) # define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
#endif #endif
inline ulong getIdx() inline ulong getIdx()
{ {
# if (STRIDED_INDEX == 0 || STRIDED_INDEX == 1 || STRIDED_INDEX == 2) # if (STRIDED_INDEX == 0 || STRIDED_INDEX == 1 || STRIDED_INDEX == 2)
@ -343,11 +250,13 @@ inline ulong getIdx()
# endif # endif
} }
//#include "opencl/cryptonight_gpu.cl" //#include "opencl/cryptonight_gpu.cl"
XMRIG_INCLUDE_CN_GPU //XMRIG_INCLUDE_CN_GPU
#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
__attribute__((reqd_work_group_size(8, 8, 1))) __attribute__((reqd_work_group_size(8, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads) __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
{ {
@ -388,8 +297,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
# endif # endif
if (get_local_id(1) == 0) if (get_local_id(1) == 0) {
{
__local ulong* State = State_buf + get_local_id(0) * 25; __local ulong* State = State_buf + get_local_id(0) * 25;
((__local ulong8 *)State)[0] = vload8(0, input); ((__local ulong8 *)State)[0] = vload8(0, input);
@ -421,7 +329,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
keccakf1600_2(State); keccakf1600_2(State);
#pragma unroll #pragma unroll 1
for (int i = 0; i < 25; ++i) { for (int i = 0; i < 25; ++i) {
states[i] = State[i]; states[i] = State[i];
} }
@ -495,25 +403,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
Scratchpad[IDX(i + local_id1)] = text; Scratchpad[IDX(i + local_id1)] = text;
} }
} }
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
} }
)==="
R"===(
#define VARIANT1_1(p) \ #define VARIANT1_1(p) \
uint table = 0x75310U; \ uint table = 0x75310U; \
uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \ uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24 (p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_1_XTL(p) \
uint table = 0x75310U; \
uint offset = variant == VARIANT_XTL ? 27 : 26; \
uint index = (((p).s2 >> offset) & 12) | (((p).s2 >> 23) & 2); \
(p).s2 ^= ((table >> index) & 0x30U) << 24
#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0 #define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2_0
#define VARIANT1_INIT() \ #define VARIANT1_INIT() \
tweak1_2 = as_uint2(input[4]); \ tweak1_2 = as_uint2(input[4]); \
tweak1_2.s0 >>= 24; \ tweak1_2.s0 >>= 24; \
@ -521,8 +424,9 @@ R"===(
tweak1_2.s1 = (uint) get_global_id(0); \ tweak1_2.s1 = (uint) get_global_id(0); \
tweak1_2 ^= as_uint2(states[24]) tweak1_2 ^= as_uint2(states[24])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) __kernel void cn1_v1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{ {
ulong a[2], b[2]; ulong a[2], b[2];
__local uint AES0[256], AES1[256]; __local uint AES0[256], AES1[256];
@ -581,7 +485,7 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uin
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); ((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0]; b_x ^= ((uint4 *)c)[0];
VARIANT1_1_XTL(b_x); VARIANT1_1(b_x);
Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x; Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x;
uint4 tmp; uint4 tmp;
@ -591,9 +495,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uin
a[0] += mul_hi(c[0], as_ulong2(tmp).s0); a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
uint2 tweak1_2_0 = tweak1_2; uint2 tweak1_2_0 = tweak1_2;
if (variant == VARIANT_RTO) { # if ALGO == ALGO_CN_RTO
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0]; tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
} # endif
VARIANT1_2(a[1]); VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0]; Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
@ -604,15 +508,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, uin
b_x = ((uint4 *)c)[0]; b_x = ((uint4 *)c)[0];
} }
} }
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
} }
)==="
R"===(
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v2_monero(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) __kernel void cn1_v2(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{ {
# if (ALGO == CRYPTONIGHT || ALGO == CRYPTONIGHT_PICO) # if (ALGO == CRYPTONIGHT || ALGO == CRYPTONIGHT_PICO)
ulong a[2], b[4]; ulong a[2], b[4];
@ -768,356 +670,6 @@ __kernel void cn1_v2_monero(__global uint4 *Scratchpad, __global ulong *states,
# endif # endif
} }
)==="
R"===(
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v2_half(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
# if (ALGO == CRYPTONIGHT)
ulong a[2], b[4];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}
barrier(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if defined(__NV_CL_C_VERSION)
Scratchpad += gIdx * (0x40000 >> 2);
# else
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
# endif
a[0] = states[0] ^ states[4];
a[1] = states[1] ^ states[5];
b[0] = states[2] ^ states[6];
b[1] = states[3] ^ states[7];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
}
ulong2 bx0 = ((ulong2 *)b)[0];
ulong2 bx1 = ((ulong2 *)b)[1];
mem_fence(CLK_LOCAL_MEM_FENCE);
# ifdef __NV_CL_C_VERSION
__local uint16 scratchpad_line_buf[WORKSIZE];
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
# else
# if (STRIDED_INDEX == 0)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
# elif (STRIDED_INDEX == 1)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
# elif (STRIDED_INDEX == 2)
# define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
# endif
# endif
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
uint2 division_result = as_uint2(states[12]);
uint sqrt_result = as_uint2(states[13]).s0;
#pragma unroll CN_UNROLL
for(int i = 0; i < 0x40000; ++i)
{
# ifdef __NV_CL_C_VERSION
uint idx = a[0] & 0x1FFFC0;
uint idx1 = a[0] & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
uint idx = a[0] & MASK;
# endif
uint4 c = SCRATCHPAD_CHUNK(0);
c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
idx = as_ulong2(c).s0 & 0x1FFFC0;
idx1 = as_ulong2(c).s0 & 0x30;
*scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx);
# else
idx = as_ulong2(c).s0 & MASK;
# endif
uint4 tmp = SCRATCHPAD_CHUNK(0);
{
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
}
ulong2 t;
t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
{
const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
t ^= chunk2;
const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
a[1] += t.s1;
a[0] += t.s0;
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
# ifdef __NV_CL_C_VERSION
*(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line;
# endif
((uint4 *)a)[0] ^= tmp;
bx1 = bx0;
bx0 = as_ulong2(c);
}
# undef SCRATCHPAD_CHUNK
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
# endif
}
)==="
R"===(
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_msr(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
# if (ALGO == CRYPTONIGHT)
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
barrier(CLK_LOCAL_MEM_FENCE);
uint2 tweak1_2;
uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
# if (ALGO == CRYPTONIGHT_HEAVY)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
# else
Scratchpad += gIdx;
# endif
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
#pragma unroll 8
for (int i = 0; i < 0x40000; ++i) {
ulong c[2];
((uint4 *)c)[0] = Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x);
Scratchpad[IDX((as_uint2(a[0]).s0 & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
uint2 tweak1_2_0 = tweak1_2;
VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
b_x = ((uint4 *)c)[0];
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
# endif
}
)==="
R"===(
// cn1_tube: main scratchpad read/modify/write loop that uses AES_Round_bittube2 and a
// fast_div_heavy step per iteration. The whole body is compiled only when
// ALGO == CRYPTONIGHT_HEAVY; for any other ALGO the kernel is empty.
//
// Scratchpad - scratchpad memory, addressed in uint4 (16-byte) slots through IDX().
// states     - 25 ulong words per work-item; words 0..7 are folded into a[] and b[].
// variant    - not referenced by name in this body.
// input      - not referenced by name in this body
//              (NOTE(review): presumably read by VARIANT1_INIT(), which is defined elsewhere - confirm).
// Threads    - upper bound for gIdx; only checked when COMP_MODE == 1.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_tube(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
# if (ALGO == CRYPTONIGHT_HEAVY)
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
// Fill the two local lookup tables cooperatively: each work-item handles entries
// local_id, local_id + WORKSIZE, ... AES1 holds the AES0_C entry rotated left by 8 bits.
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
// Every work-item of the group must reach this point before the tables are read below.
barrier(CLK_LOCAL_MEM_FENCE);
// tweak1_2 is not assigned by name in this body
// (NOTE(review): presumably initialised by VARIANT1_INIT() - confirm against the macro).
uint2 tweak1_2;
uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
// Move both pointers to this work-item's data; the scratchpad offset depends on STRIDED_INDEX.
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
// This inner test repeats the enclosing "# if (ALGO == CRYPTONIGHT_HEAVY)", so inside
// this kernel the "# else" branch below is never compiled.
# if (ALGO == CRYPTONIGHT_HEAVY)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
# else
Scratchpad += gIdx;
# endif
# elif (STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
// Initial a and b: XOR of state words 0..3 with words 4..7, interleaved as shown.
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
// b_x is the 16 bytes of b viewed as a uint4.
b_x = ((uint4 *)b)[0];
VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
// Scratchpad index for the first access: a[0] truncated to 32 bits.
uint idx0 = a[0];
#pragma unroll CN_UNROLL
for (int i = 0; i < ITERATIONS; ++i) {
ulong c[2];
// Step 1: load the slot selected by idx0, run one AES round on it with a as the
// second operand, fold the result into b_x, apply VARIANT1_1 and store b_x back
// into the same slot.
((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0];
VARIANT1_1(b_x);
Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
// Step 2: load the slot selected by the first 32-bit component of c[0] and
// accumulate the 128-bit product c[0] * tmp.lo into a:
// the low 64 bits go to a[1], the high 64 bits (mul_hi) go to a[0].
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
// Per-iteration tweak: tweak1_2 XORed with the first uint2 of a[0].
uint2 tweak1_2_0 = tweak1_2;
tweak1_2_0 ^= ((uint2 *)&(a[0]))[0];
// VARIANT1_2 is invoked once before and once after the store of a.
// NOTE(review): presumably it XORs tweak1_2_0 into a[1] so that the second call
// undoes the first - the macro body is not visible here, confirm.
VARIANT1_2(a[1]);
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
VARIANT1_2(a[1]);
((uint4 *)a)[0] ^= tmp;
idx0 = a[0];
// The AES output of this iteration becomes b_x for the next one.
b_x = ((uint4 *)c)[0];
{
// Step 3: division step on the slot selected by the updated idx0.
// n is the first 8 bytes of the slot read as a signed 64-bit value,
// d is the third 32-bit word of the same slot (bytes 8..11).
long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
// d | 0x5 always has bits 0 and 2 set, so the divisor is never zero.
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
// Index for the next iteration; d ^ q is truncated to 32 bits on assignment.
idx0 = d ^ q;
}
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
# endif
}
)==="
R"===(
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
@ -1211,91 +763,6 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint varia
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
} }
)==="
R"===(
// cn1_xao: main scratchpad read/modify/write loop with a fixed count of 0x100000
// iterations and no VARIANT1_* tweak steps. The whole body is compiled only when
// ALGO == CRYPTONIGHT; for any other ALGO the kernel is empty.
//
// Scratchpad - scratchpad memory, addressed in uint4 (16-byte) slots through IDX().
// states     - 25 ulong words per work-item; words 0..7 are folded into a[] and b[].
// variant    - not referenced by name in this body.
// input      - not referenced by name in this body.
// Threads    - upper bound for gIdx; only checked when COMP_MODE == 1.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_xao(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
# if (ALGO == CRYPTONIGHT)
ulong a[2], b[2];
__local uint AES0[256], AES1[256];
const ulong gIdx = getIdx();
// Fill the two local lookup tables cooperatively: each work-item handles entries
// local_id, local_id + WORKSIZE, ... AES1 holds the AES0_C entry rotated left by 8 bits.
for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}
// Every work-item of the group must reach this point before the tables are read below.
barrier(CLK_LOCAL_MEM_FENCE);
uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
// Move both pointers to this work-item's data; the scratchpad offset depends on STRIDED_INDEX.
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
// NOTE(review): the enclosing guard already requires ALGO == CRYPTONIGHT, so this
// CRYPTONIGHT_HEAVY branch looks unreachable here and only "Scratchpad += gIdx" is
// compiled - assumes the two constants have different values, confirm.
# if (ALGO == CRYPTONIGHT_HEAVY)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + get_local_id(0);
# else
Scratchpad += gIdx;
# endif
# elif(STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif
// Initial a and b: XOR of state words 0..3 with words 4..7, interleaved as shown.
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
// b_x is the 16 bytes of b viewed as a uint4.
b_x = ((uint4 *)b)[0];
}
mem_fence(CLK_LOCAL_MEM_FENCE);
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
// Scratchpad index for the first access: a[0] truncated to 32 bits.
uint idx0 = a[0];
// Iteration count and unroll factor are literals here rather than ITERATIONS / CN_UNROLL.
#pragma unroll 8
for (int i = 0; i < 0x100000; ++i) {
ulong c[2];
// Step 1: load the slot selected by idx0, run one AES round on it with a as the
// second operand, and overwrite the same slot with b_x XOR the round output.
((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];
// Step 2: load the slot selected by the first 32-bit component of c[0] and
// accumulate the 128-bit product c[0] * tmp.lo into a:
// the low 64 bits go to a[1], the high 64 bits (mul_hi) go to a[0].
uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
// Store the updated a into that slot, then mix the previously loaded value into a.
Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
((uint4 *)a)[0] ^= tmp;
// Next iteration: index from the new a[0] (truncated to 32 bits), b_x from this round's AES output.
idx0 = a[0];
b_x = ((uint4 *)c)[0];
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
# endif
}
)==="
R"===(
__attribute__((reqd_work_group_size(8, 8, 1))) __attribute__((reqd_work_group_size(8, 8, 1)))
__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
@ -1463,15 +930,15 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
} }
)==="
R"===(
#define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ #define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \
| (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \
| (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL))
#define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U)) #define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U))
__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
{ {
const uint idx = get_global_id(0) - get_global_offset(0); const uint idx = get_global_id(0) - get_global_offset(0);
@ -1529,7 +996,9 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
} }
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
#define JHXOR \ #define JHXOR \
h0h ^= input[0]; \ h0h ^= input[0]; \
@ -1552,6 +1021,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
h7h ^= input[6]; \ h7h ^= input[6]; \
h7l ^= input[7] h7l ^= input[7]
__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
{ {
const uint idx = get_global_id(0) - get_global_offset(0); const uint idx = get_global_id(0) - get_global_offset(0);
@ -1597,8 +1067,10 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
} }
} }
#define SWAP4(x) as_uint(as_uchar4(x).s3210) #define SWAP4(x) as_uint(as_uchar4(x).s3210)
__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
{ {
const uint idx = get_global_id(0) - get_global_offset(0); const uint idx = get_global_id(0) - get_global_offset(0);
@ -1697,6 +1169,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
} }
} }
#undef SWAP4 #undef SWAP4
@ -1796,5 +1269,3 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
} }
} }
} }
)==="

File diff suppressed because it is too large Load diff

View file

@ -1,4 +1,3 @@
R"===(
#ifndef FAST_DIV_HEAVY_CL #ifndef FAST_DIV_HEAVY_CL
#define FAST_DIV_HEAVY_CL #define FAST_DIV_HEAVY_CL
@ -26,4 +25,3 @@ inline long fast_div_heavy(long _a, int _b)
} }
#endif #endif
)==="

View file

@ -1,4 +1,3 @@
R"===(
/* /*
* @author SChernykh * @author SChernykh
*/ */
@ -53,5 +52,3 @@ inline uint fast_sqrt_v2(const ulong n1)
return result; return result;
} }
)==="

View file

@ -1,4 +1,3 @@
R"===(
/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ /* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */
/* /*
* Groestl256 * Groestl256
@ -124,9 +123,6 @@ static const __constant ulong T0_G[] =
0x7bcbf646cb463d7bUL, 0xa8fc4b1ffc1fb7a8UL, 0x6dd6da61d6610c6dUL, 0x2c3a584e3a4e622cUL 0x7bcbf646cb463d7bUL, 0xa8fc4b1ffc1fb7a8UL, 0x6dd6da61d6610c6dUL, 0x2c3a584e3a4e622cUL
}; };
)==="
R"===(
static const __constant ulong T4_G[] = static const __constant ulong T4_G[] =
{ {
0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL, 0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL,
@ -291,5 +287,3 @@ static const __constant ulong T4_G[] =
ROUND_SMALL_Q(a, r); \ ROUND_SMALL_Q(a, r); \
} while (0) } while (0)
)==="

View file

@ -1,4 +1,3 @@
R"===(
/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ /* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
/* /*
* JH implementation. * JH implementation.
@ -270,5 +269,3 @@ static const __constant ulong C[] =
} while (0) } while (0)
#endif #endif
)==="

View file

@ -1,4 +1,3 @@
R"===(
#ifndef WOLF_AES_CL #ifndef WOLF_AES_CL
#define WOLF_AES_CL #define WOLF_AES_CL
@ -149,5 +148,3 @@ uint4 AES_Round_Two_Tables(const __local uint *AES0, const __local uint *AES1, c
} }
#endif #endif
)==="

View file

@ -1,4 +1,3 @@
R"===(
#ifndef WOLF_SKEIN_CL #ifndef WOLF_SKEIN_CL
#define WOLF_SKEIN_CL #define WOLF_SKEIN_CL
@ -137,5 +136,3 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
} }
#endif #endif
)==="

View file

@ -52,30 +52,18 @@ public:
CN_RWZ, // "cn/rwz" CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft). CN_RWZ, // "cn/rwz" CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft).
CN_ZLS, // "cn/zls" CryptoNight variant 2 with 3/4 iterations (Zelerius). CN_ZLS, // "cn/zls" CryptoNight variant 2 with 3/4 iterations (Zelerius).
CN_DOUBLE, // "cn/double" CryptoNight variant 2 with double iterations (X-CASH). CN_DOUBLE, // "cn/double" CryptoNight variant 2 with double iterations (X-CASH).
# ifdef XMRIG_ALGO_CN_GPU
CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo). CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo).
# endif
# ifdef XMRIG_ALGO_CN_LITE
CN_LITE_0, // "cn-lite/0" CryptoNight-Lite variant 0. CN_LITE_0, // "cn-lite/0" CryptoNight-Lite variant 0.
CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1. CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1.
# endif
# ifdef XMRIG_ALGO_CN_HEAVY
CN_HEAVY_0, // "cn-heavy/0" CryptoNight-Heavy (4 MB). CN_HEAVY_0, // "cn-heavy/0" CryptoNight-Heavy (4 MB).
CN_HEAVY_TUBE, // "cn-heavy/tube" CryptoNight-Heavy (modified, TUBE only). CN_HEAVY_TUBE, // "cn-heavy/tube" CryptoNight-Heavy (modified, TUBE only).
CN_HEAVY_XHV, // "cn-heavy/xhv" CryptoNight-Heavy (modified, Haven Protocol only). CN_HEAVY_XHV, // "cn-heavy/xhv" CryptoNight-Heavy (modified, Haven Protocol only).
# endif
# ifdef XMRIG_ALGO_CN_PICO
CN_PICO_0, // "cn-pico" CryptoNight Turtle (TRTL) CN_PICO_0, // "cn-pico" CryptoNight Turtle (TRTL)
# endif
# ifdef XMRIG_ALGO_RANDOMX
RX_0, // "rx/0" RandomX (reference configuration). RX_0, // "rx/0" RandomX (reference configuration).
RX_WOW, // "rx/wow" RandomWOW (Wownero). RX_WOW, // "rx/wow" RandomWOW (Wownero).
RX_LOKI, // "rx/loki" RandomXL (Loki). RX_LOKI, // "rx/loki" RandomXL (Loki).
# endif AR2_CHUKWA, // "argon2/chukwa" Argon2id (Chukwa).
# ifdef XMRIG_ALGO_ARGON2 AR2_WRKZ, // "argon2/wrkz" Argon2id (WRKZ)
AR2_CHUKWA, // "argon2/chukwa"
AR2_WRKZ, // "argon2/wrkz"
# endif
MAX MAX
}; };