diff --git a/cmake/cn-gpu.cmake b/cmake/cn-gpu.cmake
index 0fc048f56..804f657b6 100644
--- a/cmake/cn-gpu.cmake
+++ b/cmake/cn-gpu.cmake
@@ -1,13 +1,22 @@
 if (WITH_CN_GPU AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-    set(CN_GPU_SOURCES src/crypto/cn_gpu_avx.cpp src/crypto/cn_gpu_ssse3.cpp)
-    if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
-        set_source_files_properties(src/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-O3 -mavx2")
-        set_source_files_properties(src/crypto/cn_gpu_ssse3.cpp PROPERTIES COMPILE_FLAGS "-O3")
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
-        set_source_files_properties(src/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
-        set_source_files_properties(src/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
+    if (XMRIG_ARM)
+        set(CN_GPU_SOURCES src/crypto/cn_gpu_arm.cpp)
+
+        if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
+            set_source_files_properties(src/crypto/cn_gpu_arm.cpp PROPERTIES COMPILE_FLAGS "-O3")
+        endif()
+    else()
+        set(CN_GPU_SOURCES src/crypto/cn_gpu_avx.cpp src/crypto/cn_gpu_ssse3.cpp)
+
+        if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
+            set_source_files_properties(src/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-O3 -mavx2")
+            set_source_files_properties(src/crypto/cn_gpu_ssse3.cpp PROPERTIES COMPILE_FLAGS "-O3")
+        elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
+            set_source_files_properties(src/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
+        elseif (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+            set_source_files_properties(src/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
+        endif()
     endif()
 else()
     set(CN_GPU_SOURCES "")
diff --git a/src/common/cpu/BasicCpuInfo.cpp b/src/common/cpu/BasicCpuInfo.cpp
index 0e6ff94aa..e9018c98a 100644
--- a/src/common/cpu/BasicCpuInfo.cpp
+++ b/src/common/cpu/BasicCpuInfo.cpp
@@ -109,9 +109,9 @@ static inline bool has_avx2()
 
 xmrig::BasicCpuInfo::BasicCpuInfo() :
     m_assembly(ASM_NONE),
-    m_brand(),
     m_aes(has_aes_ni()),
     m_avx2(has_avx2()),
+    m_brand(),
     m_threads(std::thread::hardware_concurrency())
 {
     cpu_brand_string(m_brand);
diff --git a/src/common/cpu/BasicCpuInfo.h b/src/common/cpu/BasicCpuInfo.h
index 2e0b3d0de..95857ed27 100644
--- a/src/common/cpu/BasicCpuInfo.h
+++ b/src/common/cpu/BasicCpuInfo.h
@@ -60,9 +60,9 @@ protected:
 
 private:
     Assembly m_assembly;
+    bool m_aes;
+    bool m_avx2;
     char m_brand[64];
-    const bool m_aes;
-    const bool m_avx2;
     int32_t m_threads;
 };
 
diff --git a/src/common/cpu/BasicCpuInfo_arm.cpp b/src/common/cpu/BasicCpuInfo_arm.cpp
index c1c127db6..e1df86df4 100644
--- a/src/common/cpu/BasicCpuInfo_arm.cpp
+++ b/src/common/cpu/BasicCpuInfo_arm.cpp
@@ -4,8 +4,9 @@
  * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
  * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
  * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017-2018 XMR-Stak    <support@xmr-stak.com>, <info@fireice-uk.com>
- * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2017-2019 XMR-Stak    <support@xmr-stak.com>, <info@fireice-uk.com>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
  *
  * This program is free software: you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
@@ -30,6 +31,7 @@
 
 xmrig::BasicCpuInfo::BasicCpuInfo() :
     m_aes(false),
+    m_avx2(false),
    m_brand(),
     m_threads(std::thread::hardware_concurrency())
 {
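The BasicCpuInfo changes above reorder the initializer list and the member declarations together: C++ initializes non-static members in declaration order regardless of the order written in the constructor's initializer list, and GCC/Clang warn (-Wreorder) when the two disagree. The const qualifiers on m_aes and m_avx2 are presumably dropped so the ARM constructor can assign them after runtime feature probing. A minimal standalone sketch of the declaration-order rule (illustrative names, not project code):

#include <cstring>

struct CpuInfo {
    bool m_aes;        // declared first, so always initialized first
    char m_brand[64];  // initialized second, whatever the list says

    // Writing ": m_brand(), m_aes(true)" here would trigger -Wreorder,
    // because m_aes is still initialized before m_brand.
    CpuInfo() : m_aes(true), m_brand()
    {
        strncpy(m_brand, "example", sizeof(m_brand) - 1);
    }
};

int main()
{
    CpuInfo info;
    return info.m_aes ? 0 : 1;
}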
diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index b8c5092f1..456b5d744 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -5,9 +5,9 @@
  * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
  * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
  * Copyright 2016      Imran Yusuff <https://github.com/imranyusuff>
- * Copyright 2017-2018 XMR-Stak    <support@xmr-stak.com>, <info@fireice-uk.com>
+ * Copyright 2017-2019 XMR-Stak    <support@xmr-stak.com>, <info@fireice-uk.com>
  * Copyright 2018      Lee Clagett <code@leeclagett.com>
- * Copyright 2018      SChernykh   <https://github.com/SChernykh>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
  * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
  *
  * This program is free software: you can redistribute it and/or modify
@@ -284,6 +284,34 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
 }
 
 
+#ifndef XMRIG_NO_CN_GPU
+template<size_t MEM>
+void cn_explode_scratchpad_gpu(const uint8_t *input, uint8_t *output)
+{
+    constexpr size_t hash_size = 200; // 25x8 bytes
+    alignas(16) uint64_t hash[25];
+
+    for (uint64_t i = 0; i < MEM / 512; i++)
+    {
+        memcpy(hash, input, hash_size);
+        hash[0] ^= i;
+
+        xmrig::keccakf(hash, 24);
+        memcpy(output, hash, 160);
+        output += 160;
+
+        xmrig::keccakf(hash, 24);
+        memcpy(output, hash, 176);
+        output += 176;
+
+        xmrig::keccakf(hash, 24);
+        memcpy(output, hash, 176);
+        output += 176;
+    }
+}
+#endif
+
+
 template<xmrig::Algo ALGO, size_t MEM, bool SOFT_AES>
 static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
 {
@@ -541,6 +569,33 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
 }
 
 
+#ifndef XMRIG_NO_CN_GPU
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad);
+
+
+template<xmrig::Algo ALGO, xmrig::Variant VARIANT>
+inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
+{
+    constexpr size_t MASK       = xmrig::CRYPTONIGHT_GPU_MASK;
+    constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
+    constexpr size_t MEM        = xmrig::cn_select_memory<ALGO>();
+
+    static_assert(MASK > 0 && ITERATIONS > 0 && MEM > 0, "unsupported algorithm/variant");
+
+    xmrig::keccak(input, size, ctx[0]->state);
+    cn_explode_scratchpad_gpu<MEM>(ctx[0]->state, ctx[0]->memory);
+
+    cn_gpu_inner_arm<ITERATIONS, MASK>(ctx[0]->state, ctx[0]->memory);
+
+    cn_implode_scratchpad<xmrig::CRYPTONIGHT_HEAVY, MEM, false>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    xmrig::keccakf((uint64_t*) ctx[0]->state, 24);
+    memcpy(output, ctx[0]->state, 32);
+}
+#endif
+
+
 template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
 inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx)
 {
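cn_explode_scratchpad_gpu above expands the 200-byte Keccak state into the scratchpad in 512-byte blocks: the block index is XORed into the first state word, then three keccakf permutations emit 160 + 176 + 176 = 512 bytes. A minimal sketch of that byte accounting, with a hypothetical dummy_permute standing in for xmrig::keccakf(hash, 24):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in permutation; any deterministic mixing of the 25 x 8-byte state
// is enough to check the output layout.
static void dummy_permute(uint64_t state[25])
{
    for (int i = 0; i < 25; ++i) {
        state[i] = state[i] * 6364136223846793005ULL + 1442695040888963407ULL;
    }
}

// Same structure as cn_explode_scratchpad_gpu: three permutations per block.
static void explode_sketch(const uint8_t *input, uint8_t *output, size_t mem)
{
    alignas(16) uint64_t hash[25];

    for (uint64_t i = 0; i < mem / 512; ++i) {
        memcpy(hash, input, 200); // 25x8 bytes
        hash[0] ^= i;             // diversify each 512-byte block

        dummy_permute(hash); memcpy(output, hash, 160); output += 160;
        dummy_permute(hash); memcpy(output, hash, 176); output += 176;
        dummy_permute(hash); memcpy(output, hash, 176); output += 176;
        // 160 + 176 + 176 == 512 bytes written per iteration
    }
}

int main()
{
    uint8_t state[200] = {1};
    std::vector<uint8_t> scratchpad(2 * 1024 * 1024); // cn/gpu uses a 2 MiB pad
    assert(scratchpad.size() % 512 == 0);
    explode_sketch(state, scratchpad.data(), scratchpad.size());
    return 0;
}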
diff --git a/src/crypto/cn_gpu_arm.cpp b/src/crypto/cn_gpu_arm.cpp
new file mode 100644
index 000000000..b463dd2ec
--- /dev/null
+++ b/src/crypto/cn_gpu_arm.cpp
@@ -0,0 +1,240 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <support@xmr-stak.com>, <info@fireice-uk.com>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <arm_neon.h>
+
+
+#include "crypto/CryptoNight_constants.h"
+
+
+inline void vandq_f32(float32x4_t &v, uint32_t v2)
+{
+    uint32x4_t vc = vdupq_n_u32(v2);
+    v = (float32x4_t)vandq_u32((uint32x4_t)v, vc);
+}
+
+
+inline void vorq_f32(float32x4_t &v, uint32_t v2)
+{
+    uint32x4_t vc = vdupq_n_u32(v2);
+    v = (float32x4_t)vorrq_u32((uint32x4_t)v, vc);
+}
+
+
+template <size_t v>
+inline void vrot_si32(int32x4_t &r)
+{
+    r = (int32x4_t)vextq_s8((int8x16_t)r, (int8x16_t)r, v);
+}
+
+template <>
+inline void vrot_si32<0>(int32x4_t &r)
+{
+}
+
+
+inline uint32_t vheor_s32(const int32x4_t &v)
+{
+    int32x4_t v0 = veorq_s32(v, vrev64q_s32(v));
+    int32x2_t vf = veor_s32(vget_high_s32(v0), vget_low_s32(v0));
+    return (uint32_t)vget_lane_s32(vf, 0);
+}
+
+
+inline void prep_dv(int32_t *idx, int32x4_t &v, float32x4_t &n)
+{
+    v = vld1q_s32(idx);
+    n = vcvtq_f32_s32(v);
+}
+
+
+inline void sub_round(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, const float32x4_t &rnd_c, float32x4_t &n, float32x4_t &d, float32x4_t &c)
+{
+    float32x4_t ln1 = vaddq_f32(n1, c);
+    float32x4_t nn = vmulq_f32(n0, c);
+    nn = vmulq_f32(ln1, vmulq_f32(nn, nn));
+    vandq_f32(nn, 0xFEFFFFFF);
+    vorq_f32(nn, 0x00800000);
+    n = vaddq_f32(n, nn);
+
+    float32x4_t ln3 = vsubq_f32(n3, c);
+    float32x4_t dd = vmulq_f32(n2, c);
+    dd = vmulq_f32(ln3, vmulq_f32(dd, dd));
+    vandq_f32(dd, 0xFEFFFFFF);
+    vorq_f32(dd, 0x00800000);
+    d = vaddq_f32(d, dd);
+
+    // Constant feedback
+    c = vaddq_f32(c, rnd_c);
+    c = vaddq_f32(c, vdupq_n_f32(0.734375f));
+    float32x4_t r = vaddq_f32(nn, dd);
+    vandq_f32(r, 0x807FFFFF);
+    vorq_f32(r, 0x40000000);
+    c = vaddq_f32(c, r);
+}
+
+
+inline void round_compute(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, const float32x4_t &rnd_c, float32x4_t &c, float32x4_t &r)
+{
+    float32x4_t n = vdupq_n_f32(0.0f), d = vdupq_n_f32(0.0f);
+
+    sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
+    sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+    sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+    sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+    sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
+    sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+    sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+    sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+    // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+    vandq_f32(d, 0xFF7FFFFF);
+    vorq_f32(d, 0x40000000);
+    r = vaddq_f32(r, vdivq_f32(n, d));
+}
+
+
+// 112×4 = 448
+template <bool add>
+inline int32x4_t single_compute(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, float cnt, const float32x4_t &rnd_c, float32x4_t &sum)
+{
+    float32x4_t c = vdupq_n_f32(cnt);
+    float32x4_t r = vdupq_n_f32(0.0f);
+
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+    // do a quick fmod by setting exp to 2
+    vandq_f32(r, 0x807FFFFF);
+    vorq_f32(r, 0x40000000);
+
+    if (add) {
+        sum = vaddq_f32(sum, r);
+    } else {
+        sum = r;
+    }
+
+    const float32x4_t cc2 = vdupq_n_f32(536870880.0f);
+    r = vmulq_f32(r, cc2); // 35
+    return vcvtq_s32_f32(r);
+}
+
+
+template <size_t rot>
+inline void single_compute_wrap(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, float cnt, const float32x4_t &rnd_c, float32x4_t &sum, int32x4_t &out)
+{
+    int32x4_t r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
+    vrot_si32<rot>(r);
+    out = veorq_s32(out, r);
+}
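The vandq_f32/vorq_f32 pairs above clamp values by rewriting IEEE-754 bit fields directly: AND with 0x807FFFFF keeps only the sign and mantissa, and OR with 0x40000000 forces the exponent to 128, so the magnitude always lands in [2.0, 4.0), which is the "quick fmod" mentioned in the comments. A scalar sketch of the same trick in plain C++ (no NEON required):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Pin a float's magnitude into [2.0, 4.0) by overwriting its exponent bits,
// mirroring vandq_f32(r, 0x807FFFFF) + vorq_f32(r, 0x40000000) per lane.
static float clamp_exp_to_2(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits)); // type-pun via memcpy, no UB
    bits &= 0x807FFFFFu;             // keep sign + mantissa, clear exponent
    bits |= 0x40000000u;             // exponent = 128 -> 1.m * 2^1 in [2,4)
    memcpy(&x, &bits, sizeof(x));
    return x;
}

int main()
{
    const float samples[] = { 0.1f, 1.5f, 123456.0f, -3.75e8f };
    for (float v : samples) {
        printf("%14g -> %g\n", v, clamp_exp_to_2(v)); // |result| in [2,4)
    }
    return 0;
}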
+
+
+template <uint32_t MASK>
+inline int32_t *scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<int32_t*>(lpad + (idx & MASK) + n * 16); }
+
+
+template <size_t ITER, uint32_t MASK>
+void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad)
+{
+    uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
+    int32_t *idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
+    int32_t *idx1 = scratchpad_ptr<MASK>(lpad, s, 1);
+    int32_t *idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
+    int32_t *idx3 = scratchpad_ptr<MASK>(lpad, s, 3);
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+
+    for (size_t i = 0; i < ITER; i++) {
+        float32x4_t n0, n1, n2, n3;
+        int32x4_t v0, v1, v2, v3;
+        float32x4_t suma, sumb, sum1, sum2, sum3;
+
+        prep_dv(idx0, v0, n0);
+        prep_dv(idx1, v1, n1);
+        prep_dv(idx2, v2, n2);
+        prep_dv(idx3, v3, n3);
+        float32x4_t rc = sum0;
+
+        int32x4_t out, out2;
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);
+        single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);
+        single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
+        single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
+        sum0 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx0, veorq_s32(v0, out));
+        out2 = out;
+
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
+        single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
+        single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
+        single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
+        sum1 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx1, veorq_s32(v1, out));
+        out2 = veorq_s32(out2, out);
+
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
+        single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
+        single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
+        single_compute_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
+        sum2 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx2, veorq_s32(v2, out));
+        out2 = veorq_s32(out2, out);
+
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
+        single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
+        single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
+        single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
+        sum3 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx3, veorq_s32(v3, out));
+        out2 = veorq_s32(out2, out);
+
+        sum0 = vaddq_f32(sum0, sum1);
+        sum2 = vaddq_f32(sum2, sum3);
+        sum0 = vaddq_f32(sum0, sum2);
+
+        const float32x4_t cc1 = vdupq_n_f32(16777216.0f);
+        const float32x4_t cc2 = vdupq_n_f32(64.0f);
+        vandq_f32(sum0, 0x7fffffff); // take abs(va) by masking the float sign bit
+        // vs range 0 - 64
+        n0 = vmulq_f32(sum0, cc1);
+        v0 = vcvtq_s32_f32(n0);
+        v0 = veorq_s32(v0, out2);
+        uint32_t n = vheor_s32(v0);
+
+        // vs is now between 0 and 1
+        sum0 = vdivq_f32(sum0, cc2);
+        idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+        idx1 = scratchpad_ptr<MASK>(lpad, n, 1);
+        idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+        idx3 = scratchpad_ptr<MASK>(lpad, n, 3);
+    }
+}
+
+template void cn_gpu_inner_arm<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
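scratchpad_ptr above maps a hash-derived 32-bit index onto the scratchpad with idx & MASK and selects one of four 16-byte lanes within the resulting 64-byte line; each iteration, vheor_s32 folds the vector state into the next index. A standalone sketch of that addressing scheme; the mask value 0x1FFFC0 is an assumption standing in for xmrig::CRYPTONIGHT_GPU_MASK and should be checked against CryptoNight_constants.h:

#include <cstdint>
#include <cstdio>
#include <vector>

// Assumed cn/gpu mask: wraps into a 2 MiB pad and zeroes the low 6 bits,
// so every access starts on a 64-byte line.
constexpr uint32_t kMask = 0x1FFFC0;

// Same addressing as scratchpad_ptr<MASK> in the diff.
static int32_t *scratchpad_ptr(uint8_t *lpad, uint32_t idx, size_t n)
{
    return reinterpret_cast<int32_t *>(lpad + (idx & kMask) + n * 16);
}

int main()
{
    std::vector<uint8_t> lpad(2 * 1024 * 1024);
    const uint32_t idx = 0xDEADBEEF; // arbitrary hash-derived index

    for (size_t n = 0; n < 4; ++n) {
        uint8_t *p = reinterpret_cast<uint8_t *>(scratchpad_ptr(lpad.data(), idx, n));
        printf("lane %zu -> offset %zu\n", n, static_cast<size_t>(p - lpad.data()));
    }
    return 0;
}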