From ad94e9a7d273df6818e1817b4f04a149b6019e5f Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 19 Apr 2018 11:54:11 +0700 Subject: [PATCH] Simplify ARM implementation. --- src/Cpu_arm.cpp | 20 ++-- src/Summary.cpp | 33 ++++--- src/crypto/CryptoNight.h | 9 -- src/crypto/CryptoNight_arm.h | 172 +++++++++++------------------------ 4 files changed, 86 insertions(+), 148 deletions(-) diff --git a/src/Cpu_arm.cpp b/src/Cpu_arm.cpp index 1b3067894..59ff84217 100644 --- a/src/Cpu_arm.cpp +++ b/src/Cpu_arm.cpp @@ -4,8 +4,8 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,16 +28,16 @@ #include "Cpu.h" -char Cpu::m_brand[64] = { 0 }; -int Cpu::m_flags = 0; -int Cpu::m_l2_cache = 0; -int Cpu::m_l3_cache = 0; -int Cpu::m_sockets = 1; -int Cpu::m_totalCores = 0; -int Cpu::m_totalThreads = 0; +char Cpu::m_brand[64] = { 0 }; +int Cpu::m_flags = 0; +int Cpu::m_l2_cache = 0; +int Cpu::m_l3_cache = 0; +int Cpu::m_sockets = 1; +int Cpu::m_totalCores = 0; +size_t Cpu::m_totalThreads = 0; -int Cpu::optimalThreadsCount(xmrig::Algo algo, bool doubleHash, int maxCpuUsage) +size_t Cpu::optimalThreadsCount(size_t size, int maxCpuUsage) { return m_totalThreads; } diff --git a/src/Summary.cpp b/src/Summary.cpp index a54214e46..32a0e4bb8 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -93,21 +93,30 @@ static void print_cpu(xmrig::Config *config) static void print_threads(xmrig::Config *config) { - char buf[32]; - if (config->affinity() != -1L) { - snprintf(buf, 32, ", affinity=0x%" PRIX64, config->affinity()); + if (config->threadsMode() != xmrig::Config::Advanced) { + char buf[32]; + if (config->affinity() != -1L) { + snprintf(buf, 32, ", affinity=0x%" PRIX64, config->affinity()); + } + else { + buf[0] = '\0'; + } + + Log::i()->text(config->isColors() ? "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, av=%d, %sdonate=%d%%%s" : " * THREADS: %d, %s, av=%d, %sdonate=%d%%%s", + config->threadsCount(), + config->algoName(), + config->algoVariant(), + config->isColors() && config->donateLevel() == 0 ? "\x1B[01;31m" : "", + config->donateLevel(), + buf); } else { - buf[0] = '\0'; + Log::i()->text(config->isColors() ? "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, %sdonate=%d%%" : " * THREADS: %d, %s, %sdonate=%d%%", + config->threadsCount(), + config->algoName(), + config->isColors() && config->donateLevel() == 0 ? "\x1B[01;31m" : "", + config->donateLevel()); } - - Log::i()->text(config->isColors() ? "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, av=%d, %sdonate=%d%%%s" : " * THREADS: %d, %s, av=%d, %sdonate=%d%%%s", - config->threadsCount(), - config->algoName(), - config->algoVariant(), - config->isColors() && config->donateLevel() == 0 ? "\x1B[01;31m" : "", - config->donateLevel(), - buf); } diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 5a4a266d5..e8e86dc42 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -30,15 +30,6 @@ #include -#define AEON_MEMORY 1048576 -#define AEON_MASK 0xFFFF0 -#define AEON_ITER 0x40000 - -#define MONERO_MEMORY 2097152 -#define MONERO_MASK 0x1FFFF0 -#define MONERO_ITER 0x80000 - - struct cryptonight_ctx { alignas(16) uint8_t state[200]; alignas(16) uint8_t* memory; diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index 101a1f58b..fd8c19209 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -73,6 +73,13 @@ static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64 } +static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) +{ + alignas(16) const __m128i zero = { 0 }; + return veorq_u8(vaesmcq_u8(vaeseq_u8(v, zero)), rkey ); +} + + /* this one was not implemented yet so here it is */ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i a) { @@ -155,19 +162,19 @@ static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, _ *k0 = xout0; *k1 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x01>(&xout0, &xout2) : soft_aes_genkey_sub<0x01>(&xout0, &xout2); + soft_aes_genkey_sub<0x01>(&xout0, &xout2); *k2 = xout0; *k3 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x02>(&xout0, &xout2) : soft_aes_genkey_sub<0x02>(&xout0, &xout2); + soft_aes_genkey_sub<0x02>(&xout0, &xout2); *k4 = xout0; *k5 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x04>(&xout0, &xout2) : soft_aes_genkey_sub<0x04>(&xout0, &xout2); + soft_aes_genkey_sub<0x04>(&xout0, &xout2); *k6 = xout0; *k7 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x08>(&xout0, &xout2) : soft_aes_genkey_sub<0x08>(&xout0, &xout2); + soft_aes_genkey_sub<0x08>(&xout0, &xout2); *k8 = xout0; *k9 = xout2; } @@ -186,18 +193,16 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, *x6 = soft_aesenc((uint32_t*)x6, key); *x7 = soft_aesenc((uint32_t*)x7, key); } -# ifndef XMRIG_ARMv7 else { - *x0 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x0), key)); - *x1 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x1), key)); - *x2 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x2), key)); - *x3 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x3), key)); - *x4 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x4), key)); - *x5 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x5), key)); - *x6 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x6), key)); - *x7 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x7), key)); + *x0 = _mm_aesenc_si128(*x0, key); + *x1 = _mm_aesenc_si128(*x1, key); + *x2 = _mm_aesenc_si128(*x2, key); + *x3 = _mm_aesenc_si128(*x3, key); + *x4 = _mm_aesenc_si128(*x4, key); + *x5 = _mm_aesenc_si128(*x5, key); + *x6 = _mm_aesenc_si128(*x6, key); + *x7 = _mm_aesenc_si128(*x7, key); } -# endif } @@ -234,10 +239,6 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) if (ALGO == xmrig::CRYPTONIGHT_HEAVY) { for (size_t i = 0; i < 16; i++) { - if (!SOFT_AES) { - aes_round(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - } - aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); @@ -247,30 +248,13 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - - if (!SOFT_AES) { - xin0 ^= k9; - xin1 ^= k9; - xin2 ^= k9; - xin3 ^= k9; - xin4 ^= k9; - xin5 ^= k9; - xin6 ^= k9; - xin7 ^= k9; - } - else { - aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - } + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7); } } for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { - if (!SOFT_AES) { - aes_round(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - } - aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); @@ -280,20 +264,7 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output) aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - - if (!SOFT_AES) { - xin0 ^= k9; - xin1 ^= k9; - xin2 ^= k9; - xin3 ^= k9; - xin4 ^= k9; - xin5 ^= k9; - xin6 ^= k9; - xin7 ^= k9; - } - else { - aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - } + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); _mm_store_si128(output + i + 0, xin0); _mm_store_si128(output + i + 1, xin1); @@ -335,10 +306,6 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); - if (!SOFT_AES) { - aes_round(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - } - aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); @@ -348,20 +315,7 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - - if (!SOFT_AES) { - xout0 ^= k9; - xout1 ^= k9; - xout2 ^= k9; - xout3 ^= k9; - xout4 ^= k9; - xout5 ^= k9; - xout6 ^= k9; - xout7 ^= k9; - } - else { - aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - } + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); if (ALGO == xmrig::CRYPTONIGHT_HEAVY) { mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); @@ -379,10 +333,6 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); - if (!SOFT_AES) { - aes_round(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - } - aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); @@ -392,29 +342,12 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - - if (!SOFT_AES) { - xout0 ^= k9; - xout1 ^= k9; - xout2 ^= k9; - xout3 ^= k9; - xout4 ^= k9; - xout5 ^= k9; - xout6 ^= k9; - xout7 ^= k9; - } - else { - aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - } + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } for (size_t i = 0; i < 16; i++) { - if (!SOFT_AES) { - aes_round(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - } - aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); @@ -424,20 +357,7 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - - if (!SOFT_AES) { - xout0 ^= k9; - xout1 ^= k9; - xout2 ^= k9; - xout3 ^= k9; - xout4 ^= k9; - xout5 ^= k9; - xout6 ^= k9; - xout7 ^= k9; - } - else { - aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - } + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } @@ -454,6 +374,21 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output) } +static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) +{ + mem_out[0] = EXTRACT64(tmp); + + uint64_t vh = vgetq_lane_u64(tmp, 1); + + uint8_t x = vh >> 24; + static const uint16_t table = 0x7531; + const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; + vh ^= ((table >> index) & 0x3) << 28; + + mem_out[1] = vh; +} + + template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { @@ -489,13 +424,15 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si } else { cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); -# endif + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + } + + if (VARIANT > 0) { + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); + } else { + _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); } - _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - VARIANT1_1(&l0[idx0 & MASK]); idx0 = EXTRACT64(cx); bx0 = cx; @@ -580,16 +517,17 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si else { cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); } - _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - VARIANT1_1(&l0[idx0 & MASK]); - VARIANT1_1(&l1[idx1 & MASK]); + if (VARIANT > 0) { + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + } else { + _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + }; idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1);