diff --git a/cmake/astrobwt.cmake b/cmake/astrobwt.cmake index 8c89da00..9ba06e0c 100644 --- a/cmake/astrobwt.cmake +++ b/cmake/astrobwt.cmake @@ -23,6 +23,16 @@ if (WITH_ASTROBWT) src/crypto/astrobwt/salsa20_ref/salsa20.c ) else() + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + enable_language(ASM_MASM) + add_definitions(/DASTROBWT_AVX2) + if (CMAKE_C_COMPILER_ID MATCHES MSVC) + list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.asm) + else() + list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.S) + endif() + endif() + list(APPEND HEADERS_CRYPTO src/crypto/astrobwt/Salsa20.hpp ) diff --git a/src/backend/cpu/CpuConfig.cpp b/src/backend/cpu/CpuConfig.cpp index 0f156e92..709b90dc 100644 --- a/src/backend/cpu/CpuConfig.cpp +++ b/src/backend/cpu/CpuConfig.cpp @@ -52,6 +52,7 @@ static const char *kArgon2Impl = "argon2-impl"; #ifdef XMRIG_ALGO_ASTROBWT static const char* kAstroBWTMaxSize = "astrobwt-max-size"; +static const char* kAstroBWTAVX2 = "astrobwt-avx2"; #endif @@ -94,6 +95,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const # ifdef XMRIG_ALGO_ASTROBWT obj.AddMember(StringRef(kAstroBWTMaxSize), m_astrobwtMaxSize, allocator); + obj.AddMember(StringRef(kAstroBWTAVX2), m_astrobwtAVX2, allocator); # endif m_threads.toJSON(obj, doc); @@ -148,12 +150,20 @@ void xmrig::CpuConfig::read(const rapidjson::Value &value) # endif # ifdef XMRIG_ALGO_ASTROBWT - const auto& obj = Json::getValue(value, kAstroBWTMaxSize); - if (obj.IsNull() || !obj.IsInt()) { + const auto& astroBWTMaxSize = Json::getValue(value, kAstroBWTMaxSize); + if (astroBWTMaxSize.IsNull() || !astroBWTMaxSize.IsInt()) { m_shouldSave = true; } else { - m_astrobwtMaxSize = std::min(std::max(obj.GetInt(), 400), 1200); + m_astrobwtMaxSize = std::min(std::max(astroBWTMaxSize.GetInt(), 400), 1200); + } + + const auto& astroBWTAVX2 = Json::getValue(value, kAstroBWTAVX2); + if (astroBWTAVX2.IsNull() || !astroBWTAVX2.IsBool()) { + m_shouldSave = true; + } + else { + m_astrobwtAVX2 = 
astroBWTAVX2.GetBool(); } # endif diff --git a/src/backend/cpu/CpuConfig.h b/src/backend/cpu/CpuConfig.h index c294f069..4f034258 100644 --- a/src/backend/cpu/CpuConfig.h +++ b/src/backend/cpu/CpuConfig.h @@ -60,6 +60,7 @@ public: inline const String &argon2Impl() const { return m_argon2Impl; } inline const Threads &threads() const { return m_threads; } inline int astrobwtMaxSize() const { return m_astrobwtMaxSize; } + inline bool astrobwtAVX2() const { return m_astrobwtAVX2; } inline int priority() const { return m_priority; } inline uint32_t limit() const { return m_limit; } @@ -77,6 +78,7 @@ private: bool m_shouldSave = false; bool m_yield = true; int m_astrobwtMaxSize = 550; + bool m_astrobwtAVX2 = false; int m_memoryPool = 0; int m_priority = -1; String m_argon2Impl; diff --git a/src/backend/cpu/CpuLaunchData.cpp b/src/backend/cpu/CpuLaunchData.cpp index dbc7e5c5..87b7c26b 100644 --- a/src/backend/cpu/CpuLaunchData.cpp +++ b/src/backend/cpu/CpuLaunchData.cpp @@ -39,6 +39,7 @@ xmrig::CpuLaunchData::CpuLaunchData(const Miner *miner, const Algorithm &algorit hwAES(config.isHwAES()), yield(config.isYield()), astrobwtMaxSize(config.astrobwtMaxSize()), + astrobwtAVX2(config.astrobwtAVX2()), priority(config.priority()), affinity(thread.affinity()), miner(miner), diff --git a/src/backend/cpu/CpuLaunchData.h b/src/backend/cpu/CpuLaunchData.h index 1cadc6de..17c0ee90 100644 --- a/src/backend/cpu/CpuLaunchData.h +++ b/src/backend/cpu/CpuLaunchData.h @@ -62,6 +62,7 @@ public: const bool hwAES; const bool yield; const int astrobwtMaxSize; + const bool astrobwtAVX2; const int priority; const int64_t affinity; const Miner *miner; diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index 0740cca2..c78a8bda 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -81,6 +81,7 @@ xmrig::CpuWorker::CpuWorker(size_t id, const CpuLaunchData &data) : m_yield(data.yield), m_av(data.av()), m_astrobwtMaxSize(data.astrobwtMaxSize * 1000), 
+ m_astrobwtAVX2(data.astrobwtAVX2), m_miner(data.miner), m_ctx() { @@ -262,7 +263,7 @@ void xmrig::CpuWorker::start() { # ifdef XMRIG_ALGO_ASTROBWT if (job.algorithm().family() == Algorithm::ASTROBWT) { - if (!astrobwt::astrobwt_dero(m_job.blob(), job.size(), m_ctx[0]->memory, m_hash, m_astrobwtMaxSize)) + if (!astrobwt::astrobwt_dero(m_job.blob(), job.size(), m_ctx[0]->memory, m_hash, m_astrobwtMaxSize, m_astrobwtAVX2)) valid = false; } else diff --git a/src/backend/cpu/CpuWorker.h b/src/backend/cpu/CpuWorker.h index 8d2c583f..a68da174 100644 --- a/src/backend/cpu/CpuWorker.h +++ b/src/backend/cpu/CpuWorker.h @@ -74,6 +74,7 @@ private: const bool m_yield; const CnHash::AlgoVariant m_av; const int m_astrobwtMaxSize; + const bool m_astrobwtAVX2; const Miner *m_miner; cryptonight_ctx *m_ctx[N]; uint8_t m_hash[N * 32]{ 0 }; diff --git a/src/core/Miner.cpp b/src/core/Miner.cpp index 55a448e2..254428fb 100644 --- a/src/core/Miner.cpp +++ b/src/core/Miner.cpp @@ -41,6 +41,7 @@ #include "core/Miner.h" #include "crypto/common/Nonce.h" #include "crypto/rx/Rx.h" +#include "crypto/astrobwt/AstroBWT.h" #include "rapidjson/document.h" #include "version.h" @@ -242,6 +243,10 @@ public: # endif +# ifdef XMRIG_ALGO_ASTROBWT + inline bool initAstroBWT() { return astrobwt::init(job); } +# endif + Algorithm algorithm; Algorithms algorithms; bool active = false; @@ -454,10 +459,14 @@ void xmrig::Miner::setJob(const Job &job, bool donate) d_ptr->userJobId = job.id(); } + bool ready = true; + # ifdef XMRIG_ALGO_RANDOMX - const bool ready = d_ptr->initRX(); -# else - constexpr const bool ready = true; + ready &= d_ptr->initRX(); +# endif + +# ifdef XMRIG_ALGO_ASTROBWT + ready &= d_ptr->initAstroBWT(); # endif mutex.unlock(); diff --git a/src/crypto/astrobwt/AstroBWT.cpp b/src/crypto/astrobwt/AstroBWT.cpp index ca325bff..29e34ecf 100644 --- a/src/crypto/astrobwt/AstroBWT.cpp +++ b/src/crypto/astrobwt/AstroBWT.cpp @@ -30,6 +30,10 @@ #include "AstroBWT.h" #include "sha3.h" #include 
"crypto/cn/CryptoNight.h" +#include "base/net/stratum/Job.h" +#include "base/crypto/Algorithm.h" +#include "base/io/log/Log.h" +#include "backend/cpu/Cpu.h" #include constexpr int STAGE1_SIZE = 147253; @@ -38,6 +42,18 @@ constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE & constexpr int COUNTING_SORT_BITS = 10; constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS; +static bool astrobwtInitialized = false; + +#ifdef ASTROBWT_AVX2 +static bool hasAVX2 = false; + +extern "C" +#ifdef __GNUC__ +__attribute__((ms_abi)) +#endif +void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out); +#endif + #ifdef _MSC_VER #include @@ -155,7 +171,25 @@ void sort_indices(int N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indi } } -bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size) +bool xmrig::astrobwt::init(const xmrig::Job& job) +{ + if (job.algorithm().family() != xmrig::Algorithm::ASTROBWT) + return true; + + if (astrobwtInitialized) + return true; + +#ifdef ASTROBWT_AVX2 + if (xmrig::Cpu::info()->hasAVX2()) { + hasAVX2 = true; + } +#endif + + astrobwtInitialized = true; + return true; +} + +bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2) { uint8_t key[32]; uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64; @@ -166,7 +200,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, uint8_t* stage1_result = (uint8_t*)(tmp_indices); uint8_t* stage2_result = (uint8_t*)(tmp_indices); - sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key)); +#ifdef ASTROBWT_AVX2 + if (hasAVX2 && avx2) + SHA3_256_AVX2_ASM(input_data, input_size, key); + else +#endif + sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key)); Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE); @@ 
-178,7 +217,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)]; } - sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key)); +#ifdef ASTROBWT_AVX2 + if (hasAVX2 && avx2) + SHA3_256_AVX2_ASM(stage1_result, STAGE1_SIZE + 1, key); + else +#endif + sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key)); const int stage2_size = STAGE1_SIZE + (*(uint32_t*)(key) & 0xfffff); if (stage2_size > stage2_max_size) @@ -203,7 +247,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)]; } - sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32); +#ifdef ASTROBWT_AVX2 + if (hasAVX2 && avx2) + SHA3_256_AVX2_ASM(stage2_result, stage2_size + 1, output_hash); + else +#endif + sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32); return true; } @@ -211,5 +260,5 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, template<> void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t) { - astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max()); + astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max(), true); } diff --git a/src/crypto/astrobwt/AstroBWT.h b/src/crypto/astrobwt/AstroBWT.h index 1419641e..e10dc8a5 100644 --- a/src/crypto/astrobwt/AstroBWT.h +++ b/src/crypto/astrobwt/AstroBWT.h @@ -33,9 +33,14 @@ struct cryptonight_ctx; -namespace xmrig { namespace astrobwt { +namespace xmrig { -bool astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size); +class Job; + +namespace astrobwt { + +bool init(const Job&); +bool astrobwt_dero(const void* input_data, uint32_t 
input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2); template<Algorithm::Id ALGO> void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t); @@ -44,4 +49,4 @@ template<> void single_hash<Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t); -}} // namespace xmrig::argon2 +}} // namespace xmrig::astrobwt diff --git a/src/crypto/astrobwt/sha3_256_avx2.S b/src/crypto/astrobwt/sha3_256_avx2.S new file mode 100644 index 00000000..511294d3 --- /dev/null +++ b/src/crypto/astrobwt/sha3_256_avx2.S @@ -0,0 +1,50 @@ +;# XMRig +;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com> +;# Copyright 2012-2014 pooler <pooler@litecoinpool.org> +;# Copyright 2014 Lucas Jones <https://github.com/lucasjones> +;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet> +;# Copyright 2016 Jay D Dee <jayddee246@gmail.com> +;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt> +;# Copyright 2018 Lee Clagett <https://github.com/vtnerd> +;# Copyright 2018-2019 tevador <tevador@gmail.com> +;# Copyright 2000 Transmeta Corporation +;# Copyright 2004-2008 H. Peter Anvin +;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh> +;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com> +;# +;# This program is free software: you can redistribute it and/or modify +;# it under the terms of the GNU General Public License as published by +;# the Free Software Foundation, either version 3 of the License, or +;# (at your option) any later version. +;# +;# This program is distributed in the hope that it will be useful, +;# but WITHOUT ANY WARRANTY; without even the implied warranty of +;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;# GNU General Public License for more details. +;# +;# You should have received a copy of the GNU General Public License +;# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+;# + +.intel_syntax noprefix +#if defined(__APPLE__) +.text +#define DECL(x) _##x +#else +.section .text +#define DECL(x) x +#endif + +#define ALIGN .balign +#define dq .quad + +.global DECL(SHA3_256_AVX2_ASM) + +#include "sha3_256_avx2.inc" + +KeccakF1600_AVX2_ASM: + lea r8,[rip+rot_left+96] + lea r9,[rip+rot_right+96] + lea r10,[rip+rndc] + +#include "sha3_256_keccakf1600_avx2.inc" diff --git a/src/crypto/astrobwt/sha3_256_avx2.asm b/src/crypto/astrobwt/sha3_256_avx2.asm new file mode 100644 index 00000000..c4d5e409 --- /dev/null +++ b/src/crypto/astrobwt/sha3_256_avx2.asm @@ -0,0 +1,42 @@ +;# XMRig +;# Copyright 2010 Jeff Garzik +;# Copyright 2012-2014 pooler +;# Copyright 2014 Lucas Jones +;# Copyright 2014-2016 Wolf9466 +;# Copyright 2016 Jay D Dee +;# Copyright 2017-2019 XMR-Stak , +;# Copyright 2018 Lee Clagett +;# Copyright 2018-2019 tevador +;# Copyright 2000 Transmeta Corporation +;# Copyright 2004-2008 H. Peter Anvin +;# Copyright 2018-2020 SChernykh +;# Copyright 2016-2020 XMRig , +;# +;# This program is free software: you can redistribute it and/or modify +;# it under the terms of the GNU General Public License as published by +;# the Free Software Foundation, either version 3 of the License, or +;# (at your option) any later version. +;# +;# This program is distributed in the hope that it will be useful, +;# but WITHOUT ANY WARRANTY; without even the implied warranty of +;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;# GNU General Public License for more details. +;# +;# You should have received a copy of the GNU General Public License +;# along with this program. If not, see . 
+;# + +_SHA3_256_AVX2_ASM SEGMENT PAGE READ EXECUTE +PUBLIC SHA3_256_AVX2_ASM + +include sha3_256_avx2.inc + +KeccakF1600_AVX2_ASM: + lea r8,[rot_left+96] + lea r9,[rot_right+96] + lea r10,[rndc] + +include sha3_256_keccakf1600_avx2.inc + +_SHA3_256_AVX2_ASM ENDS +END diff --git a/src/crypto/astrobwt/sha3_256_avx2.inc b/src/crypto/astrobwt/sha3_256_avx2.inc new file mode 100644 index 00000000..3c27a31a --- /dev/null +++ b/src/crypto/astrobwt/sha3_256_avx2.inc @@ -0,0 +1,164 @@ +;# XMRig +;# Copyright 2010 Jeff Garzik +;# Copyright 2012-2014 pooler +;# Copyright 2014 Lucas Jones +;# Copyright 2014-2016 Wolf9466 +;# Copyright 2016 Jay D Dee +;# Copyright 2017-2019 XMR-Stak , +;# Copyright 2018 Lee Clagett +;# Copyright 2018-2019 tevador +;# Copyright 2000 Transmeta Corporation +;# Copyright 2004-2008 H. Peter Anvin +;# Copyright 2018-2020 SChernykh +;# Copyright 2016-2020 XMRig , +;# +;# This program is free software: you can redistribute it and/or modify +;# it under the terms of the GNU General Public License as published by +;# the Free Software Foundation, either version 3 of the License, or +;# (at your option) any later version. +;# +;# This program is distributed in the hope that it will be useful, +;# but WITHOUT ANY WARRANTY; without even the implied warranty of +;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;# GNU General Public License for more details. +;# +;# You should have received a copy of the GNU General Public License +;# along with this program. If not, see . 
+;# + +ALIGN 64 +SHA3_256_AVX2_ASM: + vzeroupper + + mov qword ptr [rsp+8],rbx + mov qword ptr [rsp+16],rsi + mov qword ptr [rsp+24],rdi + push rbp + push r12 + push r13 + push r14 + push r15 + + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm6 + movdqu xmmword ptr [rsp+48], xmm7 + movdqu xmmword ptr [rsp+32], xmm8 + movdqu xmmword ptr [rsp+16], xmm9 + movdqu xmmword ptr [rsp+0], xmm10 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm11 + movdqu xmmword ptr [rsp+48], xmm12 + movdqu xmmword ptr [rsp+32], xmm13 + movdqu xmmword ptr [rsp+16], xmm14 + movdqu xmmword ptr [rsp+0], xmm15 + + sub rsp,320 + lea rbp,[rsp+64] + and rbp,-32 + vpxor xmm0,xmm0,xmm0 + xor edi,edi + mov dword ptr [rbp],50462976 + mov r12,rdx + mov dword ptr [rbp+4],169150212 + mov r14,rdx + mov dword ptr [rbp+8],218436623 + shr r14,3 + and r12d,7 + mov dword ptr [rbp+12],135009046 + mov r13,r8 + mov byte ptr [rbp+16],9 + mov rsi,rcx + mov ebx,edi + vmovdqa ymmword ptr [rbp+32],ymm0 + vmovdqa ymmword ptr [rbp+64],ymm0 + vmovdqa ymmword ptr [rbp+96],ymm0 + vmovdqa ymmword ptr [rbp+128],ymm0 + vmovdqa ymmword ptr [rbp+160],ymm0 + vmovdqa ymmword ptr [rbp+192],ymm0 + vmovdqa ymmword ptr [rbp+224],ymm0 + test r14,r14 + je sha3_main_loop_end + +sha3_main_loop: + movzx eax,byte ptr [rbp+rbx] + lea rcx,[rbp+32] + lea rcx,[rcx+rax*8] + mov rax,qword ptr [rsi] + xor qword ptr [rcx],rax + lea r15,[rbx+1] + cmp rbx,16 + jne skip_keccak + + lea rcx,[rbp+32] + call KeccakF1600_AVX2_ASM + +skip_keccak: + cmp rbx,16 + mov rax,rdi + cmovne rax,r15 + add rsi,8 + mov rbx,rax + sub r14,1 + jne sha3_main_loop + +sha3_main_loop_end: + mov rdx,rdi + test r12,r12 + je sha3_tail_loop_end + mov r8,rdi + +sha3_tail_loop: + movzx eax,byte ptr [rdx+rsi] + inc rdx + shlx rcx,rax,r8 + or rdi,rcx + add r8,8 + cmp rdx,r12 + jb sha3_tail_loop + +sha3_tail_loop_end: + movzx eax,byte ptr [rbp+rbx] + lea rdx,[rbp+32] + lea rdx,[rdx+rax*8] + mov ecx,6 + lea rax,[r12*8] + shlx rcx,rcx,rax + xor rcx,qword ptr [rdx] + mov eax,1 + shl 
rax,63 + xor rcx,rdi + mov qword ptr [rdx],rcx + xor qword ptr [rbp+104],rax + + lea rcx,[rbp+32] + call KeccakF1600_AVX2_ASM + + vmovups ymm0,ymmword ptr [rbp+32] + vmovups ymmword ptr [r13],ymm0 + vzeroupper + + add rsp,320 + + movdqu xmm15, xmmword ptr [rsp] + movdqu xmm14, xmmword ptr [rsp+16] + movdqu xmm13, xmmword ptr [rsp+32] + movdqu xmm12, xmmword ptr [rsp+48] + movdqu xmm11, xmmword ptr [rsp+64] + add rsp, 80 + movdqu xmm10, xmmword ptr [rsp] + movdqu xmm9, xmmword ptr [rsp+16] + movdqu xmm8, xmmword ptr [rsp+32] + movdqu xmm7, xmmword ptr [rsp+48] + movdqu xmm6, xmmword ptr [rsp+64] + add rsp, 80 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + mov rbx,qword ptr [rsp+8] + mov rsi,qword ptr [rsp+16] + mov rdi,qword ptr [rsp+24] + + ret diff --git a/src/crypto/astrobwt/sha3_256_keccakf1600_avx2.inc b/src/crypto/astrobwt/sha3_256_keccakf1600_avx2.inc new file mode 100644 index 00000000..a9bae47a --- /dev/null +++ b/src/crypto/astrobwt/sha3_256_keccakf1600_avx2.inc @@ -0,0 +1,203 @@ +;# XMRig +;# Copyright 2010 Jeff Garzik +;# Copyright 2012-2014 pooler +;# Copyright 2014 Lucas Jones +;# Copyright 2014-2016 Wolf9466 +;# Copyright 2016 Jay D Dee +;# Copyright 2017-2019 XMR-Stak , +;# Copyright 2018 Lee Clagett +;# Copyright 2018-2019 tevador +;# Copyright 2000 Transmeta Corporation +;# Copyright 2004-2008 H. Peter Anvin +;# Copyright 2018-2020 SChernykh +;# Copyright 2016-2020 XMRig , +;# +;# This program is free software: you can redistribute it and/or modify +;# it under the terms of the GNU General Public License as published by +;# the Free Software Foundation, either version 3 of the License, or +;# (at your option) any later version. +;# +;# This program is distributed in the hope that it will be useful, +;# but WITHOUT ANY WARRANTY; without even the implied warranty of +;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;# GNU General Public License for more details. 
+;# +;# You should have received a copy of the GNU General Public License +;# along with this program. If not, see . +;# + + mov eax,24 + lea rcx,[rcx+96] + vpbroadcastq ymm0,QWORD PTR [rcx-96] + vmovdqu ymm1,YMMWORD PTR [rcx-88] + vmovdqu ymm2,YMMWORD PTR [rcx-56] + vmovdqu ymm3,YMMWORD PTR [rcx-24] + vmovdqu ymm4,YMMWORD PTR [rcx+8] + vmovdqu ymm5,YMMWORD PTR [rcx+40] + vmovdqu ymm6,YMMWORD PTR [rcx+72] + +ALIGN 64 +Loop_avx2: + vpshufd ymm13,ymm2,78 + vpxor ymm12,ymm5,ymm3 + vpxor ymm9,ymm4,ymm6 + vpxor ymm12,ymm12,ymm1 + vpxor ymm12,ymm12,ymm9 + vpermq ymm11,ymm12,147 + vpxor ymm13,ymm13,ymm2 + vpermq ymm7,ymm13,78 + vpsrlq ymm8,ymm12,63 + vpaddq ymm9,ymm12,ymm12 + vpor ymm8,ymm8,ymm9 + vpermq ymm15,ymm8,57 + vpxor ymm14,ymm8,ymm11 + vpermq ymm14,ymm14,0 + vpxor ymm13,ymm13,ymm0 + vpxor ymm13,ymm13,ymm7 + vpsrlq ymm7,ymm13,63 + vpaddq ymm8,ymm13,ymm13 + vpor ymm8,ymm8,ymm7 + vpxor ymm2,ymm2,ymm14 + vpxor ymm0,ymm0,ymm14 + vpblendd ymm15,ymm15,ymm8,192 + vpblendd ymm11,ymm11,ymm13,3 + vpxor ymm15,ymm15,ymm11 + vpsllvq ymm10,ymm2,YMMWORD PTR [r8-96] + vpsrlvq ymm2,ymm2,YMMWORD PTR [r9-96] + vpor ymm2,ymm2,ymm10 + vpxor ymm3,ymm3,ymm15 + vpsllvq ymm11,ymm3,YMMWORD PTR [r8-32] + vpsrlvq ymm3,ymm3,YMMWORD PTR [r9-32] + vpor ymm3,ymm3,ymm11 + vpxor ymm4,ymm4,ymm15 + vpsllvq ymm12,ymm4,YMMWORD PTR [r8] + vpsrlvq ymm4,ymm4,YMMWORD PTR [r9] + vpor ymm4,ymm4,ymm12 + vpxor ymm5,ymm5,ymm15 + vpsllvq ymm13,ymm5,YMMWORD PTR [r8+32] + vpsrlvq ymm5,ymm5,YMMWORD PTR [r9+32] + vpor ymm5,ymm5,ymm13 + vpxor ymm6,ymm6,ymm15 + vpermq ymm10,ymm2,141 + vpermq ymm11,ymm3,141 + vpsllvq ymm14,ymm6,YMMWORD PTR [r8+64] + vpsrlvq ymm8,ymm6,YMMWORD PTR [r9+64] + vpor ymm8,ymm8,ymm14 + vpxor ymm1,ymm1,ymm15 + vpermq ymm12,ymm4,27 + vpermq ymm13,ymm5,114 + vpsllvq ymm15,ymm1,YMMWORD PTR [r8-64] + vpsrlvq ymm9,ymm1,YMMWORD PTR [r9-64] + vpor ymm9,ymm9,ymm15 + vpsrldq ymm14,ymm8,8 + vpandn ymm7,ymm8,ymm14 + vpblendd ymm3,ymm9,ymm13,12 + vpblendd ymm15,ymm11,ymm9,12 + vpblendd ymm5,ymm10,ymm11,12 
+ vpblendd ymm14,ymm9,ymm10,12 + vpblendd ymm3,ymm3,ymm11,48 + vpblendd ymm15,ymm15,ymm12,48 + vpblendd ymm5,ymm5,ymm9,48 + vpblendd ymm14,ymm14,ymm13,48 + vpblendd ymm3,ymm3,ymm12,192 + vpblendd ymm15,ymm15,ymm13,192 + vpblendd ymm5,ymm5,ymm13,192 + vpblendd ymm14,ymm14,ymm11,192 + vpandn ymm3,ymm3,ymm15 + vpandn ymm5,ymm5,ymm14 + vpblendd ymm6,ymm12,ymm9,12 + vpblendd ymm15,ymm10,ymm12,12 + vpxor ymm3,ymm3,ymm10 + vpblendd ymm6,ymm6,ymm10,48 + vpblendd ymm15,ymm15,ymm11,48 + vpxor ymm5,ymm5,ymm12 + vpblendd ymm6,ymm6,ymm11,192 + vpblendd ymm15,ymm15,ymm9,192 + vpandn ymm6,ymm6,ymm15 + vpxor ymm6,ymm6,ymm13 + vpermq ymm4,ymm8,30 + vpblendd ymm15,ymm4,ymm0,48 + vpermq ymm1,ymm8,57 + vpblendd ymm1,ymm1,ymm0,192 + vpandn ymm1,ymm1,ymm15 + vpblendd ymm2,ymm11,ymm12,12 + vpblendd ymm14,ymm13,ymm11,12 + vpblendd ymm2,ymm2,ymm13,48 + vpblendd ymm14,ymm14,ymm10,48 + vpblendd ymm2,ymm2,ymm10,192 + vpblendd ymm14,ymm14,ymm12,192 + vpandn ymm2,ymm2,ymm14 + vpxor ymm2,ymm2,ymm9 + vpermq ymm7,ymm7,0 + vpermq ymm3,ymm3,27 + vpermq ymm5,ymm5,141 + vpermq ymm6,ymm6,114 + vpblendd ymm4,ymm13,ymm10,12 + vpblendd ymm14,ymm12,ymm13,12 + vpblendd ymm4,ymm4,ymm12,48 + vpblendd ymm14,ymm14,ymm9,48 + vpblendd ymm4,ymm4,ymm9,192 + vpblendd ymm14,ymm14,ymm10,192 + vpandn ymm4,ymm4,ymm14 + vpxor ymm0,ymm0,ymm7 + vpxor ymm1,ymm1,ymm8 + vpxor ymm4,ymm4,ymm11 + vpxor ymm0,ymm0,YMMWORD PTR [r10] + lea r10,[r10+32] + dec eax + jnz Loop_avx2 + + vmovq QWORD PTR [rcx-96],xmm0 + vmovdqu YMMWORD PTR [rcx-88],ymm1 + vmovdqu YMMWORD PTR [rcx-56],ymm2 + vmovdqu YMMWORD PTR [rcx-24],ymm3 + vmovdqu YMMWORD PTR [rcx+8],ymm4 + vmovdqu YMMWORD PTR [rcx+40],ymm5 + vmovdqu YMMWORD PTR [rcx+72],ymm6 + + ret + +ALIGN 32 +rot_left: + dq 3, 18, 36, 41 + dq 1, 62, 28, 27 + dq 45, 6, 56, 39 + dq 10, 61, 55, 8 + dq 2, 15, 25, 20 + dq 44, 43, 21, 14 + +ALIGN 32 +rot_right: + dq 64-3, 64-18, 64-36, 64-41 + dq 64-1, 64-62, 64-28, 64-27 + dq 64-45, 64-6, 64-56, 64-39 + dq 64-10, 64-61, 64-55, 64-8 + dq 64-2, 64-15, 
64-25, 64-20 + dq 64-44, 64-43, 64-21, 64-14 + +ALIGN 32 +rndc: + dq 1, 1, 1, 1 + dq 32898, 32898, 32898, 32898 + dq 9223372036854808714, 9223372036854808714, 9223372036854808714, 9223372036854808714 + dq 9223372039002292224, 9223372039002292224, 9223372039002292224, 9223372039002292224 + dq 32907, 32907, 32907, 32907 + dq 2147483649, 2147483649, 2147483649, 2147483649 + dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353 + dq 9223372036854808585, 9223372036854808585, 9223372036854808585, 9223372036854808585 + dq 138, 138, 138, 138 + dq 136, 136, 136, 136 + dq 2147516425, 2147516425, 2147516425, 2147516425 + dq 2147483658, 2147483658, 2147483658, 2147483658 + dq 2147516555, 2147516555, 2147516555, 2147516555 + dq 9223372036854775947, 9223372036854775947, 9223372036854775947, 9223372036854775947 + dq 9223372036854808713, 9223372036854808713, 9223372036854808713, 9223372036854808713 + dq 9223372036854808579, 9223372036854808579, 9223372036854808579, 9223372036854808579 + dq 9223372036854808578, 9223372036854808578, 9223372036854808578, 9223372036854808578 + dq 9223372036854775936, 9223372036854775936, 9223372036854775936, 9223372036854775936 + dq 32778, 32778, 32778, 32778 + dq 9223372039002259466, 9223372039002259466, 9223372039002259466, 9223372039002259466 + dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353 + dq 9223372036854808704, 9223372036854808704, 9223372036854808704, 9223372036854808704 + dq 2147483649, 2147483649, 2147483649, 2147483649 + dq 9223372039002292232, 9223372039002292232, 9223372039002292232, 9223372039002292232