From 878e021ff66fe2a8c30a9f0a65da08e476279384 Mon Sep 17 00:00:00 2001
From: XMRig
Date: Thu, 8 Jun 2017 09:47:25 +0300
Subject: [PATCH] Initial CryptoNight.

Add the templated CryptoNight hash (src/crypto/CryptoNight_p.h) with a
single AES-NI implementation, and make App::exec() run a hash self-test
before connecting, returning 1 when it fails. Options gains the algo(),
algoVariant(), doubleHash() and algoName() accessors used for this.

---
 CMakeLists.txt             |   3 +
 src/App.cpp                |   6 +
 src/Options.cpp            |   6 +
 src/Options.h              |   5 +
 src/crypto/CryptoNight.cpp | 105 +++++++++++
 src/crypto/CryptoNight.h   |  45 +++++
 src/crypto/CryptoNight_p.h | 349 +++++++++++++++++++++++++++++++++++++
 7 files changed, 519 insertions(+)
 create mode 100644 src/crypto/CryptoNight.cpp
 create mode 100644 src/crypto/CryptoNight.h
 create mode 100644 src/crypto/CryptoNight_p.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 936cc9bda..0d993dda8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ set(HEADERS_CRYPTO
     src/crypto/c_jh.h
     src/crypto/c_keccak.h
     src/crypto/c_skein.h
+    src/crypto/CryptoNight.h
+    src/crypto/CryptoNight_p.h
     src/crypto/groestl_tables.h
     src/crypto/hash.h
     src/crypto/skein_port.h
@@ -48,6 +50,7 @@ set(SOURCES_CRYPTO
     src/crypto/c_jh.c
     src/crypto/c_skein.c
     src/crypto/soft_aes.c
+    src/crypto/CryptoNight.cpp
    )
 
 if (WIN32)
diff --git a/src/App.cpp b/src/App.cpp
index e7cf1aa46..6608554c0 100644
--- a/src/App.cpp
+++ b/src/App.cpp
@@ -28,6 +28,7 @@
 #include "App.h"
 #include "Console.h"
 #include "Cpu.h"
+#include "crypto/CryptoNight.h"
 #include "net/Client.h"
 #include "net/Network.h"
 #include "Options.h"
@@ -61,6 +62,11 @@ App::exec()
         return 0;
     }
 
+    if (!CryptoNight::init(m_options->algo(), m_options->algoVariant())) { // refuse to start mining with a hash that fails its self-test
+        LOG_ERR("\"%s\" hash self-test failed.", m_options->algoName());
+        return 1;
+    }
+
     Summary::print();
 
     m_network->connect();
diff --git a/src/Options.cpp b/src/Options.cpp
index c09de032f..1fbdb1419 100644
--- a/src/Options.cpp
+++ b/src/Options.cpp
@@ -116,6 +116,12 @@ Options *Options::parse(int argc, char **argv)
 }
 
 
+const char *Options::algoName() const // display name of the selected algorithm, looked up in algo_names by m_algo
+{
+    return algo_names[m_algo];
+}
+
+
 Options::Options(int argc, char **argv) :
     m_background(false),
     m_colors(true),
diff --git a/src/Options.h b/src/Options.h
index 0c1dcc6eb..cbad99fa9 100644
--- a/src/Options.h
+++ b/src/Options.h
@@ -52,16 +52,21 @@ public:
     static Options *parse(int argc, char **argv);
 
     inline bool colors() const { return m_colors; }
+    inline bool doubleHash() const { return m_doubleHash; } // read by the CryptoNight self-test to compare 64 instead of 32 digest bytes
     inline bool isReady() const { return m_ready; }
     inline bool keepAlive() const { return m_keepAlive; }
     inline const char *pass() const { return m_pass; }
     inline const char *user() const { return m_user; }
     inline const Url *backupUrl() const { return m_backupUrl; }
     inline const Url *url() const { return m_url; }
+    inline int algo() const { return m_algo; } // passed to CryptoNight::init() by App::exec()
+    inline int algoVariant() const { return m_algoVariant; } // passed to CryptoNight::init() by App::exec()
     inline int donateLevel() const { return m_donateLevel; }
     inline int retries() const { return m_retries; }
     inline int retryPause() const { return m_retryPause; }
 
+    const char *algoName() const; // defined in Options.cpp: returns algo_names[m_algo]
+
 private:
     Options(int argc, char **argv);
     ~Options();
diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp
new file mode 100644
index 000000000..a96ac7611
--- /dev/null
+++ b/src/crypto/CryptoNight.cpp
@@ -0,0 +1,105 @@
+/* XMRig
+ * Copyright 2010 Jeff Garzik
+ * Copyright 2012-2014 pooler
+ * Copyright 2014 Lucas Jones
+ * Copyright 2014-2016 Wolf9466
+ * Copyright 2016 Jay D Dee
+ * Copyright 2016-2017 XMRig
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "crypto/CryptoNight.h"
+#include "crypto/CryptoNight_p.h"
+#include "Options.h"
+
+// Self-test message: 152 bytes laid out as two 76-byte halves; self_test() hashes the first 76 bytes.
+const static uint8_t test_input[152] = {
+    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
+    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
+    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
+    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
+    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
+    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
+    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
+    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
+    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
+    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01
+};
+
+// Reference digest(s) for the default algorithm: self_test() compares the first 32 bytes, or all 64 when Options::doubleHash() is set.
+const static uint8_t test_output0[64] = {
+    0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
+    0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F,
+    0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
+    0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00
+};
+
+// Reference digest(s) compared instead when algo == Options::ALGO_CRYPTONIGHT_LITE; compiled only without XMRIG_NO_AEON.
+#ifndef XMRIG_NO_AEON
+const static uint8_t test_output1[64] = {
+    0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
+    0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD,
+    0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
+    0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
+};
+
+//void 
cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+//void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+//void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+//void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+#endif
+
+// "av1" implementation: a fixed instantiation of the cryptonight_hash template from CryptoNight_p.h over a MEMORY-byte scratchpad.
+static inline void cryptonight_av1_aesni(const void *input, size_t size, void *output, struct cryptonight_ctx *ctx) {
+    cryptonight_hash<0x80000, MEMORY, 0x1FFFF0, false>(input, size, output, ctx);
+}
+
+// Currently selected hash implementation; stays nullptr until CryptoNight::init() assigns it.
+void (*cryptonight_hash_ctx)(const void *input, size_t size, void *output, cryptonight_ctx *ctx) = nullptr;
+
+// Hashes the first 76 bytes of test_input with the selected implementation and compares the digest with the reference for `algo`; false on mismatch, missing implementation or allocation failure.
+static bool self_test(int algo) {
+    if (cryptonight_hash_ctx == nullptr) {
+        return false;
+    }
+
+    char output[64];
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
+    uint8_t *memory = (uint8_t *) _mm_malloc(MEMORY * 2, 16);
+    const bool allocated = (ctx != nullptr && memory != nullptr); // never dereference a failed allocation
+    if (allocated) {
+        ctx->memory = memory;
+        cryptonight_hash_ctx(test_input, 76, output, ctx);
+    }
+    _mm_free(memory); // freed through the local pointer because ctx itself may be null here
+    _mm_free(ctx);
+#   ifndef XMRIG_NO_AEON
+    if (algo == Options::ALGO_CRYPTONIGHT_LITE) {
+        return allocated && memcmp(output, test_output1, (Options::i()->doubleHash() ? 64 : 32)) == 0; // && short-circuits, so `output` is only read after a hash ran
+    }
+#   endif
+
+    return allocated && memcmp(output, test_output0, (Options::i()->doubleHash() ? 64 : 32)) == 0;
+}
+
+// Selects the hash implementation and returns the result of its self-test for `algo`.
+bool CryptoNight::init(int algo, int variant)
+{
+    cryptonight_hash_ctx = cryptonight_av1_aesni; // the only implementation so far; `variant` is not consulted yet
+
+    return self_test(algo);
+}
diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h
new file mode 100644
index 000000000..1c201af38
--- /dev/null
+++ b/src/crypto/CryptoNight.h
@@ -0,0 +1,45 @@
+/* XMRig
+ * Copyright 2010 Jeff Garzik
+ * Copyright 2012-2014 pooler
+ * Copyright 2014 Lucas Jones
+ * Copyright 2014-2016 Wolf9466
+ * Copyright 2016 Jay D Dee
+ * Copyright 2016-2017 XMRig
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __CRYPTONIGHT_H__
+#define __CRYPTONIGHT_H__
+
+#include <stdint.h>
+
+#define MEMORY      2097152 /* 2 MiB */
+#define MEMORY_LITE 1048576 /* 1 MiB */
+
+struct cryptonight_ctx {
+    uint8_t state0[200] __attribute__((aligned(16))); // 200-byte Keccak state that cryptonight_hash() fills and finalizes
+    uint8_t state1[200] __attribute__((aligned(16))); // second state buffer; not referenced by cryptonight_hash() - presumably reserved for double-hash variants
+    uint8_t* memory     __attribute__((aligned(16))); // scratchpad owned by the caller (the self-test allocates MEMORY * 2 bytes, 16-byte aligned)
+};
+
+// Static entry point: init() selects the hash implementation and reports whether its self-test passed.
+class CryptoNight
+{
+public:
+    static bool init(int algo, int variant);
+};
+
+#endif /* __CRYPTONIGHT_H__ */
diff --git a/src/crypto/CryptoNight_p.h b/src/crypto/CryptoNight_p.h
new file mode 100644
index 000000000..28ca790d3
--- /dev/null
+++ b/src/crypto/CryptoNight_p.h
@@ -0,0 +1,349 @@
+/* XMRig
+ * Copyright 2010 Jeff Garzik
+ * Copyright 2012-2014 pooler
+ * Copyright 2014 Lucas Jones
+ * Copyright 2014-2016 Wolf9466
+ * Copyright 2016 Jay D Dee
+ * Copyright 2016-2017 XMRig
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __CRYPTONIGHT_P_H__
+#define __CRYPTONIGHT_P_H__
+
+// x86 SIMD intrinsics: __m128i, the _mm_* SSE2 operations and the AES-NI instructions used below.
+#include <x86intrin.h>
+
+
+#include "crypto/CryptoNight.h"
+
+
+extern "C"
+{
+#include "crypto/c_keccak.h"
+#include "crypto/c_groestl.h"
+#include "crypto/c_blake256.h"
+#include "crypto/c_jh.h"
+#include "crypto/c_skein.h"
+}
+
+// Adapter: forwards `len` bytes of `input` to blake256_hash(), which writes the digest to `output`.
+static inline void do_blake_hash(const void* input, size_t len, char* output) {
+    blake256_hash(reinterpret_cast<uint8_t*>(output), static_cast<const uint8_t*>(input), len);
+}
+
+// Adapter: forwards to groestl(), passing the input length as len * 8.
+static inline void do_groestl_hash(const void* input, size_t len, char* output) {
+    groestl(static_cast<const uint8_t*>(input), len * 8, reinterpret_cast<uint8_t*>(output));
+}
+
+// Adapter: forwards to jh_hash() with a hash size argument of 32 * 8 and the input length as 8 * len.
+static inline void do_jh_hash(const void* input, size_t len, char* output) {
+    jh_hash(32 * 8, static_cast<const uint8_t*>(input), 8 * len, reinterpret_cast<uint8_t*>(output));
+}
+
+// Adapter: forwards to skein_hash() with a hash size argument of 8 * 32 and the input length as 8 * len.
+static inline void do_skein_hash(const void* input, size_t len, char* output) {
+    skein_hash(8 * 32, static_cast<const uint8_t*>(input), 8 * len, reinterpret_cast<uint8_t*>(output));
+}
+
+// Final-hash dispatch table; cryptonight_hash() indexes it with (state0[0] & 3).
+void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+
+// Software AES fallbacks, only declared here. NOTE(review): CMakeLists.txt lists src/crypto/soft_aes.c (a C file) - confirm these resolve with C++ linkage.
+__m128i soft_aesenc(__m128i in, __m128i key);
+__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
+
+// EXTRACT64(X): low 64 bits of an __m128i. _umul128(a, b, &hi): 64x64 -> 128-bit multiply, returns the low half and stores the high half in *hi.
+#if defined(__x86_64__)
+#   define EXTRACT64(X) _mm_cvtsi128_si64(X)
+
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
+{
+    unsigned __int128 r = (unsigned __int128) a * (unsigned __int128) b;
+    *hi = r >> 64;
+    return (uint64_t) r;
+}
+#elif defined(__i386__)
+#   define HI32(X) \
+    _mm_srli_si128((X), 4)
+
+
+#   define EXTRACT64(X) \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
+
+static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+    // multiplier   = ab = a * 2^32 + b
+    // multiplicand = cd = c * 2^32 + d
+    // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
+    uint64_t a = multiplier >> 32;
+    uint64_t b = multiplier & 0xFFFFFFFF;
+    uint64_t c = multiplicand >> 32;
+    uint64_t d = multiplicand & 0xFFFFFFFF;
+
+    //uint64_t ac = a * c;
+    uint64_t ad = a * d;
+    //uint64_t bc = b * c;
+    uint64_t bd = b * d;
+
+    uint64_t adbc = ad + (b * c);
+    uint64_t adbc_carry = adbc < ad ? 1 : 0; // the 64-bit sum wrapped around
+
+    // multiplier * multiplicand = product_hi * 2^64 + product_lo
+    uint64_t product_lo = bd + (adbc << 32);
+    uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
+    *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
+
+    return product_lo;
+}
+#endif
+
+
+// This will shift and xor tmp1 into itself as 4 32-bit vals such as
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+static inline __m128i sl_xor(__m128i tmp1)
+{
+    __m128i tmp4;
+    tmp4 = _mm_slli_si128(tmp1, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    return tmp1;
+}
+
+// One key-expansion step using AES-NI AESKEYGENASSIST with the compile-time round constant `rcon`; updates *xout0 and *xout2 in place.
+template<uint8_t rcon>
+static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
+{
+    __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
+    xout1  = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
+    *xout0 = sl_xor(*xout0);
+    *xout0 = _mm_xor_si128(*xout0, xout1);
+    xout1  = _mm_aeskeygenassist_si128(*xout0, 0x00);
+    xout1  = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
+    *xout2 = sl_xor(*xout2);
+    *xout2 = _mm_xor_si128(*xout2, xout1);
+}
+
+// Same step as aes_genkey_sub, but through soft_aeskeygenassist() with a run-time round constant.
+static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
+{
+    __m128i xout1 = soft_aeskeygenassist(*xout2, rcon);
+    xout1  = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
+    *xout0 = sl_xor(*xout0);
+    *xout0 = _mm_xor_si128(*xout0, xout1);
+    xout1  = soft_aeskeygenassist(*xout0, 0x00);
+    xout1  = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
+    *xout2 = sl_xor(*xout2);
+    *xout2 = _mm_xor_si128(*xout2, xout1);
+}
+
+// Derives the ten 128-bit round keys k0..k9 from the 32 bytes at `memory` (two aligned 128-bit loads); SOFT_AES picks the software key-expansion step.
+template<bool SOFT_AES>
+static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+{
+    __m128i xout0 = _mm_load_si128(memory);
+    __m128i xout2 = _mm_load_si128(memory + 1);
+    *k0 = xout0;
+    *k1 = xout2;
+
+    SOFT_AES ? soft_aes_genkey_sub(&xout0, &xout2, 0x01) : aes_genkey_sub<0x01>(&xout0, &xout2);
+    *k2 = xout0;
+    *k3 = xout2;
+
+    SOFT_AES ? soft_aes_genkey_sub(&xout0, &xout2, 0x02) : aes_genkey_sub<0x02>(&xout0, &xout2);
+    *k4 = xout0;
+    *k5 = xout2;
+
+    SOFT_AES ? soft_aes_genkey_sub(&xout0, &xout2, 0x04) : aes_genkey_sub<0x04>(&xout0, &xout2);
+    *k6 = xout0;
+    *k7 = xout2;
+
+    SOFT_AES ? soft_aes_genkey_sub(&xout0, &xout2, 0x08) : aes_genkey_sub<0x08>(&xout0, &xout2);
+    *k8 = xout0;
+    *k9 = xout2;
+}
+
+// Applies one AES encryption round with `key` to the eight blocks x0..x7 in place, via soft_aesenc() or the AESENC instruction.
+template<bool SOFT_AES>
+static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    if (SOFT_AES) {
+        *x0 = soft_aesenc(*x0, key);
+        *x1 = soft_aesenc(*x1, key);
+        *x2 = soft_aesenc(*x2, key);
+        *x3 = soft_aesenc(*x3, key);
+        *x4 = soft_aesenc(*x4, key);
+        *x5 = soft_aesenc(*x5, key);
+        *x6 = soft_aesenc(*x6, key);
+        *x7 = soft_aesenc(*x7, key);
+    }
+    else {
+        *x0 = _mm_aesenc_si128(*x0, key);
+        *x1 = _mm_aesenc_si128(*x1, key);
+        *x2 = _mm_aesenc_si128(*x2, key);
+        *x3 = _mm_aesenc_si128(*x3, key);
+        *x4 = _mm_aesenc_si128(*x4, key);
+        *x5 = _mm_aesenc_si128(*x5, key);
+        *x6 = _mm_aesenc_si128(*x6, key);
+        *x7 = _mm_aesenc_si128(*x7, key);
+    }
+}
+
+// Fills `output` (MEM bytes) from the state at `input`: round keys from input[0..1], eight blocks from input[4..11], ten AES rounds per 128 bytes written.
+template<size_t MEM, bool SOFT_AES>
+static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
+{
+    __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xin0 = _mm_load_si128(input + 4);
+    xin1 = _mm_load_si128(input + 5);
+    xin2 = _mm_load_si128(input + 6);
+    xin3 = _mm_load_si128(input + 7);
+    xin4 = _mm_load_si128(input + 8);
+    xin5 = _mm_load_si128(input + 9);
+    xin6 = _mm_load_si128(input + 10);
+    xin7 = _mm_load_si128(input + 11);
+
+    for (size_t i = 0; __builtin_expect(i < MEM / sizeof(__m128i), 1); i += 8) {
+        aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+
+        _mm_store_si128(output + i + 0, xin0);
+        _mm_store_si128(output + i + 1, xin1);
+        _mm_store_si128(output + i + 2, xin2);
+        _mm_store_si128(output + i + 3, xin3);
+        _mm_store_si128(output + i + 4, xin4);
+        _mm_store_si128(output + i + 5, xin5);
+        _mm_store_si128(output + i + 6, xin6);
+        _mm_store_si128(output + i + 7, xin7);
+    }
+}
+
+// Folds the scratchpad `input` (MEM bytes) back into the state `output`: XORs each 128 bytes into blocks output[4..11] and encrypts them with keys from output[2..3].
+template<size_t MEM, bool SOFT_AES>
+static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
+{
+    __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xout0 = _mm_load_si128(output + 4);
+    xout1 = _mm_load_si128(output + 5);
+    xout2 = _mm_load_si128(output + 6);
+    xout3 = _mm_load_si128(output + 7);
+    xout4 = _mm_load_si128(output + 8);
+    xout5 = _mm_load_si128(output + 9);
+    xout6 = _mm_load_si128(output + 10);
+    xout7 = _mm_load_si128(output + 11);
+
+    for (size_t i = 0; __builtin_expect(i < MEM / sizeof(__m128i), 1); i += 8)
+    {
+        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+        aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+    }
+
+    _mm_store_si128(output + 4, xout0);
+    _mm_store_si128(output + 5, xout1);
+    _mm_store_si128(output + 6, xout2);
+    _mm_store_si128(output + 7, xout3);
+    _mm_store_si128(output + 8, xout4);
+    _mm_store_si128(output + 9, xout5);
+    _mm_store_si128(output + 10, xout6);
+    _mm_store_si128(output + 11, xout7);
+}
+
+// Hashes `size` bytes of `input` into `output`. ctx->state0 holds the Keccak state; ctx->memory is the scratchpad (>= MEM bytes, accessed with aligned 128-bit loads/stores). MASK bounds the scratchpad offsets.
+template<size_t ITERATIONS, size_t MEM, size_t MASK, bool SOFT_AES>
+void cryptonight_hash(const void *__restrict__ input, size_t size, void *__restrict__ output, cryptonight_ctx *__restrict__ ctx)
+{
+    keccak(static_cast<const uint8_t*>(input), size, ctx->state0, 200);
+
+    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) ctx->state0, (__m128i*) ctx->memory);
+
+    uint8_t* l0 = ctx->memory; // written below, so not declared const
+    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state0);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    for (size_t i = 0; __builtin_expect(i < ITERATIONS, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+        cx = SOFT_AES ? soft_aesenc(cx, _mm_set_epi64x(ah0, al0)) : _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); // honour SOFT_AES here as aes_round() does
+
+        _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx); // next scratchpad offset comes from the block just encrypted
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
+        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) ctx->memory, (__m128i*) ctx->state0);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, static_cast<char*>(output)); // low two bits of the state select the final hash function
+}
+
+#endif /* __CRYPTONIGHT_P_H__ */