Merge branch 'classic-dev' into classic

XMRig 2019-03-05 01:16:22 +07:00
commit 30e5e4a492
42 changed files with 5459 additions and 394 deletions

View file

@ -11,6 +11,7 @@ set(HEADERS
algo/cryptonight/cryptonight_monero.h
algo/cryptonight/cryptonight_softaes.h
algo/cryptonight/cryptonight_test.h
algo/cryptonight/variant4_random_math.h
compat.h
cpu.h
donate.h
@ -29,6 +30,7 @@ set(HEADERS_CRYPTO
crypto/c_blake256.h
crypto/c_jh.h
crypto/c_skein.h
crypto/soft_aes.h
)
set(HEADERS_COMPAT
@ -48,6 +50,10 @@ set(SOURCES
algo/cryptonight/cryptonight_av2.c
algo/cryptonight/cryptonight_av3.c
algo/cryptonight/cryptonight_av4.c
algo/cryptonight/cryptonight_r_av1.c
algo/cryptonight/cryptonight_r_av2.c
algo/cryptonight/cryptonight_r_av3.c
algo/cryptonight/cryptonight_r_av4.c
util.c
options.c
stratum.c
@ -61,7 +67,6 @@ set(SOURCES_CRYPTO
crypto/c_blake256.c
crypto/c_jh.c
crypto/c_skein.c
crypto/soft_aes.c
)
set(SOURCES_UTILS

View file

@ -4,9 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -22,16 +22,15 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_LITE_SOFTAES_H__
#define __CRYPTONIGHT_LITE_SOFTAES_H__
#ifndef XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
#define XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
#include <x86intrin.h>
#include <stdint.h>
extern __m128i soft_aesenc(__m128i in, __m128i key);
extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
#include "crypto/soft_aes.h"
// This will shift and xor tmp1 into itself as 4 32-bit vals such as
@ -253,4 +252,4 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
}
#endif /* __CRYPTONIGHT_LITE_SOFTAES_H__ */
#endif /* XMRIG_CRYPTONIGHT_LITE_SOFTAES_H */

View file

@ -6,7 +6,8 @@
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -38,9 +39,13 @@
#include "crypto/c_groestl.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "cryptonight.h"
#include "cryptonight_test.h"
#include "cryptonight.h"
#include "options.h"
#include "persistent_memory.h"
static cn_hash_fun asm_func_map[AV_MAX][VARIANT_MAX][ASM_MAX] = {};
void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
@ -56,6 +61,11 @@ void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, stru
void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av3(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av4(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#ifndef XMRIG_NO_AEON
void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
@ -72,7 +82,13 @@ void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output,
#ifndef XMRIG_NO_ASM
void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_single_hash_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av1_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av1_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av2_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av2_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#endif
@ -89,6 +105,46 @@ static inline bool verify(enum Variant variant, uint8_t *output, struct cryptoni
}
static inline bool verify2(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
{
cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
if (func == NULL) {
return false;
}
if (opt_double_hash) {
uint8_t input[128];
for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) {
const size_t size = cn_r_test_input[i].size;
memcpy(input, cn_r_test_input[i].data, size);
memcpy(input + size, cn_r_test_input[i].data, size);
ctx[0]->height = ctx[1]->height = cn_r_test_input[i].height;
func(input, size, output, ctx);
if (memcmp(output, referenceValue + i * 32, 32) != 0 || memcmp(output + 32, referenceValue + i * 32, 32) != 0) {
return false;
}
}
}
else {
for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) {
ctx[0]->height = cn_r_test_input[i].height;
func(cn_r_test_input[i].data, cn_r_test_input[i].size, output, ctx);
if (memcmp(output, referenceValue + i * 32, 32) != 0) {
return false;
}
}
}
return true;
}
static bool self_test() {
struct cryptonight_ctx *ctx[2];
uint8_t output[64];
@ -97,15 +153,18 @@ static bool self_test() {
const size_t size = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
bool result = false;
for (int i = 0; i < count; ++i) {
for (size_t i = 0; i < count; ++i) {
ctx[i] = _mm_malloc(sizeof(struct cryptonight_ctx), 16);
ctx[i]->memory = _mm_malloc(size, 16);
init_cn_r(ctx[i]);
}
if (opt_algo == ALGO_CRYPTONIGHT) {
result = verify(VARIANT_0, output, ctx, test_output_v0) &&
verify(VARIANT_1, output, ctx, test_output_v1) &&
verify(VARIANT_2, output, ctx, test_output_v2);
result = verify(VARIANT_0, output, ctx, test_output_v0) &&
verify(VARIANT_1, output, ctx, test_output_v1) &&
verify(VARIANT_2, output, ctx, test_output_v2) &&
verify2(VARIANT_4, output, ctx, test_output_r);
}
# ifndef XMRIG_NO_AEON
else {
@ -115,7 +174,7 @@ static bool self_test() {
# endif
for (int i = 0; i < count; ++i) {
for (size_t i = 0; i < count; ++i) {
_mm_free(ctx[i]->memory);
_mm_free(ctx[i]);
}
@ -124,34 +183,20 @@ static bool self_test() {
}
size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
#ifndef XMRIG_NO_ASM
cn_hash_fun cryptonight_hash_asm_fn(enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
{
const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
# ifndef XMRIG_NO_ASM
if (assembly == ASM_AUTO) {
assembly = cpu_info.assembly;
assembly = (enum Assembly) cpu_info.assembly;
}
if (assembly == ASM_NONE) {
return index;
return NULL;
}
const size_t offset = VARIANT_MAX * 4 * 2;
if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) {
if (av == AV_SINGLE) {
return offset + assembly - 2;
}
if (av == AV_DOUBLE) {
return offset + 2;
}
}
# endif
return index;
return asm_func_map[av][variant][assembly];
}
#endif
cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant)
@ -160,10 +205,15 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
assert(variant > VARIANT_AUTO && variant < VARIANT_MAX);
# ifndef XMRIG_NO_ASM
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = {
# else
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
if (algorithm == ALGO_CRYPTONIGHT) {
cn_hash_fun fun = cryptonight_hash_asm_fn(av, variant, opt_assembly);
if (fun) {
return fun;
}
}
# endif
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
cryptonight_av1_v0,
cryptonight_av2_v0,
cryptonight_av3_v0,
@ -177,6 +227,11 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
cryptonight_av3_v2,
cryptonight_av4_v2,
cryptonight_r_av1,
cryptonight_r_av2,
cryptonight_r_av3,
cryptonight_r_av4,
# ifndef XMRIG_NO_AEON
cryptonight_lite_av1_v0,
cryptonight_lite_av2_v0,
@ -190,6 +245,10 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
# else
NULL,
NULL,
@ -203,16 +262,15 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
NULL,
NULL,
NULL,
# endif
# ifndef XMRIG_NO_ASM
cryptonight_single_hash_asm_intel,
cryptonight_single_hash_asm_ryzen,
cryptonight_double_hash_asm
NULL,
NULL,
NULL,
NULL,
# endif
};
# ifndef NDEBUG
const size_t index = fn_index(algorithm, av, variant, opt_assembly);
const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
cn_hash_fun func = func_table[index];
@ -221,7 +279,7 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
return func;
# else
return func_table[fn_index(algorithm, av, variant, opt_assembly)];
return func_table[VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1];
# endif
}
@ -230,6 +288,24 @@ bool cryptonight_init(int av)
{
opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT;
# ifndef XMRIG_NO_ASM
asm_func_map[AV_SINGLE][VARIANT_2][ASM_INTEL] = cryptonight_single_hash_asm_intel;
asm_func_map[AV_SINGLE][VARIANT_2][ASM_RYZEN] = cryptonight_single_hash_asm_ryzen;
asm_func_map[AV_SINGLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_single_hash_asm_bulldozer;
asm_func_map[AV_DOUBLE][VARIANT_2][ASM_INTEL] = cryptonight_double_hash_asm;
asm_func_map[AV_DOUBLE][VARIANT_2][ASM_RYZEN] = cryptonight_double_hash_asm;
asm_func_map[AV_DOUBLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_double_hash_asm;
asm_func_map[AV_SINGLE][VARIANT_4][ASM_INTEL] = cryptonight_r_av1_asm_intel;
asm_func_map[AV_SINGLE][VARIANT_4][ASM_RYZEN] = cryptonight_r_av1_asm_intel;
asm_func_map[AV_SINGLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av1_asm_bulldozer;
asm_func_map[AV_DOUBLE][VARIANT_4][ASM_INTEL] = cryptonight_r_av2_asm_intel;
asm_func_map[AV_DOUBLE][VARIANT_4][ASM_RYZEN] = cryptonight_r_av2_asm_intel;
asm_func_map[AV_DOUBLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av2_asm_bulldozer;
# endif
return self_test();
}
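A minimal usage sketch of the new dispatch path (not part of the diff): once cryptonight_init() has filled asm_func_map, cryptonight_hash_fn() consults cryptonight_hash_asm_fn() first and only falls back to the generic func_table when no specialized routine is registered. The helper name and the example height below are assumptions for illustration.

/* Sketch only: resolve and call the hash function for one job. */
static void hash_one_job_sketch(const uint8_t *blob, size_t blob_size,
                                uint8_t *output, struct cryptonight_ctx **ctx)
{
    cn_hash_fun fn = cryptonight_hash_fn(opt_algo, opt_av, VARIANT_4);
    if (fn == NULL) {
        return; /* combination not available in this build */
    }

    /* CryptoNight-R derives its random math program from the block height,
     * so set it before hashing (double-hash variants also need ctx[1]->height). */
    ctx[0]->height = 1806260; /* example height taken from the test vectors */

    fn(blob, blob_size, output, ctx);
}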
@ -267,6 +343,10 @@ static inline enum Variant cryptonight_variant(uint8_t version)
return VARIANT_1;
}
if (version >= 10) {
return VARIANT_4;
}
if (version >= 8) {
return VARIANT_2;
}
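A quick worked example of the new mapping (a sketch; cryptonight_variant() receives blob[0], the block major version, from scanhash_cryptonight below, and this assumes no variant is forced via options):

/* Sketch: a v10 hashing blob now selects CryptoNight-R, a v8/v9 blob still selects variant 2. */
enum Variant a = cryptonight_variant(10); /* VARIANT_4 */
enum Variant b = cryptonight_variant(9);  /* VARIANT_2 */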
@ -276,7 +356,7 @@ static inline enum Variant cryptonight_variant(uint8_t version)
#ifndef BUILD_TEST
int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39);
enum Variant variant = cryptonight_variant(blob[0]);
@ -296,7 +376,7 @@ int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blo
}
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
int rc = 0;
uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39);
uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size);

View file

@ -6,7 +6,8 @@
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -38,9 +39,30 @@
#define MEMORY_LITE 1048576 /* 1 MiB */
#if defined _MSC_VER || defined XMRIG_ARM
#define ABI_ATTRIBUTE
#else
#define ABI_ATTRIBUTE __attribute__((ms_abi))
#endif
struct cryptonight_ctx;
typedef void(*cn_mainloop_fun_ms_abi)(struct cryptonight_ctx*) ABI_ATTRIBUTE;
typedef void(*cn_mainloop_double_fun_ms_abi)(struct cryptonight_ctx*, struct cryptonight_ctx*) ABI_ATTRIBUTE;
struct cryptonight_ctx {
uint8_t state[224] __attribute__((aligned(16)));
uint8_t* memory __attribute__((aligned(16)));
uint8_t *memory __attribute__((aligned(16)));
uint8_t unused[40];
const uint32_t *saes_table;
cn_mainloop_fun_ms_abi generated_code;
cn_mainloop_double_fun_ms_abi generated_code_double;
uint64_t generated_code_height;
uint64_t generated_code_double_height;
uint64_t height;
};
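The new fields turn each context into a one-entry JIT cache keyed by block height. A sketch of the pattern the CN-R asm wrappers use (the same logic appears in cryptonight_r_av1_asm_intel below; how generated_code gets its executable buffer is handled elsewhere, e.g. crypto/CryptonightR_gen.c, and is not shown in this diff):

/* Sketch: regenerate and recompile the random math program only when the height changes. */
if (ctx->generated_code_height != ctx->height) {
    struct V4_Instruction code[256];
    const int code_size = v4_random_math_init(code, ctx->height);
    v4_compile_code(code, code_size, (void *) ctx->generated_code, ASM_INTEL);
    ctx->generated_code_height = ctx->height;
}
ctx->generated_code(ctx); /* run the freshly (or previously) compiled main loop */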
@ -52,7 +74,8 @@ extern void (* const extra_hashes[4])(const void *, size_t, char *);
cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant);
bool cryptonight_init(int av);
int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx);
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx);
int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
#endif /* XMRIG_CRYPTONIGHT_H */

View file

@ -196,6 +196,7 @@ void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *res
#ifndef XMRIG_NO_ASM
extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx);
extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx);
extern void cnv2_mainloop_bulldozer_asm(struct cryptonight_ctx *ctx);
extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1);
@ -225,6 +226,19 @@ void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t siz
}
void cryptonight_single_hash_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
cnv2_mainloop_bulldozer_asm(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
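Each of these wrappers repeats the same single-hash pipeline around a different main loop. A condensed sketch of that shared shape, with a hypothetical helper name and a plain function-pointer type (the real wrappers call the asm routines directly, and JIT-generated CN-R code goes through cn_mainloop_fun_ms_abi instead):

/* Sketch only: Keccak absorb, scratchpad explode, variant-specific main loop,
 * scratchpad implode, Keccak-f permutation, then one of the four finalizers. */
static inline void single_hash_pipeline_sketch(void (*mainloop)(struct cryptonight_ctx *),
                                               const uint8_t *input, size_t size,
                                               uint8_t *output, struct cryptonight_ctx **ctx)
{
    keccak(input, size, ctx[0]->state, 200);
    cn_explode_scratchpad((__m128i *) ctx[0]->state, (__m128i *) ctx[0]->memory);
    mainloop(ctx[0]);
    cn_implode_scratchpad((__m128i *) ctx[0]->memory, (__m128i *) ctx[0]->state);
    keccakf((uint64_t *) ctx[0]->state, 24);
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); /* BLAKE-256 / Groestl / JH / Skein */
}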
void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);

View file

@ -6,8 +6,8 @@
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -29,6 +29,8 @@
#include <fenv.h>
#include <math.h>
#include <stdint.h>
#include <x86intrin.h>
static inline __m128i int_sqrt_v2(const uint64_t n0)
@ -87,6 +89,17 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
}
# define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
{ \
const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
_c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
}
# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
{ \
const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
@ -99,4 +112,39 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
}
#ifndef NOINLINE
#ifdef __GNUC__
#define NOINLINE __attribute__ ((noinline))
#elif _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif
#endif
#include "variant4_random_math.h"
#define VARIANT4_RANDOM_MATH_INIT(part) \
uint32_t r##part[9]; \
struct V4_Instruction code##part[256]; \
{ \
r##part[0] = (uint32_t)(h##part[12]); \
r##part[1] = (uint32_t)(h##part[12] >> 32); \
r##part[2] = (uint32_t)(h##part[13]); \
r##part[3] = (uint32_t)(h##part[13] >> 32); \
} \
v4_random_math_init(code##part, ctx[part]->height);
#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
{ \
cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
r##part[4] = (uint32_t)(al); \
r##part[5] = (uint32_t)(ah); \
r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \
r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \
r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
v4_random_math(code##part, r##part); \
}
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */

View file

@ -0,0 +1,143 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
#include "cryptonight_monero.h"
void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
VARIANT2_INIT(0);
VARIANT2_SET_ROUNDING_MODE();
VARIANT4_RANDOM_MATH_INIT(0);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
uint64_t idx0 = al0;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
cx = _mm_aesenc_si128(cx, ax0);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
lo = _umul128(idx0, cl, &hi);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
bx1 = bx0;
bx0 = cx;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
#ifndef XMRIG_NO_ASM
void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_INTEL);
ctx[0]->generated_code_height = ctx[0]->height;
}
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
ctx[0]->generated_code(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_BULLDOZER);
ctx[0]->generated_code_height = ctx[0]->height;
}
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
ctx[0]->generated_code(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
#endif

View file

@ -0,0 +1,202 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
#include "cryptonight_monero.h"
void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
VARIANT2_INIT(0);
VARIANT2_INIT(1);
VARIANT2_SET_ROUNDING_MODE();
VARIANT4_RANDOM_MATH_INIT(0);
VARIANT4_RANDOM_MATH_INIT(1);
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
uint64_t idx0 = al0;
uint64_t idx1 = al1;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
cx0 = _mm_aesenc_si128(cx0, ax0);
cx1 = _mm_aesenc_si128(cx1, ax1);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
idx0 = _mm_cvtsi128_si64(cx0);
idx1 = _mm_cvtsi128_si64(cx1);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
lo = _umul128(idx0, cl, &hi);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
lo = _umul128(idx1, cl, &hi);
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
al1 += hi;
ah1 += lo;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
al1 ^= cl;
ah1 ^= ch;
idx1 = al1;
bx01 = bx00;
bx11 = bx10;
bx00 = cx0;
bx10 = cx1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
#ifndef XMRIG_NO_ASM
void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
void cryptonight_r_av2_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_INTEL);
ctx[0]->generated_code_height = ctx[0]->height;
}
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
ctx[0]->generated_code_double(ctx[0], ctx[1]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
keccakf((uint64_t *) ctx[0]->state, 24);
keccakf((uint64_t *) ctx[1]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
void cryptonight_r_av2_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_BULLDOZER);
ctx[0]->generated_code_height = ctx[0]->height;
}
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
ctx[0]->generated_code_double(ctx[0], ctx[1]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
keccakf((uint64_t *) ctx[0]->state, 24);
keccakf((uint64_t *) ctx[1]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
#endif

View file

@ -0,0 +1,112 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_monero.h"
#include "cryptonight_softaes.h"
#ifndef XMRIG_NO_ASM
void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
#endif
void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
# ifndef XMRIG_NO_ASM
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_soft_aes_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_NONE);
ctx[0]->generated_code_height = ctx[0]->height;
}
ctx[0]->saes_table = (const uint32_t*)saes_table;
ctx[0]->generated_code(ctx[0]);
# else
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
VARIANT2_INIT(0);
VARIANT2_SET_ROUNDING_MODE();
VARIANT4_RANDOM_MATH_INIT(0);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
uint64_t idx0 = al0;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
cx = soft_aesenc(cx, ax0);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
lo = _umul128(idx0, cl, &hi);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
bx1 = bx0;
bx0 = cx;
}
# endif
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t *) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}

View file

@ -0,0 +1,143 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_monero.h"
#include "cryptonight_softaes.h"
void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
VARIANT2_INIT(0);
VARIANT2_INIT(1);
VARIANT2_SET_ROUNDING_MODE();
VARIANT4_RANDOM_MATH_INIT(0);
VARIANT4_RANDOM_MATH_INIT(1);
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
uint64_t idx0 = al0;
uint64_t idx1 = al1;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
cx0 = soft_aesenc(cx0, ax0);
cx1 = soft_aesenc(cx1, ax1);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
idx0 = _mm_cvtsi128_si64(cx0);
idx1 = _mm_cvtsi128_si64(cx1);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
lo = _umul128(idx0, cl, &hi);
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
lo = _umul128(idx1, cl, &hi);
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
al1 += hi;
ah1 += lo;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
al1 ^= cl;
ah1 ^= ch;
idx1 = al1;
bx01 = bx00;
bx11 = bx10;
bx00 = cx0;
bx10 = cx1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}

View file

@ -4,9 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -30,8 +30,7 @@
#include <stdint.h>
extern __m128i soft_aesenc(__m128i in, __m128i key);
extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
#include "crypto/soft_aes.h"
// This will shift and xor tmp1 into itself as 4 32-bit vals such as

View file

@ -6,8 +6,8 @@
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -27,6 +27,9 @@
#define XMRIG_CRYPTONIGHT_TEST_H
#include <stdint.h>
const static uint8_t test_input[152] = {
0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
@ -67,6 +70,42 @@ const static uint8_t test_output_v2[64] = {
};
struct cn_r_test_input_data
{
uint64_t height;
size_t size;
uint8_t data[64];
};
const static struct cn_r_test_input_data cn_r_test_input[] = {
{ 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
{ 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
{ 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
{ 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
{ 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
{ 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
{ 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
{ 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
{ 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
{ 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
};
// "cn/r"
const static uint8_t test_output_r[] = {
0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
};
#ifndef XMRIG_NO_AEON
const static uint8_t test_output_v0_lite[64] = {

View file

@ -0,0 +1,449 @@
#ifndef VARIANT4_RANDOM_MATH_H
#define VARIANT4_RANDOM_MATH_H
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include "crypto/c_blake256.h"
enum V4_Settings
{
// Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications
TOTAL_LATENCY = 15 * 3,
// Always generate at least 60 instructions
NUM_INSTRUCTIONS_MIN = 60,
// Never generate more than 70 instructions (final RET instruction doesn't count here)
NUM_INSTRUCTIONS_MAX = 70,
// Available ALUs for MUL
// Modern CPUs typically have only 1 ALU which can do multiplications
ALU_COUNT_MUL = 1,
// Total available ALUs
// Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code
ALU_COUNT = 3,
};
enum V4_InstructionList
{
MUL, // a*b
ADD, // a+b + C, C is an unsigned 32-bit constant
SUB, // a-b
ROR, // rotate right "a" by "b & 31" bits
ROL, // rotate left "a" by "b & 31" bits
XOR, // a^b
RET, // finish execution
V4_INSTRUCTION_COUNT = RET,
};
// V4_InstructionDefinition is used to generate code from random data
// Every random sequence of bytes is valid code
//
// There are 9 registers in total:
// - 4 variable registers
// - 5 constant registers initialized from loop variables
// This is why dst_index is 2 bits
enum V4_InstructionDefinition
{
V4_OPCODE_BITS = 3,
V4_DST_INDEX_BITS = 2,
V4_SRC_INDEX_BITS = 3,
};
struct V4_Instruction
{
uint8_t opcode;
uint8_t dst_index;
uint8_t src_index;
uint32_t C;
};
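Every instruction is decoded from one random byte using the bit widths above. A small illustrative decoder (hypothetical helper, mirroring the decoding done inside v4_random_math_init() further down):

static void v4_decode_byte_sketch(uint8_t c, struct V4_Instruction *op)
{
    /* bits 0-2: raw opcode selector (0-2 -> MUL, 3 -> ADD, 4 -> SUB,
     * 5 -> ROR or ROL chosen from an extra random byte, 6-7 -> XOR) */
    const uint8_t raw = c & ((1 << V4_OPCODE_BITS) - 1);

    /* bits 3-4: destination register R0-R3 */
    op->dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);

    /* bits 5-7: source register R0-R7 (R8 is only selected by the a == b fix-up in the generator) */
    op->src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);

    /* The generator remaps the raw selector; case 5 needs one more random byte
     * to pick ROR vs ROL, which this sketch simplifies to ROR. */
    op->opcode = (raw >= 6) ? XOR : ((raw <= 2) ? MUL : (uint8_t) (raw - 2));

    op->C = 0; /* a 32-bit constant is appended later, and only for ADD */
}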
#ifndef FORCEINLINE
#ifdef __GNUC__
#define FORCEINLINE __attribute__((always_inline)) inline
#elif _MSC_VER
#define FORCEINLINE __forceinline
#else
#define FORCEINLINE inline
#endif
#endif
#ifndef UNREACHABLE_CODE
#ifdef __GNUC__
#define UNREACHABLE_CODE __builtin_unreachable()
#elif _MSC_VER
#define UNREACHABLE_CODE __assume(false)
#else
#define UNREACHABLE_CODE
#endif
#endif
#define SWAP32LE(x) x
#define SWAP64LE(x) x
#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
// every switch-case will point to the same destination on every iteration of the CryptoNight main loop
//
// This is about as fast as it can get without using low-level machine code generation
//template<typename v4_reg>
static void v4_random_math(const struct V4_Instruction* code, uint32_t r[9])
{
#define REG_BITS 32
#define V4_EXEC(i) \
{ \
const struct V4_Instruction* op = code + i; \
const uint32_t src = r[op->src_index]; \
uint32_t *dst = r + op->dst_index; \
switch (op->opcode) \
{ \
case MUL: \
*dst *= src; \
break; \
case ADD: \
*dst += src + op->C; \
break; \
case SUB: \
*dst -= src; \
break; \
case ROR: \
{ \
const uint32_t shift = src % REG_BITS; \
*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
} \
break; \
case ROL: \
{ \
const uint32_t shift = src % REG_BITS; \
*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
} \
break; \
case XOR: \
*dst ^= src; \
break; \
case RET: \
return; \
default: \
UNREACHABLE_CODE; \
break; \
} \
}
#define V4_EXEC_10(j) \
V4_EXEC(j + 0) \
V4_EXEC(j + 1) \
V4_EXEC(j + 2) \
V4_EXEC(j + 3) \
V4_EXEC(j + 4) \
V4_EXEC(j + 5) \
V4_EXEC(j + 6) \
V4_EXEC(j + 7) \
V4_EXEC(j + 8) \
V4_EXEC(j + 9)
// The generated program can have 60 plus a few more (usually 2-3) instructions to achieve the required latency
// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
//
// 60 27960
// 61 105054
// 62 2452759
// 63 5115997
// 64 1022269
// 65 1109635
// 66 153145
// 67 8550
// 68 4529
// 69 102
// Unroll 70 instructions here
V4_EXEC_10(0); // instructions 0-9
V4_EXEC_10(10); // instructions 10-19
V4_EXEC_10(20); // instructions 20-29
V4_EXEC_10(30); // instructions 30-39
V4_EXEC_10(40); // instructions 40-49
V4_EXEC_10(50); // instructions 50-59
V4_EXEC_10(60); // instructions 60-69
#undef V4_EXEC_10
#undef V4_EXEC
#undef REG_BITS
}
// If we don't have enough data available, generate more
static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
{
if (*data_index + bytes_needed > data_size)
{
hash_extra_blake(data, data_size, (char*) data);
*data_index = 0;
}
}
// Generates as many random math operations as possible with the given latency and ALU restrictions
// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
{
// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
// These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee Lake
//
// AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
// Source: https://www.agner.org/optimize/instruction_tables.pdf
const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
// Instruction latencies for theoretical ASIC implementation
const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
// Available ALUs for each instruction
const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
int8_t data[32];
memset(data, 0, sizeof(data));
uint64_t tmp = SWAP64LE(height);
memcpy(data, &tmp, sizeof(uint64_t));
data[20] = -38;
// Set data_index past the last byte in data
// to trigger full data update with blake hash
// before we start using it
size_t data_index = sizeof(data);
int code_size;
// There is a small chance (1.8%) that register R8 won't be used in the generated program
// So we keep track of it and try again if it's not used
bool r8_used;
do {
int latency[9];
int asic_latency[9];
// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
// byte 0: current value of the destination register
// byte 1: instruction opcode
// byte 2: current value of the source register
//
// Registers R4-R8 are constant and are treated as having the same value because when we do
// the same operation twice with two constant source registers, it can be optimized into a single operation
uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
bool is_rotation[V4_INSTRUCTION_COUNT];
bool rotated[4];
int rotate_count = 0;
memset(latency, 0, sizeof(latency));
memset(asic_latency, 0, sizeof(asic_latency));
memset(alu_busy, 0, sizeof(alu_busy));
memset(is_rotation, 0, sizeof(is_rotation));
memset(rotated, 0, sizeof(rotated));
is_rotation[ROR] = true;
is_rotation[ROL] = true;
int num_retries = 0;
code_size = 0;
int total_iterations = 0;
r8_used = false;
// Generate random code to achieve minimal required latency for our abstract CPU
// Try to get this latency for all 4 registers
while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
{
// Fail-safe to guarantee loop termination
++total_iterations;
if (total_iterations > 256)
break;
check_data(&data_index, 1, data, sizeof(data));
const uint8_t c = ((uint8_t*)data)[data_index++];
// MUL = opcodes 0-2
// ADD = opcode 3
// SUB = opcode 4
// ROR/ROL = opcode 5, shift direction is selected randomly
// XOR = opcodes 6-7
uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
if (opcode == 5)
{
check_data(&data_index, 1, data, sizeof(data));
opcode = (data[data_index++] >= 0) ? ROR : ROL;
}
else if (opcode >= 6)
{
opcode = XOR;
}
else
{
opcode = (opcode <= 2) ? MUL : (opcode - 2);
}
uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
const int a = dst_index;
int b = src_index;
// Don't do ADD/SUB/XOR with the same register
if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
{
// a is always < 4, so we don't need to check bounds here
b = 8;
src_index = b;
}
// Don't do rotation with the same destination twice because it's equal to a single rotation
if (is_rotation[opcode] && rotated[a])
{
continue;
}
// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
// 2xXOR(a, b) = NOP
if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
{
continue;
}
// Find which ALU is available (and when) for this instruction
int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
int alu_index = -1;
while (next_latency < TOTAL_LATENCY)
{
for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
{
if (!alu_busy[next_latency][i])
{
// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
if ((opcode == ADD) && alu_busy[next_latency + 1][i])
{
continue;
}
// Rotation can only start when previous rotation is finished, so do an additional availability check
if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
{
continue;
}
alu_index = i;
break;
}
}
if (alu_index >= 0)
{
break;
}
++next_latency;
}
// Don't generate instructions that leave some register unchanged for more than 7 cycles
if (next_latency > latency[a] + 7)
{
continue;
}
next_latency += op_latency[opcode];
if (next_latency <= TOTAL_LATENCY)
{
if (is_rotation[opcode])
{
++rotate_count;
}
// Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
latency[a] = next_latency;
// ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
rotated[a] = is_rotation[opcode];
inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
code[code_size].opcode = opcode;
code[code_size].dst_index = dst_index;
code[code_size].src_index = src_index;
code[code_size].C = 0;
if (src_index == 8)
{
r8_used = true;
}
if (opcode == ADD)
{
// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
// ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
uint32_t t;
memcpy(&t, data + data_index, sizeof(uint32_t));
code[code_size].C = SWAP32LE(t);
data_index += sizeof(uint32_t);
}
++code_size;
if (code_size >= NUM_INSTRUCTIONS_MIN)
{
break;
}
}
else
{
++num_retries;
}
}
// ASIC has more execution resources and can extract as much parallelism from the code as possible
// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
// Get this latency for at least 1 of the 4 registers
const int prev_code_size = code_size;
while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
{
int min_idx = 0;
int max_idx = 0;
for (int i = 1; i < 4; ++i)
{
if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
}
const uint8_t pattern[3] = { ROR, MUL, MUL };
const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
latency[min_idx] = latency[max_idx] + op_latency[opcode];
asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
code[code_size].opcode = opcode;
code[code_size].dst_index = min_idx;
code[code_size].src_index = max_idx;
code[code_size].C = 0;
++code_size;
}
// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
// It never does more than 4 iterations for all block heights < 10,000,000
} while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
// Add final instruction to stop the interpreter
code[code_size].opcode = RET;
code[code_size].dst_index = 0;
code[code_size].src_index = 0;
code[code_size].C = 0;
return code_size;
}
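/*
 * Editor's sketch (not part of the original diff): a minimal reference
 * interpreter for the instruction stream generated above, useful when
 * testing the generator. It assumes 32-bit registers r[0..8], where
 * r[0..3] are writable and r[4..8] are read-only inputs (src_index == 8 is
 * the substitute source picked when a == b), plus the opcode semantics
 * documented in the comments above. The name v4_interpret_sketch and the
 * <stdint.h>/<stddef.h> types are illustrative only.
 */
static void v4_interpret_sketch(const struct V4_Instruction *code, uint32_t *r)
{
    for (size_t i = 0; ; ++i) {
        const struct V4_Instruction op = code[i];
        const uint32_t src = r[op.src_index];
        uint32_t *dst = &r[op.dst_index];
        const uint32_t s = src & 31;

        switch (op.opcode) {
        case MUL: *dst *= src;                                    break;
        case ADD: *dst += src + op.C;                             break; /* C is the extra 32-bit constant */
        case SUB: *dst -= src;                                    break;
        case ROR: *dst = (*dst >> s) | (*dst << ((32 - s) & 31)); break;
        case ROL: *dst = (*dst << s) | (*dst >> ((32 - s) & 31)); break;
        case XOR: *dst ^= src;                                    break;
        case RET: return;
        default:  return;
        }
    }
}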
#endif

View file

@ -1,30 +1,24 @@
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
set(XMRIG_ASM_LIBRARY "xmrig-asm")
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
enable_language(ASM_MASM)
enable_language(ASM)
if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm")
else()
set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm")
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
set(XMRIG_ASM_FILES
"crypto/asm/win64/cn_main_loop.S"
"crypto/asm/CryptonightR_template.S"
)
else()
enable_language(ASM)
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S")
else()
set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S")
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
set(XMRIG_ASM_FILES
"crypto/asm/cn_main_loop.S"
"crypto/asm/CryptonightR_template.S"
)
endif()
add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE})
set(XMRIG_ASM_SOURCES "")
set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C)
add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES})
set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c")
set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
else()
set(XMRIG_ASM_SOURCES "")

cpu.c
View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -64,20 +65,20 @@ void cpu_init_common() {
if (data.flags[CPU_FEATURE_AES]) {
cpu_info.flags |= CPU_FLAG_AES;
# ifndef XMRIG_NO_ASM
if (data.vendor == VENDOR_AMD) {
cpu_info.assembly = ASM_RYZEN;
}
else if (data.vendor == VENDOR_INTEL) {
cpu_info.assembly = ASM_INTEL;
}
# endif
}
if (data.flags[CPU_FEATURE_BMI2]) {
cpu_info.flags |= CPU_FLAG_BMI2;
}
# ifndef XMRIG_NO_ASM
if (data.vendor == VENDOR_AMD) {
cpu_info.assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER;
}
else if (data.vendor == VENDOR_INTEL) {
cpu_info.assembly = ASM_INTEL;
}
# endif
}
#endif

cpu.h
View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by

crypto/CryptonightR_gen.c Normal file
View file

@ -0,0 +1,146 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <string.h>
#include "algo/cryptonight/cryptonight_monero.h"
#include "crypto/asm/CryptonightR_template.h"
#include "persistent_memory.h"
static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)())
{
const ptrdiff_t size = (const uint8_t*)(p2) - (const uint8_t*)(p1);
if (size > 0) {
memcpy(*p, (const void *) p1, size);
*p += size;
}
}
static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM)
{
uint32_t prev_rot_src = (uint32_t)(-1);
for (int i = 0;; ++i) {
const struct V4_Instruction inst = code[i];
if (inst.opcode == RET) {
break;
}
uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
uint8_t dst_index = inst.dst_index;
uint8_t src_index = inst.src_index;
const uint32_t a = inst.dst_index;
const uint32_t b = inst.src_index;
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
switch (inst.opcode) {
case ROR:
case ROL:
if (b != prev_rot_src) {
prev_rot_src = b;
add_code(p, instructions_mov[c], instructions_mov[c + 1]);
}
break;
}
if (a == prev_rot_src) {
prev_rot_src = (uint32_t)(-1);
}
void_func begin = instructions[c];
if ((ASM == ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
uint8_t* prefix = (uint8_t*) begin;
if (*prefix == 0x49) {
**p = 0x41;
*p += 1;
}
begin = (void_func)(prefix + 1);
}
add_code(p, begin, instructions[c + 1]);
if (inst.opcode == ADD) {
*(uint32_t*)(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
if (is_64_bit) {
prev_rot_src = (uint32_t)(-1);
}
}
}
}
void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
{
uint8_t* p0 = machine_code;
uint8_t* p = p0;
add_code(&p, CryptonightR_template_part1, CryptonightR_template_part2);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_template_part2, CryptonightR_template_part3);
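	/* Editor's note (added comment): the store below patches the 32-bit relative
	   displacement of the conditional jump back to CryptonightR_template_mainloop
	   (the last 4 bytes just copied). The copied mainloop label sits at the same
	   offset from p0 as it does from part1 in the original template, so the new
	   displacement is (mainloop - part1) - (p - p0). */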
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
add_code(&p, CryptonightR_template_part3, CryptonightR_template_end);
flush_instruction_cache(machine_code, p - p0);
}
void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
{
uint8_t* p0 = (uint8_t*) machine_code;
uint8_t* p = p0;
add_code(&p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
add_code(&p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
flush_instruction_cache(machine_code, p - p0);
}
void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
{
uint8_t* p0 = machine_code;
uint8_t* p = p0;
add_code(&p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0));
add_code(&p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
flush_instruction_cache(machine_code, p - p0);
}
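/*
 * Editor's sketch (assumption, not in the diff): how these compilers are
 * typically driven once per block height. The function name and the fixed-size
 * code[] buffer are illustrative; v4_random_math_init() is assumed to be the
 * generator whose tail appears earlier in this commit, and generated_code /
 * generated_code_height are the cryptonight_ctx fields set up by init_cn_r().
 */
static void example_compile_for_height(struct cryptonight_ctx *ctx, uint64_t height, enum Assembly assembly)
{
    if (ctx->generated_code_height == height) {
        return; /* machine code for this height is already compiled */
    }

    struct V4_Instruction code[256];
    const int code_size = v4_random_math_init(code, height);

    v4_compile_code(code, code_size, (void*) ctx->generated_code, assembly);
    ctx->generated_code_height = height;
}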

View file

@ -0,0 +1,279 @@
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_part1):
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
FN_PREFIX(CryptonightR_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
FN_PREFIX(CryptonightR_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightR_soft_aes_template_end):

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,531 @@
PUBLIC FN_PREFIX(CryptonightR_template_part1)
PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_part2)
PUBLIC FN_PREFIX(CryptonightR_template_part3)
PUBLIC FN_PREFIX(CryptonightR_template_end)
PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightR_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightR_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
FN_PREFIX(CryptonightR_template_part2):
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightR_template_mainloop)
FN_PREFIX(CryptonightR_template_part3):
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightR_template_end):
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_part1):
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movq rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightR_template_double_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightR_template_double_part3):
movq r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
mov rdi, rcx
mov r8, rax
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightR_template_double_mainloop)
FN_PREFIX(CryptonightR_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightR_template_double_end):

View file

@ -94,7 +94,7 @@
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN 16
ALIGN(64)
main_loop_double_sandybridge:
movdqu xmm9, xmm15
mov eax, edx

View file

@ -0,0 +1,180 @@
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN(64)
cnv2_main_loop_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
mov edi, 1023
shl rdi, 52
movq r14, xmm5
pextrq rax, xmm5, 1
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movq xmm0, rax
sqrtsd xmm1, xmm0
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_bulldozer
shr rdi, 19
sqrt_fixup_bulldozer_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_bulldozer
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_bulldozer_endp
sqrt_fixup_bulldozer:
movq r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_bulldozer_ret
cnv2_main_loop_bulldozer_endp:

View file

@ -50,7 +50,7 @@
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN 16
ALIGN(64)
main_loop_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d

View file

@ -45,7 +45,7 @@
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
ALIGN(64)
main_loop_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11

View file

@ -1,4 +1,8 @@
#define ALIGN .align
#ifdef __APPLE__
# define ALIGN(x) .align 6
#else
# define ALIGN(x) .align 64
#endif
.intel_syntax noprefix
#ifdef __APPLE__
# define FN_PREFIX(fn) _ ## fn
@ -9,29 +13,42 @@
#endif
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
ALIGN 16
ALIGN(64)
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv2_main_loop_ivybridge.inc"
#include "cn2/cnv2_main_loop_ivybridge.inc"
add rsp, 48
ret 0
mov eax, 3735929054
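/* Editor's note (added comment): 3735929054 is 0xDEADC0DE; it sits after the ret,
   so it is never executed and apparently serves as an end-of-function marker. */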
ALIGN 16
ALIGN(64)
FN_PREFIX(cnv2_mainloop_ryzen_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv2_main_loop_ryzen.inc"
#include "cn2/cnv2_main_loop_ryzen.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN 16
ALIGN(64)
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_main_loop_bulldozer.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
sub rsp, 48
mov rcx, rdi
mov rdx, rsi
#include "cnv2_double_main_loop_sandybridge.inc"
#include "cn2/cnv2_double_main_loop_sandybridge.inc"
add rsp, 48
ret 0
mov eax, 3735929054

View file

@ -1,25 +0,0 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
ALIGN 64
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cnv2_main_loop_ivybridge.inc
ret 0
cnv2_mainloop_ivybridge_asm ENDP
ALIGN 64
cnv2_mainloop_ryzen_asm PROC
INCLUDE cnv2_main_loop_ryzen.inc
ret 0
cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cnv2_double_main_loop_sandybridge.inc
ret 0
cnv2_double_mainloop_sandybridge_asm ENDP
_TEXT_CNV2_MAINLOOP ENDS
END

View file

@ -0,0 +1,31 @@
#define ALIGN(x) .align 64
.intel_syntax noprefix
.section .text
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_mainloop_bulldozer_asm
.global cnv2_double_mainloop_sandybridge_asm
ALIGN(64)
cnv2_mainloop_ivybridge_asm:
#include "../cn2/cnv2_main_loop_ivybridge.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_mainloop_ryzen_asm:
#include "../cn2/cnv2_main_loop_ryzen.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_mainloop_bulldozer_asm:
#include "../cn2/cnv2_main_loop_bulldozer.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_double_mainloop_sandybridge_asm:
#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
ret 0
mov eax, 3735929054

View file

@ -1,21 +0,0 @@
#define ALIGN .align
.intel_syntax noprefix
.section .text
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_double_mainloop_sandybridge_asm
ALIGN 16
cnv2_mainloop_ivybridge_asm:
#include "../cnv2_main_loop_ivybridge.inc"
ret 0
ALIGN 16
cnv2_mainloop_ryzen_asm:
#include "../cnv2_main_loop_ryzen.inc"
ret 0
ALIGN 16
cnv2_double_mainloop_sandybridge_asm:
#include "../cnv2_double_main_loop_sandybridge.inc"
ret 0

View file

@ -1,212 +0,0 @@
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Additional permission under GNU GPL version 3 section 7
*
* If you modify this Program, or any covered work, by linking or combining
* it with OpenSSL (or a modified version of that library), containing parts
* covered by the terms of OpenSSL License and SSLeay License, the licensors
* of this Program grant you additional permission to convey the resulting work.
*
*/
/*
* The original author of this AES implementation is Karl Malbrain.
*/
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif // __GNUC__
#include <inttypes.h>
#define TABLE_ALIGN 32
#define WPOLY 0x011b
#define N_COLS 4
#define AES_BLOCK_SIZE 16
#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2))
#if defined(_MSC_VER)
#define ALIGN __declspec(align(TABLE_ALIGN))
#elif defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(16)))
#else
#define ALIGN
#endif
#define rf1(r,c) (r)
#define word_in(x,c) (*((uint32_t*)(x)+(c)))
#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
#define s(x,c) x[c]
#define si(y,x,c) (s(y,c) = word_in(x, c))
#define so(y,x,c) word_out(y, c, s(x,c))
#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3)
#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
#define round(y,x,k) \
y[0] = (k)[0] ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \
y[1] = (k)[1] ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \
y[2] = (k)[2] ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \
y[3] = (k)[3] ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]);
#define to_byte(x) ((x) & 0xff)
#define bval(x,n) to_byte((x) >> (8 * (n)))
#define fwd_var(x,r,c)\
( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
: r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
: r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
: ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
#define sb_data(w) {\
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
#define rc_data(w) {\
w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
w(0x1b), w(0x36) }
#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
#define h0(x) (x)
#define w0(p) bytes2word(p, 0, 0, 0)
#define w1(p) bytes2word(0, p, 0, 0)
#define w2(p) bytes2word(0, 0, p, 0)
#define w3(p) bytes2word(0, 0, 0, p)
#define u0(p) bytes2word(f2(p), p, p, f3(p))
#define u1(p) bytes2word(f3(p), f2(p), p, p)
#define u2(p) bytes2word(p, f3(p), f2(p), p)
#define u3(p) bytes2word(p, p, f3(p), f2(p))
#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p))
#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p))
#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p))
#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p))
#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY))
#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY))
#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
#define t_dec(m,n) t_##m##n
#define t_set(m,n) t_##m##n
#define t_use(m,n) t_##m##n
#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) }
#define four_tables(x,tab,vf,rf,c) \
(tab[0][bval(vf(x,0,c),rf(0,c))] \
^ tab[1][bval(vf(x,1,c),rf(1,c))] \
^ tab[2][bval(vf(x,2,c),rf(2,c))] \
^ tab[3][bval(vf(x,3,c),rf(3,c))])
d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);
__m128i soft_aesenc(__m128i in, __m128i key)
{
uint32_t x0, x1, x2, x3;
x0 = _mm_cvtsi128_si32(in);
x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
__m128i out = _mm_set_epi32(
(t_fn[0][x3 & 0xff] ^ t_fn[1][(x0 >> 8) & 0xff] ^ t_fn[2][(x1 >> 16) & 0xff] ^ t_fn[3][x2 >> 24]),
(t_fn[0][x2 & 0xff] ^ t_fn[1][(x3 >> 8) & 0xff] ^ t_fn[2][(x0 >> 16) & 0xff] ^ t_fn[3][x1 >> 24]),
(t_fn[0][x1 & 0xff] ^ t_fn[1][(x2 >> 8) & 0xff] ^ t_fn[2][(x3 >> 16) & 0xff] ^ t_fn[3][x0 >> 24]),
(t_fn[0][x0 & 0xff] ^ t_fn[1][(x1 >> 8) & 0xff] ^ t_fn[2][(x2 >> 16) & 0xff] ^ t_fn[3][x3 >> 24]));
return _mm_xor_si128(out, key);
}
uint8_t Sbox[256] = { // forward s-box
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
static inline void sub_word(uint8_t* key)
{
key[0] = Sbox[key[0]];
key[1] = Sbox[key[1]];
key[2] = Sbox[key[2]];
key[3] = Sbox[key[3]];
}
#ifdef __clang__
uint32_t _rotr(uint32_t value, uint32_t amount)
{
return (value >> amount) | (value << ((32 - amount) & 31));
}
#endif
__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
{
uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
sub_word((uint8_t*)&X1);
sub_word((uint8_t*)&X3);
return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1);
}

crypto/soft_aes.h Normal file
View file

@ -0,0 +1,131 @@
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Additional permission under GNU GPL version 3 section 7
*
* If you modify this Program, or any covered work, by linking or combining
* it with OpenSSL (or a modified version of that library), containing parts
* covered by the terms of OpenSSL License and SSLeay License, the licensors
* of this Program grant you additional permission to convey the resulting work.
*
*/
/*
* Parts of this file are originally copyright (c) 2014-2017, The Monero Project
*/
#pragma once
#if defined(XMRIG_ARM)
# include "crypto/SSE2NEON.h"
#elif defined(__GNUC__)
# include <x86intrin.h>
#else
# include <intrin.h>
#endif
#include <inttypes.h>
#define saes_data(w) {\
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
#define SAES_WPOLY 0x011b
#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
#define saes_f2(x) ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY))
#define saes_f3(x) (saes_f2(x) ^ x)
#define saes_h0(x) (x)
#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p))
#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p)
#define saes_u2(p) saes_b2w( p, saes_f3(p), saes_f2(p), p)
#define saes_u3(p) saes_b2w( p, p, saes_f3(p), saes_f2(p))
__attribute__((aligned(16))) const static uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
__attribute__((aligned(16))) const static uint8_t saes_sbox[256] = saes_data(saes_h0);
static inline __m128i soft_aesenc(__m128i in, __m128i key)
{
uint32_t x0, x1, x2, x3;
x0 = _mm_cvtsi128_si32(in);
x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
__m128i out = _mm_set_epi32(
(saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),
(saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]),
(saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]),
(saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24]));
return _mm_xor_si128(out, key);
}
static inline uint32_t sub_word(uint32_t key)
{
return (saes_sbox[key >> 24 ] << 24) |
(saes_sbox[(key >> 16) & 0xff] << 16 ) |
(saes_sbox[(key >> 8) & 0xff] << 8 ) |
saes_sbox[key & 0xff];
}
#if defined(__clang__) || defined(XMRIG_ARM)
static inline uint32_t _rotr(uint32_t value, uint32_t amount)
{
return (value >> amount) | (value << ((32 - amount) & 31));
}
#endif
static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
{
const uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)));
const uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)));
return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1);
}
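/*
 * Editor's sketch (not part of the diff): soft_aesenc() is intended as a
 * drop-in for the hardware _mm_aesenc_si128 (one AES round: SubBytes,
 * ShiftRows, MixColumns, then XOR with the round key) on CPUs without
 * AES-NI. The helper below is illustrative and assumes the translation unit
 * is built with AES-NI enabled so the intrinsic is available.
 */
static inline __m128i aes_round_sketch(__m128i block, __m128i round_key, int use_soft_aes)
{
    return use_soft_aes ? soft_aesenc(block, round_key)
                        : _mm_aesenc_si128(block, round_key);
}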

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -74,3 +75,21 @@ void persistent_memory_free() {
_mm_free(persistent_memory);
}
}
void *allocate_executable_memory(size_t size)
{
return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
}
void protect_executable_memory(void *p, size_t size)
{
mprotect(p, size, PROT_READ | PROT_EXEC);
}
void flush_instruction_cache(void *p, size_t size)
{
__builtin___clear_cache((char*) p, (char*)(p) + size);
}
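/*
 * Editor's sketch (assumption, not in the diff): intended lifecycle of the
 * three helpers above when emitting CryptonightR machine code at run time.
 * The buffer size, copy source and omitted error handling are illustrative
 * only; <string.h> is assumed for memcpy.
 */
static void *example_emit_code(const void *generated, size_t size)
{
    void *p = allocate_executable_memory(0x4000); /* writable + executable pages */
    memcpy(p, generated, size);                   /* copy the generated machine code */
    protect_executable_memory(p, 0x4000);         /* drop write permission, keep RX */
    flush_instruction_cache(p, size);             /* make the new code visible to the CPU */
    return p;
}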

View file

@ -4,7 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2018 XMRig <support@xmrig.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -41,14 +43,29 @@ void * persistent_calloc(size_t num, size_t size) {
}
void init_cn_r(struct cryptonight_ctx *ctx)
{
uint8_t *p = allocate_executable_memory(0x4000);
ctx->generated_code = (cn_mainloop_fun_ms_abi) p;
ctx->generated_code_double = (cn_mainloop_double_fun_ms_abi)(p + 0x2000);
ctx->generated_code_height = ctx->generated_code_double_height = (uint64_t)(-1);
ctx->height = 0;
}
void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id)
{
const int ratio = (opt_double_hash && opt_algo == ALGO_CRYPTONIGHT) ? 2 : 1;
ctx[0] = persistent_calloc(1, sizeof(struct cryptonight_ctx));
ctx[0]->memory = &persistent_memory[MEMORY * (thr_id * ratio + 1)];
init_cn_r(ctx[0]);
if (opt_double_hash) {
ctx[1] = persistent_calloc(1, sizeof(struct cryptonight_ctx));
ctx[1]->memory = ctx[0]->memory + (opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE);
init_cn_r(ctx[1]);
}
}

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -74,6 +75,8 @@ static struct AlgoData const algorithms[] = {
{ "cryptonight/0", "cn/0", ALGO_CRYPTONIGHT, VARIANT_0 },
{ "cryptonight/1", "cn/1", ALGO_CRYPTONIGHT, VARIANT_1 },
{ "cryptonight/2", "cn/2", ALGO_CRYPTONIGHT, VARIANT_2 },
{ "cryptonight/4", "cn/4", ALGO_CRYPTONIGHT, VARIANT_4 },
{ "cryptonight/r", "cn/r", ALGO_CRYPTONIGHT, VARIANT_4 },
# ifndef XMRIG_NO_AEON
{ "cryptonight-lite", "cn-lite", ALGO_CRYPTONIGHT_LITE, VARIANT_AUTO },
@ -88,7 +91,7 @@ static char const usage[] = "\
Usage: " APP_ID " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO cryptonight (default) or cryptonight-lite\n\
--variant=N cryptonight variant: 0-2\n\
--variant=N cryptonight variant: 0-4\n\
-o, --url=URL URL of mining server\n\
-b, --backup-url=URL URL of backup mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@ -124,7 +127,7 @@ static struct option const options[] = {
{ "cpu-affinity", 1, NULL, 1020 },
{ "donate-level", 1, NULL, 1003 },
{ "help", 0, NULL, 'h' },
{ "keepalive", 0, NULL ,'k' },
{ "keepalive", 0, NULL, 'k' },
{ "max-cpu-usage", 1, NULL, 1004 },
{ "nicehash", 0, NULL, 1006 },
{ "no-color", 0, NULL, 1002 },
@ -156,6 +159,7 @@ static const char *variant_names[] = {
"0",
"1",
"2",
"4"
};
@ -163,7 +167,8 @@ static const char *asm_names[] = {
"none",
"auto",
"intel",
"ryzen"
"ryzen",
"bulldozer"
};
@ -380,9 +385,13 @@ static void parse_arg(int key, char *arg) {
case 1021: /* --variant */
v = atoi(arg);
if (v > VARIANT_AUTO && v < VARIANT_MAX) {
if (v == 4 || strcasecmp(arg, "r") == 0) {
opt_variant = VARIANT_4;
}
else if (v > VARIANT_AUTO && v < VARIANT_MAX) {
opt_variant = v;
}
break;
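        /* Editor's note (added comment): with this change both "--variant 4" and
           "--variant r" select VARIANT_4 (CryptonightR); numeric values 0-2 keep
           their previous meaning. */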
case 1006: /* --nicehash */
@ -397,7 +406,7 @@ static void parse_arg(int key, char *arg) {
static void parse_config(json_t *config, char *ref)
{
int i;
size_t i;
char buf[16];
json_t *val;

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -44,6 +45,7 @@ enum Variant {
VARIANT_0 = 0,
VARIANT_1 = 1,
VARIANT_2 = 2,
VARIANT_4 = 3,
VARIANT_MAX
};
@ -63,6 +65,7 @@ enum Assembly {
ASM_AUTO,
ASM_INTEL,
ASM_RYZEN,
ASM_BULLDOZER,
ASM_MAX
};

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -47,8 +48,15 @@ extern int persistent_memory_flags;
const char * persistent_memory_allocate();
void persistent_memory_free();
void * persistent_calloc(size_t num, size_t size);
void *persistent_calloc(size_t num, size_t size);
void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id);
void *allocate_executable_memory(size_t size);
void flush_instruction_cache(void *p, size_t size);
void init_cn_r(struct cryptonight_ctx *ctx);
void protect_executable_memory(void *p, size_t size);
#endif /* XMRIG_PERSISTENT_MEMORY_H */

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -709,6 +710,8 @@ static bool job_decode(const json_t *job) {
memset(work.job_id, 0, sizeof(work.job_id));
memcpy(work.job_id, job_id, strlen(job_id));
work.height = (uint64_t) json_integer_value(json_object_get(job, "height"));
return true;
}
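Aside (illustrative, with a made-up JSON fragment): "height" is read with the same jansson calls used above; json_integer_value() returns 0 when the key is missing or not an integer, so jobs from pools that do not send a height simply end up with height == 0.

#include <jansson.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    json_error_t err;
    json_t *job = json_loads("{\"job_id\": \"1\", \"height\": 1800000}", 0, &err);
    if (!job) {
        fprintf(stderr, "JSON error: %s\n", err.text);
        return 1;
    }

    /* Missing or non-integer "height" yields 0 here, matching the behaviour of job_decode(). */
    const uint64_t height = (uint64_t) json_integer_value(json_object_get(job, "height"));
    printf("job height: %llu\n", (unsigned long long) height);

    json_decref(job);
    return 0;
}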

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -42,6 +43,7 @@ struct work {
uint32_t target __attribute__((aligned(16)));
uint32_t hash[8] __attribute__((aligned(16)));
char job_id[64] __attribute__((aligned(16)));
uint64_t height;
};

View file

@ -74,3 +74,23 @@ void persistent_memory_free() {
_mm_free(persistent_memory);
}
}
void *allocate_executable_memory(size_t size)
{
return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}
void protect_executable_memory(void *p, size_t size)
{
mprotect(p, size, PROT_READ | PROT_EXEC);
}
void flush_instruction_cache(void *p, size_t size)
{
# ifndef __FreeBSD__
__builtin___clear_cache((char*) p, (char*)(p) + size);
# endif
}
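Note (not part of the diff): allocate_executable_memory() returns the raw mmap() result, and mmap() reports failure as MAP_FAILED rather than NULL. A small standalone sketch of how a caller could check for that:

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

static void *checked_exec_alloc(size_t size)
{
    void *p = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {          /* mmap signals failure with MAP_FAILED, not NULL */
        perror("mmap");
        return NULL;
    }
    return p;
}

int main(void)
{
    void *buf = checked_exec_alloc(4096);
    printf("executable buffer at %p\n", buf);
    return buf == NULL;
}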

View file

@ -5,7 +5,8 @@
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -27,13 +28,13 @@
#define APP_ID "xmrig"
#define APP_NAME "XMRig"
#define APP_DESC "Monero (XMR) CPU miner"
#define APP_VERSION "0.9.0-dev"
#define APP_VERSION "0.10.0-dev"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2018 xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2019 xmrig.com"
#define APP_VER_MAJOR 0
#define APP_VER_MINOR 9
#define APP_VER_MINOR 10
#define APP_VER_BUILD 0
#define APP_VER_REV 0

View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -21,9 +22,6 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MEMORY_H__
#define __MEMORY_H__
#include <windows.h>
#include <ntsecapi.h>
#include <tchar.h>
@ -172,4 +170,21 @@ void persistent_memory_free() {
}
}
#endif /* __MEMORY_H__ */
void *allocate_executable_memory(size_t size)
{
return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
}
void protect_executable_memory(void *p, size_t size)
{
DWORD oldProtect;
VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect);
}
void flush_instruction_cache(void *p, size_t size)
{
FlushInstructionCache(GetCurrentProcess(), p, size);
}

xmrig.c
View file

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -306,8 +307,10 @@ static void *miner_thread(void *userdata) {
struct timeval tv_start;
gettimeofday(&tv_start, NULL);
persistentctx[0]->height = work.height;
/* scan nonces for a proof-of-work hash */
const int rc = scanhash_cryptonight(thr_id, hash, (const uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
const int rc = scanhash_cryptonight(thr_id, hash, (uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
stats_add_hashes(thr_id, &tv_start, hashes_done);
if (!rc) {
@ -391,6 +394,9 @@ static void *miner_thread_double(void *userdata) {
struct timeval tv_start;
gettimeofday(&tv_start, NULL);
persistentctx[0]->height = work.height;
persistentctx[1]->height = work.height;
/* scan nonces for a proof-of-work hash */
const int rc = scanhash_cryptonight_double(thr_id, (uint32_t *) double_hash, double_blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
stats_add_hashes(thr_id, &tv_start, hashes_done);
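Closing note (assumptions flagged inline): copying the job height into each persistent context is what makes the CryptoNight-R path height-aware; init_cn_r() from persistent_memory.h prepares the per-context machinery, though exactly when the variant-4 code is (re)generated is not visible in this diff. A purely hypothetical, self-contained guard illustrating the idea of rebuilding only when the height changes:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical, trimmed-down stand-in for the per-thread context: only the
 * height handling is modeled, nothing here mirrors the real struct layout. */
struct toy_ctx {
    uint64_t height;
    uint64_t code_height;   /* hypothetical: height the current generated code was built for */
};

static void rebuild_variant4_code(struct toy_ctx *ctx)
{
    /* placeholder for regenerating the height-dependent program */
    ctx->code_height = ctx->height;
    printf("rebuilt code for height %llu\n", (unsigned long long) ctx->height);
}

static void set_job_height(struct toy_ctx *ctx, uint64_t job_height)
{
    ctx->height = job_height;
    if (ctx->code_height != ctx->height) {   /* rebuild only when the height actually moved */
        rebuild_variant4_code(ctx);
    }
}

int main(void)
{
    struct toy_ctx ctx = { 0, 0 };
    set_job_height(&ctx, 1800000);   /* made-up height: triggers a rebuild */
    set_job_height(&ctx, 1800000);   /* same height: no rebuild */
    set_job_height(&ctx, 1800001);   /* new height: rebuild again */
    return 0;
}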