diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c
index cd91f9a8e..a501c61ca 100644
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@@ -41,8 +41,6 @@
 #include "cryptonight_test.h"
 #include "options.h"
 
-#include "utils/applog.h"
-
 
 void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
@@ -101,7 +99,7 @@ static bool self_test() {
     if (opt_algo == ALGO_CRYPTONIGHT) {
         result = verify(VARIANT_0, output, ctx, test_output_v0) &&
                  verify(VARIANT_1, output, ctx, test_output_v1) &&
-                 verify(VARIANT_0, output, ctx, test_output_v0);
+                 verify(VARIANT_2, output, ctx, test_output_v2);
     }
     else {
         result = verify(VARIANT_0, output, ctx, test_output_v0_lite) &&
diff --git a/algo/cryptonight/cryptonight_av1.c b/algo/cryptonight/cryptonight_av1.c
index 4028dd5d0..9ef83b07d 100644
--- a/algo/cryptonight/cryptonight_av1.c
+++ b/algo/cryptonight/cryptonight_av1.c
@@ -136,5 +136,58 @@ void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *res
 
 void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = _mm_aesenc_si128(cx, ax0);
+
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(0, cl, cx);
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        bx1 = bx0;
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
 
diff --git a/algo/cryptonight/cryptonight_av3.c b/algo/cryptonight/cryptonight_av3.c
index a70197ce4..f15042b98 100644
--- a/algo/cryptonight/cryptonight_av3.c
+++ b/algo/cryptonight/cryptonight_av3.c
@@ -136,4 +136,58 @@ void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *res
 
 void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = soft_aesenc(cx, ax0);
+
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(0, cl, cx);
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        bx1 = bx0;
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
diff --git a/algo/cryptonight/cryptonight_monero.h b/algo/cryptonight/cryptonight_monero.h
index 44ac27b03..2f64ad0a8 100644
--- a/algo/cryptonight/cryptonight_monero.h
+++ b/algo/cryptonight/cryptonight_monero.h
@@ -31,45 +31,64 @@
 #include <fenv.h>
 
 
-#define VARIANT1_INIT(part) \
+static inline __m128i int_sqrt_v2(const uint64_t n0)
+{
+    __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+    x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+    uint64_t r = (uint64_t)(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+
+    const uint64_t s = r >> 20;
+    r >>= 19;
+
+    uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+# if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64))
+    _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+# else
+    if (x2 < n0) ++r;
+# endif
+
+    return _mm_cvtsi64_si128(r);
+}
+
+
+# define VARIANT1_INIT(part) \
     uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \
     *((const uint64_t*)(ctx[part]->state) + 24)); \
 
-#ifndef XMRIG_ARM
 # define VARIANT2_INIT(part) \
     __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \
     __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]);
 
 #ifdef _MSC_VER
-# define VARIANT2_SET_ROUNDING_MODE() if (VARIANT == xmrig::VARIANT_2) { _control87(RC_DOWN, MCW_RC); }
+# define VARIANT2_SET_ROUNDING_MODE() { _control87(RC_DOWN, MCW_RC); }
 #else
-# define VARIANT2_SET_ROUNDING_MODE() if (VARIANT == xmrig::VARIANT_2) { fesetround(FE_DOWNWARD); }
+# define VARIANT2_SET_ROUNDING_MODE() { fesetround(FE_DOWNWARD); }
 #endif
 
 # define VARIANT2_INTEGER_MATH(part, cl, cx) \
-    do { \
-        const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
+    { \
+        const uint64_t sqrt_result = (uint64_t)(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
         const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
-        cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
-        const uint32_t d = static_cast<uint32_t>(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \
+        cl ^= (uint64_t)(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
+        const uint32_t d = (uint32_t)(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \
        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
-        const uint64_t division_result = static_cast<uint32_t>(cx_1 / d) + ((cx_1 % d) << 32); \
-        division_result_xmm_##part = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+        const uint64_t division_result = (uint32_t)(cx_1 / d) + ((cx_1 % d) << 32); \
+        division_result_xmm_##part = _mm_cvtsi64_si128((int64_t)(division_result)); \
         sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
-    } while (0)
+    }
 
 # define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
-    do { \
+    { \
         const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
         const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
         const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
         _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
         _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
         _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
-    } while (0)
+    }
 
 # define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
-    do { \
+    { \
         const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
         const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
         hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
@@ -78,48 +97,6 @@
         _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
         _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
         _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
-    } while (0)
+    }
 
-#else
-# define VARIANT2_INIT(part) \
-    uint64_t division_result_##part = h##part[12]; \
-    uint64_t sqrt_result_##part = h##part[13];
-
-# define VARIANT2_INTEGER_MATH(part, cl, cx) \
-    do { \
-        const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
-        cl ^= division_result_##part ^ (sqrt_result_##part << 32); \
-        const uint32_t d = static_cast<uint32_t>(cx_0 + (sqrt_result_##part << 1)) | 0x80000001UL; \
-        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
-        division_result_##part = static_cast<uint32_t>(cx_1 / d) + ((cx_1 % d) << 32); \
-        const uint64_t sqrt_input = cx_0 + division_result_##part; \
-        sqrt_result_##part = sqrt(sqrt_input + 18446744073709551616.0) * 2.0 - 8589934592.0; \
-        const uint64_t s = sqrt_result_##part >> 1; \
-        const uint64_t b = sqrt_result_##part & 1; \
-        const uint64_t r2 = (uint64_t)(s) * (s + b) + (sqrt_result_##part << 32); \
-        sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \
-    } while (0)
-
-# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
-    do { \
-        const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))); \
-        const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \
-        const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \
-        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
-        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
-        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
-    } while (0)
-
-# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
-    do { \
-        const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \
-        const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \
-        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
-        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \
-        const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \
-        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
-        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
-        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
-    } while (0)
-#endif
 
 #endif /* XMRIG_CRYPTONIGHT_MONERO_H */