/*
 * Patch overview (commentary only; everything after this comment is the
 * unmodified patch text).
 *
 * NOTE: the original line breaks of this patch have been collapsed. Inside
 * the hunks a standalone "+" marks the start of an added line and a
 * standalone "-" marks a removed line; the "+" inside "input + size" and
 * "output + 32" is ordinary C arithmetic, not a diff marker.
 *
 * Files touched:
 *   algo/cryptonight/cryptonight_av2.c
 *       Adds the body of cryptonight_av2_v2(); its signature is unchanged
 *       context.
 *   algo/cryptonight/cryptonight_av4.c
 *       Changes the last parameter of cryptonight_av4_v2() from
 *       "struct cryptonight_ctx *restrict ctx" to
 *       "struct cryptonight_ctx **restrict ctx" and adds its body.
 *
 * Both bodies are the same statement for statement except for the AES round:
 * the av2 version calls _mm_aesenc_si128(), the av4 version calls
 * soft_aesenc().
 *
 * What each function does, in order:
 *   1. Runs keccak() twice: over (input, size) into ctx[0]->state and over
 *      (input + size, size) into ctx[1]->state, so two inputs laid out back
 *      to back are processed in one call.
 *   2. Invokes VARIANT2_INIT(0), VARIANT2_INIT(1) and
 *      VARIANT2_SET_ROUNDING_MODE() (macros defined outside this patch).
 *   3. Calls cn_explode_scratchpad() for each lane (h0 -> l0, h1 -> l1).
 *   4. Seeds al/ah from state words 0^4 and 1^5, bx?0 from words 2^6 and 3^7,
 *      and bx?1 from words 8^10 and 9^11.
 *   5. Loops 0x80000 times; the two lanes are interleaved. Per lane:
 *        - load 16 bytes from the scratchpad at (idx & 0x1FFFF0);
 *        - apply one AES round keyed with ax = (ah, al);
 *        - VARIANT2_SHUFFLE, then store (bx?0 XOR cx) at the same offset;
 *        - idx becomes the value returned by _mm_cvtsi128_si64(cx);
 *        - read two 64-bit words cl, ch at the new offset;
 *        - VARIANT2_INTEGER_MATH, then lo = _umul128(idx, cl, &hi),
 *          then VARIANT2_SHUFFLE2;
 *        - al += hi, ah += lo, write al/ah back to the scratchpad;
 *        - al ^= cl, ah ^= ch, idx = al.
 *      At the end of each iteration bx?1 takes the old bx?0 and bx?0 takes cx.
 *   6. Calls cn_implode_scratchpad() for each lane, then keccakf(h, 24).
 *   7. Dispatches through extra_hashes[state[0] & 3] for each lane; lane 0
 *      is given "output", lane 1 is given "output + 32".
 *
 * NOTE(review): l0 and l1 are declared "const uint8_t*" but are written
 * through after casts (the _mm_store_si128 calls and the uint64_t stores).
 * Presumably ctx[n]->memory is a writable buffer; confirm, and consider
 * dropping the const qualifier.
 * NOTE(review): the caller apparently has to provide 2 * size readable bytes
 * at "input" and room for two results at "output" (second one at offset 32);
 * the number of bytes extra_hashes[] writes is not visible here. Verify
 * against the callers.
 */
diff --git a/algo/cryptonight/cryptonight_av2.c b/algo/cryptonight/cryptonight_av2.c index 7e5f4109f..654dd5bcb 100644 --- a/algo/cryptonight/cryptonight_av2.c +++ b/algo/cryptonight/cryptonight_av2.c @@ -204,5 +204,101 @@ void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *res void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + VARIANT2_INIT(0); + VARIANT2_INIT(1); + VARIANT2_SET_ROUNDING_MODE(); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = al0; + uint64_t idx1 = al1; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + + VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); + + VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); + _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); + + idx0 = _mm_cvtsi128_si64(cx0); 
+ idx1 = _mm_cvtsi128_si64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(0, cl, cx0); + lo = _umul128(idx0, cl, &hi); + VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(1, cl, cx1); + lo = _umul128(idx1, cl, &hi); + VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); + + al1 += hi; + ah1 += lo; + + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; + + al1 ^= cl; + ah1 ^= ch; + idx1 = al1; + + bx01 = bx00; + bx11 = bx10; + + bx00 = cx0; + bx10 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); } diff --git a/algo/cryptonight/cryptonight_av4.c b/algo/cryptonight/cryptonight_av4.c index bb4840952..5ff299c12 100644 --- a/algo/cryptonight/cryptonight_av4.c +++ b/algo/cryptonight/cryptonight_av4.c @@ -202,6 +202,103 @@ void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *res } -void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx *restrict ctx) +void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) 
ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + VARIANT2_INIT(0); + VARIANT2_INIT(1); + VARIANT2_SET_ROUNDING_MODE(); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = al0; + uint64_t idx1 = al1; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + cx0 = soft_aesenc(cx0, ax0); + cx1 = soft_aesenc(cx1, ax1); + + VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); + + VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); + _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); + + idx0 = _mm_cvtsi128_si64(cx0); + idx1 = _mm_cvtsi128_si64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(0, cl, cx0); + lo = _umul128(idx0, cl, &hi); + VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(1, cl, cx1); + lo = _umul128(idx1, cl, &hi); + VARIANT2_SHUFFLE2(l1, 
idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); + + al1 += hi; + ah1 += lo; + + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; + + al1 ^= cl; + ah1 ^= ch; + idx1 = al1; + + bx01 = bx00; + bx11 = bx10; + + bx00 = cx0; + bx10 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); }