crypto: portable AES key expansion without allocations

Calling `oaes_alloc`, then `oaes_key_import_data`, then `oaes_free` results in four calls each to `calloc` and `free`. This PR rewrites the key expansion function to expand directly into a caller-provided 240-byte buffer with no allocations. This should speed up CryptoNight on platforms that use the software AES path, such as the Raspberry Pi. The hash tests are extended to also exercise the slow hash using software AES on all platforms.

Depends on #9505
This commit is contained in:
jeffro256 2024-10-07 12:57:23 -05:00
parent 0f9a5c1020
commit e2109447bb
No known key found for this signature in database
GPG key ID: 6F79797A6E392442
4 changed files with 51 additions and 295 deletions

View file

@ -27,25 +27,11 @@
* POSSIBILITY OF SUCH DAMAGE.
* ---------------------------------------------------------------------------
*/
#include <assert.h>
#include <stddef.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
// OS X, FreeBSD, OpenBSD and NetBSD don't need malloc.h
#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__OpenBSD__) \
&& !defined(__DragonFly__) && !defined(__NetBSD__)
#include <malloc.h>
#endif
// ANDROID, FreeBSD, OpenBSD and NetBSD also don't need timeb.h
#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__ANDROID__) \
&& !defined(__NetBSD__)
#include <sys/timeb.h>
#else
#include <sys/time.h>
#endif
#ifdef WIN32
#include <process.h>
@ -58,7 +44,6 @@
#define OAES_RKEY_LEN 4
#define OAES_COL_LEN 4
#define OAES_ROUND_BASE 7
static uint8_t oaes_gf_8[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
@ -83,201 +68,68 @@ static uint8_t oaes_sub_byte_value[16][16] = {
/*f*/ { 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 },
};
static OAES_RET oaes_sub_byte( uint8_t * byte )
static void oaes_sub_byte( uint8_t * byte )
{
size_t _x, _y;
if( NULL == byte )
return OAES_RET_ARG1;
_x = _y = *byte;
_x &= 0x0f;
_y &= 0xf0;
_y >>= 4;
*byte = oaes_sub_byte_value[_y][_x];
return OAES_RET_SUCCESS;
}
static OAES_RET oaes_word_rot_left( uint8_t word[OAES_COL_LEN] )
static void oaes_word_rot_left( uint8_t word[OAES_COL_LEN] )
{
uint8_t _temp[OAES_COL_LEN];
if( NULL == word )
return OAES_RET_ARG1;
memcpy( _temp, word + 1, OAES_COL_LEN - 1 );
_temp[OAES_COL_LEN - 1] = word[0];
memcpy( word, _temp, OAES_COL_LEN );
return OAES_RET_SUCCESS;
}
static OAES_RET oaes_key_destroy( oaes_key ** key )
{
if( NULL == *key )
return OAES_RET_SUCCESS;
if( (*key)->data )
{
free( (*key)->data );
(*key)->data = NULL;
}
if( (*key)->exp_data )
{
free( (*key)->exp_data );
(*key)->exp_data = NULL;
}
(*key)->data_len = 0;
(*key)->exp_data_len = 0;
(*key)->num_keys = 0;
(*key)->key_base = 0;
free( *key );
*key = NULL;
return OAES_RET_SUCCESS;
}
static OAES_RET oaes_key_expand( OAES_CTX * ctx )
void aes_expand_256key_portable(const uint8_t data[32], uint8_t expanded_key_out[240])
{
size_t _i, _j;
oaes_ctx * _ctx = (oaes_ctx *) ctx;
if( NULL == _ctx )
return OAES_RET_ARG1;
if( NULL == _ctx->key )
return OAES_RET_NOKEY;
_ctx->key->key_base = _ctx->key->data_len / OAES_RKEY_LEN;
_ctx->key->num_keys = _ctx->key->key_base + OAES_ROUND_BASE;
_ctx->key->exp_data_len = _ctx->key->num_keys * OAES_RKEY_LEN * OAES_COL_LEN;
_ctx->key->exp_data = (uint8_t *)
calloc( _ctx->key->exp_data_len, sizeof( uint8_t ));
if( NULL == _ctx->key->exp_data )
return OAES_RET_MEM;
// the first _ctx->key->data_len are a direct copy
memcpy( _ctx->key->exp_data, _ctx->key->data, _ctx->key->data_len );
assert( data != NULL );
assert( expanded_key_out != NULL );
// the first 32 bytes are a direct copy
memcpy( expanded_key_out, data, 32 );
// the rest are set to 0
memset( expanded_key_out + 32, 0, 208 );
// apply ExpandKey algorithm for remainder
for( _i = _ctx->key->key_base; _i < _ctx->key->num_keys * OAES_RKEY_LEN; _i++ )
for( _i = 8; _i < 15 * OAES_RKEY_LEN; _i++ )
{
uint8_t _temp[OAES_COL_LEN];
memcpy( _temp,
_ctx->key->exp_data + ( _i - 1 ) * OAES_RKEY_LEN, OAES_COL_LEN );
expanded_key_out + ( _i - 1 ) * OAES_RKEY_LEN, OAES_COL_LEN );
// transform key column
if( 0 == _i % _ctx->key->key_base )
if( 0 == _i % 8 )
{
oaes_word_rot_left( _temp );
for( _j = 0; _j < OAES_COL_LEN; _j++ )
oaes_sub_byte( _temp + _j );
_temp[0] = _temp[0] ^ oaes_gf_8[ _i / _ctx->key->key_base - 1 ];
_temp[0] = _temp[0] ^ oaes_gf_8[ _i / 8 - 1 ];
}
else if( _ctx->key->key_base > 6 && 4 == _i % _ctx->key->key_base )
else if( 4 == _i % 8 )
{
for( _j = 0; _j < OAES_COL_LEN; _j++ )
oaes_sub_byte( _temp + _j );
}
for( _j = 0; _j < OAES_COL_LEN; _j++ )
{
_ctx->key->exp_data[ _i * OAES_RKEY_LEN + _j ] =
_ctx->key->exp_data[ ( _i - _ctx->key->key_base ) *
expanded_key_out[ _i * OAES_RKEY_LEN + _j ] =
expanded_key_out[ ( _i - 8 ) *
OAES_RKEY_LEN + _j ] ^ _temp[_j];
}
}
return OAES_RET_SUCCESS;
}
OAES_RET oaes_key_import_data( OAES_CTX * ctx,
const uint8_t * data, size_t data_len )
{
oaes_ctx * _ctx = (oaes_ctx *) ctx;
OAES_RET _rc = OAES_RET_SUCCESS;
if( NULL == _ctx )
return OAES_RET_ARG1;
if( NULL == data )
return OAES_RET_ARG2;
switch( data_len )
{
case 16:
case 24:
case 32:
break;
default:
return OAES_RET_ARG3;
}
if( _ctx->key )
oaes_key_destroy( &(_ctx->key) );
_ctx->key = (oaes_key *) calloc( sizeof( oaes_key ), 1 );
if( NULL == _ctx->key )
return OAES_RET_MEM;
_ctx->key->data_len = data_len;
_ctx->key->data = (uint8_t *)
calloc( data_len, sizeof( uint8_t ));
if( NULL == _ctx->key->data )
{
oaes_key_destroy( &(_ctx->key) );
return OAES_RET_MEM;
}
memcpy( _ctx->key->data, data, data_len );
_rc = _rc || oaes_key_expand( ctx );
if( _rc != OAES_RET_SUCCESS )
{
oaes_key_destroy( &(_ctx->key) );
return _rc;
}
return OAES_RET_SUCCESS;
}
OAES_CTX * oaes_alloc(void)
{
oaes_ctx * _ctx = (oaes_ctx *) calloc( sizeof( oaes_ctx ), 1 );
if( NULL == _ctx )
return NULL;
_ctx->key = NULL;
return (OAES_CTX *) _ctx;
}
OAES_RET oaes_free( OAES_CTX ** ctx )
{
oaes_ctx ** _ctx = (oaes_ctx **) ctx;
if( NULL == _ctx )
return OAES_RET_ARG1;
if( NULL == *_ctx )
return OAES_RET_SUCCESS;
if( (*_ctx)->key )
oaes_key_destroy( &((*_ctx)->key) );
free( *_ctx );
*_ctx = NULL;
return OAES_RET_SUCCESS;
}

View file

@ -32,106 +32,12 @@
#define _OAES_LIB_H
#include <stdint.h>
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _WIN32
# ifdef OAES_SHARED
# ifdef oaes_lib_EXPORTS
# define OAES_API __declspec(dllexport)
# else
# define OAES_API __declspec(dllimport)
# endif
# else
# define OAES_API
# endif
#else
# define OAES_API
#endif // WIN32
#define OAES_VERSION "0.8.1"
#define OAES_BLOCK_SIZE 16
typedef void OAES_CTX;
typedef enum
{
OAES_RET_FIRST = 0,
OAES_RET_SUCCESS = 0,
OAES_RET_UNKNOWN,
OAES_RET_ARG1,
OAES_RET_ARG2,
OAES_RET_ARG3,
OAES_RET_ARG4,
OAES_RET_ARG5,
OAES_RET_NOKEY,
OAES_RET_MEM,
OAES_RET_BUF,
OAES_RET_HEADER,
OAES_RET_COUNT
} OAES_RET;
typedef uint16_t OAES_OPTION;
typedef struct _oaes_key
{
size_t data_len;
uint8_t *data;
size_t exp_data_len;
uint8_t *exp_data;
size_t num_keys;
size_t key_base;
} oaes_key;
typedef struct _oaes_ctx
{
oaes_key * key;
} oaes_ctx;
/*
* // usage:
*
* OAES_CTX * ctx = oaes_alloc();
* .
* .
* .
* {
* oaes_gen_key_xxx( ctx );
* {
* oaes_key_export( ctx, _buf, &_buf_len );
* // or
* oaes_key_export_data( ctx, _buf, &_buf_len );\
* }
* }
* // or
* {
* oaes_key_import( ctx, _buf, _buf_len );
* // or
* oaes_key_import_data( ctx, _buf, _buf_len );
* }
* .
* .
* .
* oaes_encrypt( ctx, m, m_len, c, &c_len );
* .
* .
* .
* oaes_decrypt( ctx, c, c_len, m, &m_len );
* .
* .
* .
* oaes_free( &ctx );
*/
OAES_API OAES_CTX * oaes_alloc(void);
OAES_API OAES_RET oaes_free( OAES_CTX ** ctx );
// directly import data into key
OAES_API OAES_RET oaes_key_import_data( OAES_CTX * ctx,
const uint8_t * data, size_t data_len );
void aes_expand_256key_portable(const uint8_t data[32], uint8_t expanded_key_out[240]);
#ifdef __cplusplus
}

View file

@ -31,6 +31,7 @@
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
@ -885,7 +886,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
size_t i, j;
uint64_t *p = NULL;
oaes_ctx *aes_ctx = NULL;
int useAes = !force_software_aes() && check_aes_hw();
static void (*const extra_hashes[4])(const void *, size_t, char *) =
@ -927,12 +927,11 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
else
{
aes_ctx = (oaes_ctx *) oaes_alloc();
oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);
aes_expand_256key_portable(state.hs.b, expandedKey);
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
{
for(j = 0; j < INIT_SIZE_BLK; j++)
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
}
@ -987,16 +986,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
else
{
oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
aes_expand_256key_portable(&state.hs.b[32], expandedKey);
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
{
for(j = 0; j < INIT_SIZE_BLK; j++)
{
xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
}
}
oaes_free((OAES_CTX **) &aes_ctx);
}
/* CryptoNight Step 5: Apply Keccak to the state again, and then
@ -1322,7 +1320,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
size_t i, j;
uint64_t *p = NULL;
oaes_ctx *aes_ctx = NULL;
int useAes = !force_software_aes() && check_aes_hw();
static void (*const extra_hashes[4])(const void *, size_t, char *) =
@ -1366,12 +1363,11 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
else
{
aes_ctx = (oaes_ctx *) oaes_alloc();
oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);
aes_expand_256key_portable(state.hs.b, expandedKey);
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
{
for(j = 0; j < INIT_SIZE_BLK; j++)
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
}
@ -1429,16 +1425,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
else
{
oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
aes_expand_256key_portable(&state.hs.b[32], expandedKey);
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
{
for(j = 0; j < INIT_SIZE_BLK; j++)
{
xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
}
}
oaes_free((OAES_CTX **) &aes_ctx);
}
/* CryptoNight Step 5: Apply Keccak to the state again, and then
@ -1584,7 +1579,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
size_t i, j;
uint8_t *p = NULL;
oaes_ctx *aes_ctx;
static void (*const extra_hashes[4])(const void *, size_t, char *) =
{
hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
@ -1603,15 +1597,13 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
memcpy(text, state.init, INIT_SIZE_BYTE);
aes_ctx = (oaes_ctx *) oaes_alloc();
oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);
aes_expand_256key_portable(state.hs.b, expandedKey);
VARIANT1_INIT64();
VARIANT2_INIT64();
VARIANT4_RANDOM_MATH_INIT();
// use aligned data
memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len);
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
{
for(j = 0; j < INIT_SIZE_BLK; j++)
@ -1664,8 +1656,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
memcpy(text, state.init, INIT_SIZE_BYTE);
oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len);
aes_expand_256key_portable(&state.hs.b[32], expandedKey);
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
{
for(j = 0; j < INIT_SIZE_BLK; j++)
@ -1675,7 +1666,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
}
oaes_free((OAES_CTX **) &aes_ctx);
memcpy(state.init, text, INIT_SIZE_BYTE);
hash_permutation(&state.hs);
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
@ -1790,7 +1780,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
uint8_t d[AES_BLOCK_SIZE];
size_t i, j;
uint8_t aes_key[AES_KEY_SIZE];
oaes_ctx *aes_ctx;
uint8_t expandedKey[240];
if (prehashed) {
memcpy(&state.hs, data, length);
@ -1799,16 +1789,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
memcpy(text, state.init, INIT_SIZE_BYTE);
memcpy(aes_key, state.hs.b, AES_KEY_SIZE);
aes_ctx = (oaes_ctx *) oaes_alloc();
VARIANT1_PORTABLE_INIT();
VARIANT2_PORTABLE_INIT();
VARIANT4_RANDOM_MATH_INIT();
oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE);
aes_expand_256key_portable(aes_key, expandedKey);
for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
for (j = 0; j < INIT_SIZE_BLK; j++) {
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
}
memcpy(&long_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
}
@ -1854,18 +1843,17 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
memcpy(text, state.init, INIT_SIZE_BYTE);
oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
aes_expand_256key_portable(&state.hs.b[32], expandedKey);
for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
for (j = 0; j < INIT_SIZE_BLK; j++) {
xor_blocks(&text[j * AES_BLOCK_SIZE], &long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
}
}
memcpy(state.init, text, INIT_SIZE_BYTE);
hash_permutation(&state.hs);
/*memcpy(hash, &state, 32);*/
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
oaes_free((OAES_CTX **) &aes_ctx);
#ifdef FORCE_USE_HEAP
free(long_state);

View file

@ -49,6 +49,16 @@ foreach (hash IN ITEMS fast slow slow-1 slow-2 slow-4 tree extra-blake extra-gro
COMMAND hash-tests "${hash}" "${CMAKE_CURRENT_SOURCE_DIR}/tests-${hash}.txt")
endforeach ()
# Additionally test cn_slow_hash() using software AES
foreach (hash IN ITEMS slow slow-1 slow-2 slow-4)
add_test(
NAME "hash-${hash}-soft-aes"
COMMAND hash-tests "${hash}" "${CMAKE_CURRENT_SOURCE_DIR}/tests-${hash}.txt")
set_tests_properties(
"hash-${hash}-soft-aes"
PROPERTIES ENVIRONMENT "MONERO_USE_SOFTWARE_AES=1")
endforeach ()
add_test(
NAME "hash-variant2-int-sqrt"
COMMAND hash-tests "variant2_int_sqrt")