Add vzeroupper for processors with AVX

To avoid false dependencies on the upper 128 bits of YMM registers.
SChernykh 2019-12-18 09:12:25 +01:00
parent 59e8fdb9ed
commit 7459677fd5
3 changed files with 12 additions and 0 deletions
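Background on the rationale (not part of the commit text): when 256-bit AVX instructions leave the upper halves of the YMM registers "dirty", subsequent legacy (non-VEX) SSE instructions can pick up a false dependency on those upper 128 bits, or pay an AVX/SSE state-transition penalty. Executing vzeroupper after the AVX section clears the upper halves and removes that dependency. A minimal C++ sketch of the same idea using the standard _mm256_zeroupper() intrinsic (the function names are made up for illustration):

#include <immintrin.h>

// Illustrative only: an AVX section followed by legacy-SSE code.
static void avx_sum(float* dst, const float* a, const float* b) {
    __m256 v = _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b));
    _mm256_storeu_ps(dst, v);
    _mm256_zeroupper();  // emits vzeroupper: upper YMM halves are cleared, so the
                         // SSE code below carries no false dependency on them
}

static float sse_first(const float* dst) {
    __m128 v = _mm_loadu_ps(dst);  // legacy (non-VEX) SSE encoding
    return _mm_cvtss_f32(v);
}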


@@ -289,6 +289,11 @@ namespace randomx {
 	JitCompilerX86::JitCompilerX86() {
 		applyTweaks();
+		int32_t info[4];
+		cpuid(1, info);
+		hasAVX = (info[2] & (1 << 28)) != 0;
 		allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
 		// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
 		code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
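The constructor change probes CPUID leaf 1 and reads ECX bit 28, the AVX feature flag. A rough standalone equivalent is sketched below; cpuid_leaf() is a stand-in for the project's own cpuid() wrapper, and a fully strict check would additionally test the OSXSAVE bit and XGETBV to confirm the OS preserves YMM state:

#include <cstdint>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// Stand-in for the project's cpuid() helper used in the diff above.
static void cpuid_leaf(uint32_t leaf, uint32_t out[4]) {
#ifdef _MSC_VER
    __cpuid(reinterpret_cast<int*>(out), static_cast<int>(leaf));
#else
    __get_cpuid(leaf, &out[0], &out[1], &out[2], &out[3]);
#endif
}

static bool cpuHasAVX() {
    uint32_t info[4] = {};
    cpuid_leaf(1, info);                 // leaf 1: processor feature flags
    return (info[2] & (1u << 28)) != 0;  // ECX bit 28 = AVX
}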
@@ -374,6 +379,9 @@
 		code[codePos + 5] = 0xc0 + pcfg.readReg1;
 		*(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
 		*(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
+		if (hasAVX) {
+			*(uint32_t*)(code + codePos + 29) = 0xE977F8C5;
+		}
 		codePos = prologueSize;
 		memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
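The magic constant appears to be the vzeroupper patch itself: stored little-endian at codePos + 29, 0xE977F8C5 produces the bytes C5 F8 77 (vzeroupper) followed by E9, the opcode of the jmp rel32 already present at that point of the prologue, so the jump's displacement bytes stay untouched. A tiny illustrative check of the byte order:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const uint32_t patch = 0xE977F8C5;
    uint8_t bytes[4];
    std::memcpy(bytes, &patch, sizeof(patch));  // little-endian store, as on x86
    // bytes == { 0xC5, 0xF8, 0x77, 0xE9 }:
    //   C5 F8 77  vzeroupper
    //   E9        jmp rel32 opcode (its displacement is not rewritten)
    std::printf("%02X %02X %02X %02X\n", bytes[0], bytes[1], bytes[2], bytes[3]);
    return 0;
}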


@@ -73,6 +73,7 @@ namespace randomx {
 		uint32_t vm_flags;
 		static bool BranchesWithin32B;
+		bool hasAVX;
 		static void applyTweaks();
 		void generateProgramPrologue(Program&, ProgramConfiguration&);


@@ -82,6 +82,9 @@ randomx_program_prologue_first_load PROC
 	ror rdx, 32
 	and edx, RANDOMX_SCRATCHPAD_MASK
 	stmxcsr dword ptr [rsp-20]
+	nop
+	nop
+	nop
 	jmp randomx_program_loop_begin
 randomx_program_prologue_first_load ENDP
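By all appearances, the three nops added here are placeholders for that patch: on AVX-capable CPUs the JIT overwrites them (plus the existing jmp opcode byte, rewritten with the same value) with vzeroupper, while CPUs without AVX simply execute the nops.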