- << " .quad " << &JCBM << "\n"
- << ResolverBlockName << ":\n";
-
- uint64_t ReturnAddrOffset = saveX86Regs(AsmStream);
-
- // Compute index, load object address, and call JIT.
- AsmStream << " leaq jit_callback_manager_addr(%rip), %rdi\n"
- << " movq (%rdi), %rdi\n"
- << " movq " << ReturnAddrOffset << "(%rsp), %rsi\n"
- << " movabsq $" << CallbackAddr << ", %rax\n"
- << " callq *%rax\n"
- << " movq %rax, " << ReturnAddrOffset << "(%rsp)\n";
-
- restoreX86Regs(AsmStream);
-
- AsmStream << " retq\n";
+ << " .quad " << &JCBM << "\n";
+
+ // Start the resolver function.
+ AsmStream << ResolverBlockName << ":\n"
+ << " pushq %rbp\n"
+ << " movq %rsp, %rbp\n";
+
+ // Store the GPRs.
+ for (const auto &GPR : GPRs)
+ AsmStream << " pushq %" << GPR << "\n";
+
+ // Store floating-point state with FXSAVE.
+ // Note: We need to keep the stack 16-byte aligned, so if we've emitted an odd
+ // number of 64-bit pushes so far (GPRs.size() plus 1 for RBP) then add
+ // an extra 64 bits of padding to the FXSave area.
+ unsigned Padding = (GPRs.size() + 1) % 2 ? 8 : 0;
+ unsigned FXSaveSize = 512 + Padding;
+ AsmStream << " subq $" << FXSaveSize << ", %rsp\n"
+ << " fxsave64 (%rsp)\n"
+
+ // Load callback manager address, compute trampoline address, call JIT.
+ << " lea jit_callback_manager_addr(%rip), %rdi\n"
+ << " movq (%rdi), %rdi\n"
+ << " movq 0x8(%rbp), %rsi\n"
+ << " subq $" << X86_64_TrampolineLength << ", %rsi\n"
+ << " movabsq $" << CallbackAddr << ", %rax\n"
+ << " callq *%rax\n"
+
+ // Replace the return to the trampoline with the return address of the
+ // compiled function body.
+ << " movq %rax, 0x8(%rbp)\n"
+
+ // Restore the floating point state.
+ << " fxrstor64 (%rsp)\n"
+ << " addq $" << FXSaveSize << ", %rsp\n";
+
+ for (const auto &GPR : make_range(GPRs.rbegin(), GPRs.rend()))
+ AsmStream << " popq %" << GPR << "\n";
+
+ // Restore original RBP and return to compiled function body.
+ AsmStream << " popq %rbp\n"
+ << " retq\n";