From 0d1978b8130c14b97d2e5ca66e5bfb975d6e46b4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 20 Oct 2014 22:14:22 +0000 Subject: [PATCH] [X86] Memory folding for commutative instructions (updated) This patch improves support for commutative instructions in the x86 memory folding implementation by attempting to fold a commuted version of the instruction if the original folding fails - if that folding fails as well the instruction is 're-commuted' back to its original order before returning. Updated version of r219584 (reverted in r219595) - the commutation attempt now explicitly ensures that neither of the commuted source operands are tied to the destination operand / register, which was the source of all the regressions that occurred with the original patch attempt. Added additional regression test case provided by Joerg Sonnenberger. Differential Revision: http://reviews.llvm.org/D5818 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220239 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86FastISel.cpp | 3 +- lib/Target/X86/X86InstrInfo.cpp | 155 +++++++++++------- lib/Target/X86/X86InstrInfo.h | 3 +- test/CodeGen/X86/avx1-stack-reload-folding.ll | 16 ++ test/CodeGen/X86/fold-tied-op.ll | 84 ++++++++++ 5 files changed, 202 insertions(+), 59 deletions(-) create mode 100644 test/CodeGen/X86/avx1-stack-reload-folding.ll create mode 100644 test/CodeGen/X86/fold-tied-op.ll diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 70eec08cce2..4c9fef5c3b2 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3337,7 +3337,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, AM.getFullAddress(AddrOps); MachineInstr *Result = - XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); + XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, + Size, Alignment, /*AllowCommute=*/true); if (!Result) return false; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index a7637f9e65d..ff41e48c8f2 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3907,10 +3907,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. -MachineInstr* X86InstrInfo:: -optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, - unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const { +MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) return nullptr; // To be conservative, if there exists another load, clear the load candidate. @@ -3926,55 +3926,35 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, if (!DefMI->isSafeToMove(this, nullptr, SawStore)) return nullptr; - // We try to commute MI if possible. - unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1; - for (unsigned Idx = 0; Idx < IdxEnd; Idx++) { - // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg != FoldAsLoadDefReg) - continue; - // Do not fold if we have a subreg use or a def or multiple uses. - if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) - return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; - } - if (!FoundSrcOperand) return nullptr; - - // Check whether we can fold the def into SrcOperandId. - SmallVector Ops; - Ops.push_back(SrcOperandId); - MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); - if (FoldMI) { - FoldAsLoadDefReg = 0; - return FoldMI; - } - - if (Idx == 1) { - // MI was changed but it didn't help, commute it back! - commuteInstruction(MI, false); + // Collect information about virtual register operands of MI. + unsigned SrcOperandId = 0; + bool FoundSrcOperand = false; + for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != FoldAsLoadDefReg) + continue; + // Do not fold if we have a subreg use or a def or multiple uses. + if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) return nullptr; - } - // Check whether we can commute MI and enable folding. - if (MI->isCommutable()) { - MachineInstr *NewMI = commuteInstruction(MI, false); - // Unable to commute. - if (!NewMI) return nullptr; - if (NewMI != MI) { - // New instruction. It doesn't need to be kept. - NewMI->eraseFromParent(); - return nullptr; - } - } + SrcOperandId = i; + FoundSrcOperand = true; } + if (!FoundSrcOperand) + return nullptr; + + // Check whether we can fold the def into SrcOperandId. + SmallVector Ops; + Ops.push_back(SrcOperandId); + MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); + if (FoldMI) { + FoldAsLoadDefReg = 0; + return FoldMI; + } + return nullptr; } @@ -4134,7 +4114,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl &MOs, - unsigned Size, unsigned Align) const { + unsigned Size, unsigned Align, + bool AllowCommute) const { const DenseMap > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); @@ -4202,8 +4183,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do - // a 32-bit load which is implicitly zero-extended. This likely is due - // to liveintervalanalysis remat'ing a load from stack slot. + // a 32-bit load which is implicitly zero-extended. This likely is + // due to live interval analysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; @@ -4222,8 +4203,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // to a 32-bit one. unsigned DstReg = NewMI->getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) - NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, - X86::sub_32bit)); + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); } @@ -4231,6 +4211,65 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } } + // If the instruction and target operand are commutable, commute the + // instruction and try again. + if (AllowCommute) { + unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2; + if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { + bool HasDef = MI->getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; + unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); + bool Tied0 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + + // If either of the commutable operands are tied to the destination + // then we can not commute + fold. + if ((HasDef && Reg0 == Reg1 && Tied0) || + (HasDef && Reg0 == Reg2 && Tied1)) + return nullptr; + + if ((CommuteOpIdx1 == OriginalOpIdx) || + (CommuteOpIdx2 == OriginalOpIdx)) { + MachineInstr *CommutedMI = commuteInstruction(MI, false); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } + + // Attempt to fold with the commuted version of the instruction. + unsigned CommuteOp = + (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align, + /*AllowCommute=*/false); + if (NewMI) + return NewMI; + + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = commuteInstruction(MI, false); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; + } + } + } + // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; @@ -4440,7 +4479,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, SmallVector MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, + Size, Alignment, /*AllowCommute=*/true); } static bool isPartialRegisterLoad(const MachineInstr &LoadMI, @@ -4593,7 +4633,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, break; } } - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, + /*Size=*/0, Alignment, /*AllowCommute=*/true); } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index ca90976a83a..f3f54ae5664 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -404,7 +404,8 @@ public: MachineInstr* MI, unsigned OpNum, const SmallVectorImpl &MOs, - unsigned Size, unsigned Alignment) const; + unsigned Size, unsigned Alignment, + bool AllowCommute) const; void getUnconditionalBranch(MCInst &Branch, diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll new file mode 100644 index 00000000000..5da1b4c5767 --- /dev/null +++ b/test/CodeGen/X86/avx1-stack-reload-folding.ll @@ -0,0 +1,16 @@ +; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Function Attrs: nounwind readonly uwtable +define <32 x double> @_Z14vstack_foldDv32_dS_(<32 x double> %a, <32 x double> %b) #0 { + %1 = fadd <32 x double> %a, %b + %2 = fsub <32 x double> %a, %b + %3 = fmul <32 x double> %1, %2 + ret <32 x double> %3 + + ;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload + ;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + ;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload +} diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll new file mode 100644 index 00000000000..a643d86e6f7 --- /dev/null +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -0,0 +1,84 @@ +; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s +; Regression test for http://reviews.llvm.org/D5701 + +; ModuleID = 'xxhash.i' +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386--netbsd" + +; CHECK-LABEL: fn1 +; CHECK: shldl {{.*#+}} 4-byte Folded Spill +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: shldl {{.*#+}} 4-byte Folded Spill +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: imull {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: retl + +%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } + +@a = common global i32 0, align 4 +@b = common global i64 0, align 8 + +; Function Attrs: nounwind uwtable +define i64 @fn1() #0 { +entry: + %0 = load i32* @a, align 4, !tbaa !1 + %1 = inttoptr i32 %0 to %struct.XXH_state64_t* + %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0 + %2 = load i32* %total_len, align 4, !tbaa !5 + %tobool = icmp eq i32 %2, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3 + %3 = load i64* %v3, align 4, !tbaa !8 + %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4 + %4 = load i64* %v4, align 4, !tbaa !9 + %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2 + %5 = load i64* %v2, align 4, !tbaa !10 + %shl = shl i64 %5, 1 + %or = or i64 %shl, %5 + %shl2 = shl i64 %3, 2 + %shr = lshr i64 %3, 1 + %or3 = or i64 %shl2, %shr + %add = add i64 %or, %or3 + %mul = mul i64 %4, -4417276706812531889 + %shl4 = mul i64 %4, -8834553413625063778 + %shr5 = ashr i64 %mul, 3 + %or6 = or i64 %shr5, %shl4 + %mul7 = mul nsw i64 %or6, 1400714785074694791 + %xor = xor i64 %add, %mul7 + store i64 %xor, i64* @b, align 8, !tbaa !11 + %mul8 = mul nsw i64 %xor, 1400714785074694791 + br label %if.end + +if.else: ; preds = %entry + %6 = load i64* @b, align 8, !tbaa !11 + %xor10 = xor i64 %6, -4417276706812531889 + %mul11 = mul nsw i64 %xor10, 400714785074694791 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ] + %storemerge = add i64 %storemerge.in, -8796714831421723037 + store i64 %storemerge, i64* @b, align 8, !tbaa !11 + ret i64 undef +} + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"} +!1 = metadata !{metadata !2, metadata !2, i64 0} +!2 = metadata !{metadata !"int", metadata !3, i64 0} +!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} +!4 = metadata !{metadata !"Simple C/C++ TBAA"} +!5 = metadata !{metadata !6, metadata !2, i64 0} +!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24} +!7 = metadata !{metadata !"long long", metadata !3, i64 0} +!8 = metadata !{metadata !6, metadata !7, i64 16} +!9 = metadata !{metadata !6, metadata !7, i64 24} +!10 = metadata !{metadata !6, metadata !7, i64 8} +!11 = metadata !{metadata !7, metadata !7, i64 0} -- 2.34.1