[x86] generalize reassociation optimization in machine combiner to 2 instructions

author Sanjay Patel <spatel@rotateright.com>

Tue, 23 Jun 2015 00:39:40 +0000 (00:39 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 23 Jun 2015 00:39:40 +0000 (00:39 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 23 Jun 2015 00:39:40 +0000 (00:39 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 23 Jun 2015 00:39:40 +0000 (00:39 +0000)
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp

index 5019e8eef19b45cf9e33d44891854a83b913a2ef..66b9fce4e460eb06972bbe7f38a81ad8b74491c7 100644 (file)
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -67,10 +67,11 @@ private:
    unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                        MachineTraceMetrics::Trace BlockTrace);
    bool
-  preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
-                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg);
+                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                           bool NewCodeHasLessInsts);
    bool preservesResourceLen(MachineBasicBlock *MBB,
                              MachineTraceMetrics::Trace BlockTrace,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -208,19 +209,24 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
    return NewRootLatency;
  }
  
-/// True when the new instruction sequence does not
-/// lengthen the critical path. The DAGCombine code sequence ends in MI
-/// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
-/// necessary condition for the new sequence to replace the old sequence is that
-/// it cannot lengthen the critical path. This is decided by the formula
+/// True when the new instruction sequence does not lengthen the critical path
+/// and the new sequence has less instructions or the new sequence improves the
+/// critical path.
+/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
+/// The new code sequence ends in MI NewRoot. A necessary condition for the new
+/// sequence to replace the old sequence is that it cannot lengthen the critical
+/// path. This is decided by the formula:
  /// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)).
-/// The slack is the number of cycles Root can be delayed before the critical
-/// patch becomes longer.
-bool MachineCombiner::preservesCriticalPathLen(
+/// If the new sequence has an equal length critical path but does not reduce
+/// the number of instructions (NewCodeHasLessInsts is false), then it is not
+/// considered an improvement. The slack is the number of cycles Root can be
+/// delayed before the critical patch becomes longer.
+bool MachineCombiner::improvesCriticalPathLen(
      MachineBasicBlock *MBB, MachineInstr *Root,
      MachineTraceMetrics::Trace BlockTrace,
      SmallVectorImpl<MachineInstr *> &InsInstrs,
-    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+    bool NewCodeHasLessInsts) {
  
    assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
    // NewRoot is the last instruction in the \p InsInstrs vector.
@@ -245,9 +251,13 @@ bool MachineCombiner::preservesCriticalPathLen(
          dbgs() << " RootDepth + RootLatency + RootSlack "
                 << RootDepth + RootLatency + RootSlack << "\n";);
  
-  /// True when the new sequence does not lengthen the critical path.
-  return ((NewRootDepth + NewRootLatency) <=
-          (RootDepth + RootLatency + RootSlack));
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+  
+  if (NewCodeHasLessInsts)
+    return NewCycleCount <= OldCycleCount;
+  else
+    return NewCycleCount < OldCycleCount;
  }
  
  /// helper routine to convert instructions into SC
@@ -359,18 +369,21 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
          Traces->verifyAnalysis();
          TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                          InstrIdxForVirtReg);
+        unsigned NewInstCount = InsInstrs.size();
+        unsigned OldInstCount = DelInstrs.size();
          // Found pattern, but did not generate alternative sequence.
          // This can happen e.g. when an immediate could not be materialized
          // in a single instruction.
-        if (!InsInstrs.size())
+        if (!NewInstCount)
            continue;
          // Substitute when we optimize for codesize and the new sequence has
          // fewer instructions OR
          // the new sequence neither lengthens the critical path nor increases
          // resource pressure.
-        if (doSubstitute(InsInstrs.size(), DelInstrs.size()) ||
-            (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                      InstrIdxForVirtReg) &&
+        if (doSubstitute(NewInstCount, OldInstCount) ||
+            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
+                                      InstrIdxForVirtReg,
+                                      NewInstCount < OldInstCount) &&
               preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
            for (auto *InstrPtr : InsInstrs)
              MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 4dee5b72240efece50969fbee0ea3c7b1f53f019..78b127d47141bb9b081dbd70e7d1e3a2b7d6174f 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -6370,22 +6370,11 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel,
    return isHighLatencyDef(DefMI->getOpcode());
  }
  
-/// If the input instruction is part of a chain of dependent ops that are
-/// suitable for reassociation, return the earlier instruction in the sequence
-/// that defines its first operand, otherwise return a nullptr.
-/// If the instruction's operands must be commuted to be considered a
-/// reassociation candidate, Commuted will be set to true.
-static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
-                                        unsigned AssocOpcode,
-                                        bool checkPrevOneUse,
-                                        bool &Commuted) {
-  if (Inst.getOpcode() != AssocOpcode)
-    return nullptr;
-  
-  MachineOperand Op1 = Inst.getOperand(1);
-  MachineOperand Op2 = Inst.getOperand(2);
-  
-  const MachineBasicBlock *MBB = Inst.getParent();
+static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst,
+                                          const MachineBasicBlock *MBB) {
+  assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators");
+  const MachineOperand &Op1 = Inst.getOperand(1);
+  const MachineOperand &Op2 = Inst.getOperand(2);
    const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  
    // We need virtual register definitions.
@@ -6395,80 +6384,99 @@ static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
      MI1 = MRI.getUniqueVRegDef(Op1.getReg());
    if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
      MI2 = MRI.getUniqueVRegDef(Op2.getReg());
-  
+
    // And they need to be in the trace (otherwise, they won't have a depth).
-  if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
-    return nullptr;
-  
-  Commuted = false;
-  if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
+  if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB)
+    return true;
+
+  return false;
+}
+
+static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) {
+  const MachineBasicBlock *MBB = Inst.getParent();
+  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+  MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+  unsigned AssocOpcode = Inst.getOpcode();
+
+  // If only one operand has the same opcode and it's the second source operand,
+  // the operands must be commuted.
+  Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+  if (Commuted)
      std::swap(MI1, MI2);
-    Commuted = true;
-  }
  
-  // Avoid reassociating operands when it won't provide any benefit. If both
-  // operands are produced by instructions of this type, we may already
-  // have the optimal sequence.
-  if (MI2->getOpcode() == AssocOpcode)
-    return nullptr;
-  
-  // The instruction must only be used by the other instruction that we
-  // reassociate with.
-  if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
-    return nullptr;
-  
-  // We must match a simple chain of dependent ops.
-  // TODO: This check is not necessary for the earliest instruction in the
-  // sequence. Instead of a sequence of 3 dependent instructions with the same
-  // opcode, we only need to find a sequence of 2 dependent instructions with
-  // the same opcode plus 1 other instruction that adds to the height of the
-  // trace.
-  if (MI1->getOpcode() != AssocOpcode)
-    return nullptr;
+  // 1. The previous instruction must be the same type as Inst.
+  // 2. The previous instruction must have virtual register definitions for its
+  //    operands in the same basic block as Inst.
+  // 3. The previous instruction's result must only be used by Inst.
+  if (MI1->getOpcode() == AssocOpcode &&
+      hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
+      MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+    return true;
    
-  return MI1;
+  return false;
  }
  
-/// Select a pattern based on how the operands of each associative operation
-/// need to be commuted.
-static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev,
-                                                     bool CommuteRoot) {
-  if (CommutePrev) {
-    if (CommuteRoot)
-      return MachineCombinerPattern::MC_REASSOC_XA_YB;
-    return MachineCombinerPattern::MC_REASSOC_XA_BY;
-  } else {
-    if (CommuteRoot)
-      return MachineCombinerPattern::MC_REASSOC_AX_YB;
-    return MachineCombinerPattern::MC_REASSOC_AX_BY;
-  }
+/// Return true if the input instruction is part of a chain of dependent ops
+/// that are suitable for reassociation, otherwise return false.
+/// If the instruction's operands must be commuted to have a previous
+/// instruction of the same type define the first source operand, Commuted will
+/// be set to true.
+static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode,
+                               bool &Commuted) {
+  // 1. The instruction must have the correct type.
+  // 2. The instruction must have virtual register definitions for its
+  //    operands in the same basic block.
+  // 3. The instruction must have a reassociatable sibling.
+  if (Inst.getOpcode() == AssocOpcode &&
+      hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) &&
+      hasReassocSibling(Inst, Commuted))
+    return true;
+
+  return false;
  }
  
+// FIXME: This has the potential to be expensive (compile time) while not
+// improving the code at all. Some ways to limit the overhead:
+// 1. Track successful transforms; bail out if hit rate gets too low.
+// 2. Only enable at -O3 or some other non-default optimization level.
+// 3. Pre-screen pattern candidates here: if an operand of the previous
+//    instruction is known to not increase the critical path, then don't match
+//    that pattern.
  bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
          SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
    if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
      return false;
  
+  // TODO: There is nothing x86-specific here except the instruction type.
+  // This logic could be hoisted into the machine combiner pass itself.
+
+  // Look for this reassociation pattern:
+  //   B = A op X (Prev)
+  //   C = B op Y (Root)
+
    // TODO: There are many more associative instruction types to match:
    //       1. Other forms of scalar FP add (non-AVX)
    //       2. Other data types (double, integer, vectors)
    //       3. Other math / logic operations (mul, and, or)
    unsigned AssocOpcode = X86::VADDSSrr;
  
-  // TODO: There is nothing x86-specific here except the instruction type.
-  // This logic could be hoisted into the machine combiner pass itself.
-  bool CommuteRoot;
-  if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true,
-                                              CommuteRoot)) {
-    bool CommutePrev;
-    if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) {
-      // We found a sequence of instructions that may be suitable for a
-      // reassociation of operands to increase ILP.
-      Patterns.push_back(getPattern(CommutePrev, CommuteRoot));
-      return true;
+  bool Commute = false;
+  if (isReassocCandidate(Root, AssocOpcode, Commute)) {
+    // We found a sequence of instructions that may be suitable for a
+    // reassociation of operands to increase ILP. Specify each commutation
+    // possibility for the Prev instruction in the sequence and let the
+    // machine combiner decide if changing the operands is worthwhile.
+    if (Commute) {
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB);
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB);
+    } else {
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY);
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY);
      }
+    return true;
    }
-  
+
    return false;
  }
  
@@ -6561,14 +6569,16 @@ void X86InstrInfo::genAlternativeCodeSequence(
  
    // Select the previous instruction in the sequence based on the input pattern.
    MachineInstr *Prev = nullptr;
-  if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY ||
-      Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY)
-    Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
-  else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB ||
-           Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB)
-    Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
-  else
-    llvm_unreachable("Unknown pattern for machine combiner");
+  switch (Pattern) {
+    case MachineCombinerPattern::MC_REASSOC_AX_BY:
+    case MachineCombinerPattern::MC_REASSOC_XA_BY:
+      Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+      break;
+    case MachineCombinerPattern::MC_REASSOC_AX_YB:
+    case MachineCombinerPattern::MC_REASSOC_XA_YB:
+      Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+  }
+  assert(Prev && "Unknown pattern for machine combiner");
    
    reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
    return;
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll

index 4f503af716a80baf98e99827a00c23bf47f19105..27af5738ca3e86ace9d8153813cb3ee694838c9e 100644 (file)
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -114,81 +114,3 @@ define float @test11(float %a) {
    ret float %t2
  }
  
-; Verify that the first two adds are independent regardless of how the inputs are 
-; commuted. The destination registers are used as source registers for the third add.
-
-define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %t1, %x3
-  ret float %t2
-}
-
-define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %t1, %x3
-  ret float %t2
-}
-
-define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %x3, %t1
-  ret float %t2
-}
-
-define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %x3, %t1
-  ret float %t2
-}
-
-; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
-; produced because that would cost more compile time.
-
-define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm7, %xmm6, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %t1, %x3
-  %t3 = fadd float %t2, %x4
-  %t4 = fadd float %t3, %x5
-  %t5 = fadd float %t4, %x6
-  %t6 = fadd float %t5, %x7
-  ret float %t6
-}
diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll

new file mode 100644 (file)

index 0000000..d4cd59f
--- /dev/null
+++ b/test/CodeGen/X86/machine-combiner.ll
@@ -0,0 +1,99 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
+; CHECK-NEXT:    vaddss %xmm6, %xmm1, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm7, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  %t3 = fadd float %t2, %x4
+  %t4 = fadd float %t3, %x5
+  %t5 = fadd float %t4, %x6
+  %t6 = fadd float %t5, %x7
+  ret float %t6
+}
+
+; Verify that we only need two associative operations to reassociate the operands.
+; Also, we should reassociate such that the result of the high latency division
+; is used by the final 'add' rather than reassociating the %x3 operand with the
+; division. The latter reassociation would not improve anything.
+ 
+define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fdiv float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 23 Jun 2015 00:39:40 +0000 (00:39 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 23 Jun 2015 00:39:40 +0000 (00:39 +0000)
lib/CodeGen/MachineCombiner.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
test/CodeGen/X86/fp-fast.ll		patch \| blob \| history
test/CodeGen/X86/machine-combiner.ll	[new file with mode: 0644]	patch \| blob