[X86] Simplify X87 stackifier pass.

author Akira Hatanaka <ahatanaka@apple.com>

Fri, 1 Aug 2014 22:19:41 +0000 (22:19 +0000)

committer Akira Hatanaka <ahatanaka@apple.com>

Fri, 1 Aug 2014 22:19:41 +0000 (22:19 +0000)
author Akira Hatanaka <ahatanaka@apple.com>
Fri, 1 Aug 2014 22:19:41 +0000 (22:19 +0000)
committer Akira Hatanaka <ahatanaka@apple.com>
Fri, 1 Aug 2014 22:19:41 +0000 (22:19 +0000)
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td

index 86c01bd6464706df074d35da807e01afa89a440d..1f3cde828b6871f772fcdba1207da60dda0a9e18 100644 (file)
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -59,20 +59,20 @@ def RetCC_X86Common : CallingConv<[
    // MM0, it doesn't support these vector types.
    CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
  
-  // Long double types are always returned in ST0 (even with SSE).
-  CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+  // Long double types are always returned in FP0 (even with SSE).
+  CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
  ]>;
  
  // X86-32 C return-value convention.
  def RetCC_X86_32_C : CallingConv<[
-  // The X86-32 calling convention returns FP values in ST0, unless marked
+  // The X86-32 calling convention returns FP values in FP0, unless marked
    // with "inreg" (used here to distinguish one kind of reg from another,
    // weirdly; this is really the sse-regparm calling convention) in which
    // case they use XMM0, otherwise it is the same as the common X86 calling
    // conv.
    CCIfInReg<CCIfSubtarget<"hasSSE2()",
      CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
-  CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+  CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
    CCDelegateTo<RetCC_X86Common>
  ]>;
  
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp

index 0938eda9dadd9bee32e5ef3d688942d74bdec37d..5b2008323089e9ad572a8dd25993f2af99dcf565 100644 (file)
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1017,7 +1017,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
  
      // The calling-convention tables for x87 returns don't tell
      // the whole story.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
  
      unsigned SrcReg = Reg + VA.getValNo();
@@ -2989,39 +2989,33 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) {
        report_fatal_error("SSE register return with SSE disabled");
      }
  
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a COPY is not good enough - the copy instruction may be eliminated if the
-    // return value is not used. We use the FpPOP_RETVAL instruction instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) {
-        CopyVT = MVT::f80;
-        CopyReg = createResultReg(&X86::RFP80RegClass);
-      }
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(X86::FpPOP_RETVAL), CopyReg);
-
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register. This is accomplished by storing the f80 value in memory
-      // and then loading it back.
-      if (CopyVT != VA.getValVT()) {
-        EVT ResVT = VA.getValVT();
-        unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
-        unsigned MemSize = ResVT.getSizeInBits()/8;
-        int FI = MFI.CreateStackObject(MemSize, MemSize, false);
-        addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                  TII.get(Opc)), FI)
-          .addReg(CopyReg);
-        Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
-        addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                  TII.get(Opc), ResultReg + i), FI);
-      }
-    } else {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
-      InRegs.push_back(VA.getLocReg());
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      CopyVT = MVT::f80;
+      CopyReg = createResultReg(&X86::RFP80RegClass);
+    }
+
+    // Copy out the result.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+    InRegs.push_back(VA.getLocReg());
+
+    // Round the f80 to the right size, which also moves it to the appropriate
+    // xmm register. This is accomplished by storing the f80 value in memory
+    // and then loading it back.
+    if (CopyVT != VA.getValVT()) {
+      EVT ResVT = VA.getValVT();
+      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+      unsigned MemSize = ResVT.getSizeInBits()/8;
+      int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(Opc)), FI)
+        .addReg(CopyReg);
+      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(Opc), ResultReg + i), FI);
      }
    }
  
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp

index c8a3ab3082ad895bcd534ec69da122cfe13e969f..e74613b949868d53744e2eea638cd1b001632472 100644 (file)
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -25,15 +25,18 @@
  
  #include "X86.h"
  #include "X86InstrInfo.h"
+#include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/DepthFirstIterator.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/CodeGen/EdgeBundles.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
  #include "llvm/CodeGen/Passes.h"
  #include "llvm/IR/InlineAsm.h"
  #include "llvm/Support/Debug.h"
@@ -50,6 +53,8 @@ STATISTIC(NumFXCH, "Number of fxch instructions inserted");
  STATISTIC(NumFP  , "Number of floating point instructions");
  
  namespace {
+  const unsigned ScratchFPReg = 7;
+
    struct FPS : public MachineFunctionPass {
      static char ID;
      FPS() : MachineFunctionPass(ID) {
@@ -137,7 +142,7 @@ namespace {
      unsigned StackTop;          // The current top of the FP stack.
  
      enum {
-      NumFPRegs = 16            // Including scratch pseudo-registers.
+      NumFPRegs = 8             // Including scratch pseudo-registers.
      };
  
      // For each live FP<n> register, point to its Stack[] entry.
@@ -146,27 +151,6 @@ namespace {
      // register allocator thinks.
      unsigned RegMap[NumFPRegs];
  
-    // Pending fixed registers - Inline assembly needs FP registers to appear
-    // in fixed stack slot positions. This is handled by copying FP registers
-    // to ST registers before the instruction, and copying back after the
-    // instruction.
-    //
-    // This is modeled with pending ST registers. NumPendingSTs is the number
-    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
-    // register that holds the ST value. The ST registers are not moved into
-    // place until immediately before the instruction that needs them.
-    //
-    // It can happen that we need an ST register to be live when no FP register
-    // holds the value:
-    //
-    //   %ST0 = COPY %FP4<kill>
-    //
-    // When that happens, we allocate a scratch FP register to hold the ST
-    // value. That means every register in PendingST must be live.
-
-    unsigned NumPendingSTs;
-    unsigned char PendingST[8];
-
      // Set up our stack model to match the incoming registers to MBB.
      void setupBlockStack();
  
@@ -180,9 +164,6 @@ namespace {
          dbgs() << " FP" << Stack[i];
          assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
        }
-      for (unsigned i = 0; i != NumPendingSTs; ++i)
-        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
-      dbgs() << "\n";
      }
  #endif
  
@@ -199,19 +180,6 @@ namespace {
        return Slot < StackTop && Stack[Slot] == RegNo;
      }
  
-    /// getScratchReg - Return an FP register that is not currently in use.
-    unsigned getScratchReg() const {
-      for (int i = NumFPRegs - 1; i >= 8; --i)
-        if (!isLive(i))
-          return i;
-      llvm_unreachable("Ran out of scratch FP registers");
-    }
-
-    /// isScratchReg - Returns trus if RegNo is a scratch FP register.
-    static bool isScratchReg(unsigned RegNo) {
-      return RegNo > 8 && RegNo < NumFPRegs;
-    }
-
      /// getStackEntry - Return the X86::FP<n> register in register ST(i).
      unsigned getStackEntry(unsigned STi) const {
        if (STi >= StackTop)
@@ -263,21 +231,6 @@ namespace {
        BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
      }
  
-    /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
-    /// RegNo. If any PendingST registers still need the RegNo value, duplicate
-    /// them to new scratch registers.
-    void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
-      for (unsigned i = 0; i != NumPendingSTs; ++i) {
-        if (PendingST[i] != RegNo)
-          continue;
-        unsigned SR = getScratchReg();
-        DEBUG(dbgs() << "Duplicating pending ST" << i
-                     << " in FP" << RegNo << " to FP" << SR << '\n');
-        duplicateToTop(RegNo, SR, I);
-        PendingST[i] = SR;
-      }
-    }
-
      /// popStackAfter - Pop the current value off of the top of the FP stack
      /// after the specified instruction.
      void popStackAfter(MachineBasicBlock::iterator &I);
@@ -304,6 +257,7 @@ namespace {
  
      bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
  
+    void handleCall(MachineBasicBlock::iterator &I);
      void handleZeroArgFP(MachineBasicBlock::iterator &I);
      void handleOneArgFP(MachineBasicBlock::iterator &I);
      void handleOneArgFPRW(MachineBasicBlock::iterator &I);
@@ -320,6 +274,8 @@ namespace {
        return X86::RFP80RegClass.contains(DstReg) ||
          X86::RFP80RegClass.contains(SrcReg);
      }
+
+    void setKillFlags(MachineBasicBlock &MBB) const;
    };
    char FPS::ID = 0;
  }
@@ -409,8 +365,8 @@ void FPS::bundleCFG(MachineFunction &MF) {
  bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
    bool Changed = false;
    MBB = &BB;
-  NumPendingSTs = 0;
  
+  setKillFlags(BB);
    setupBlockStack();
  
    for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
@@ -428,6 +384,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
          X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
        FPInstClass = X86II::SpecialFP;
  
+    if (MI->isCall())
+      FPInstClass = X86II::SpecialFP;
+
      if (FPInstClass == X86II::NotFP)
        continue;  // Efficiently ignore non-fp insts!
  
@@ -462,7 +421,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
      // after definition.  If so, pop them.
      for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
        unsigned Reg = DeadRegs[i];
-      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+      // Check if Reg is live on the stack. An inline-asm register operand that
+      // is in the clobber list and marked dead might not be live on the stack.
+      if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
          DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
          freeStackSlotAfter(I, Reg-X86::FP0);
        }
@@ -966,6 +927,31 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
  // Instruction transformation implementation
  //===----------------------------------------------------------------------===//
  
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  unsigned STReturns = 0;
+
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned R = MO.getReg() - X86::FP0;
+
+    if (R < 8) {
+      assert(MO.isDef() && MO.isImplicit());
+      STReturns |= 1 << R;
+    }
+  }
+
+  unsigned N = CountTrailingOnes_32(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0.
+  assert(STReturns == 0 || isMask_32(STReturns) && N <= 2);
+
+  for (unsigned I = 0; I < N; ++I)
+    pushReg(N - I - 1);
+}
+
  /// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
  ///
  void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
@@ -992,9 +978,6 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
    unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
    bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
  
-  if (KillsSrc)
-    duplicatePendingSTBeforeKill(Reg, I);
-
    // FISTP64m is strange because there isn't a non-popping versions.
    // If we have one _and_ we don't want to pop the operand, duplicate the value
    // on the stack instead of moving it.  This ensure that popping the value is
@@ -1015,7 +998,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
         MI->getOpcode() == X86::ISTT_Fp32m80 ||
         MI->getOpcode() == X86::ISTT_Fp64m80 ||
         MI->getOpcode() == X86::ST_FpP80m)) {
-    duplicateToTop(Reg, getScratchReg(), I);
+    duplicateToTop(Reg, ScratchFPReg, I);
    } else {
      moveToTop(Reg, I);            // Move to the top of the stack...
    }
@@ -1058,7 +1041,6 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
    bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
  
    if (KillsSrc) {
-    duplicatePendingSTBeforeKill(Reg, I);
      // If this is the last use of the source register, just make sure it's on
      // the top of the stack.
      moveToTop(Reg, I);
@@ -1316,69 +1298,20 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
  ///
  void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
    MachineInstr *MI = I;
+
+  if (MI->isCall()) {
+    handleCall(I);
+    return;
+  }
+
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unknown SpecialFP instruction!");
    case TargetOpcode::COPY: {
      // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
      const MachineOperand &MO1 = MI->getOperand(1);
      const MachineOperand &MO0 = MI->getOperand(0);
-    unsigned DstST = MO0.getReg() - X86::ST0;
-    unsigned SrcST = MO1.getReg() - X86::ST0;
      bool KillsSrc = MI->killsRegister(MO1.getReg());
  
-    // ST = COPY FP. Set up a pending ST register.
-    if (DstST < 8) {
-      unsigned SrcFP = getFPReg(MO1);
-      assert(isLive(SrcFP) && "Cannot copy dead register");
-      assert(!MO0.isDead() && "Cannot copy to dead ST register");
-
-      // Unallocated STs are marked as the nonexistent FP255.
-      while (NumPendingSTs <= DstST)
-        PendingST[NumPendingSTs++] = NumFPRegs;
-
-      // STi could still be live from a previous inline asm.
-      if (isScratchReg(PendingST[DstST])) {
-        DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
-                     << '\n');
-        freeStackSlotBefore(MI, PendingST[DstST]);
-      }
-
-      // When the source is killed, allocate a scratch FP register.
-      if (KillsSrc) {
-        duplicatePendingSTBeforeKill(SrcFP, I);
-        unsigned Slot = getSlot(SrcFP);
-        unsigned SR = getScratchReg();
-        PendingST[DstST] = SR;
-        Stack[Slot] = SR;
-        RegMap[SR] = Slot;
-      } else
-        PendingST[DstST] = SrcFP;
-      break;
-    }
-
-    // FP = COPY ST. Extract fixed stack value.
-    // Any instruction defining ST registers must have assigned them to a
-    // scratch register.
-    if (SrcST < 8) {
-      unsigned DstFP = getFPReg(MO0);
-      assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
-      assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
-      unsigned SrcFP = PendingST[SrcST];
-      assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
-      assert(isLive(SrcFP) && "Scratch holding ST is dead");
-
-      // DstFP steals the stack slot from SrcFP.
-      unsigned Slot = getSlot(SrcFP);
-      Stack[Slot] = DstFP;
-      RegMap[DstFP] = Slot;
-
-      // Always treat the ST as killed.
-      PendingST[SrcST] = NumFPRegs;
-      while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
-        --NumPendingSTs;
-      break;
-    }
-
      // FP <- FP copy.
      unsigned DstFP = getFPReg(MO0);
      unsigned SrcFP = getFPReg(MO1);
@@ -1406,36 +1339,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
      break;
    }
  
-  case X86::FpPOP_RETVAL: {
-    // The FpPOP_RETVAL instruction is used after calls that return a value on
-    // the floating point stack. We cannot model this with ST defs since CALL
-    // instructions have fixed clobber lists. This instruction is interpreted
-    // to mean that there is one more live register on the stack than we
-    // thought.
-    //
-    // This means that StackTop does not match the hardware stack between a
-    // call and the FpPOP_RETVAL instructions.  We do tolerate FP instructions
-    // between CALL and FpPOP_RETVAL as long as they don't overflow the
-    // hardware stack.
-    unsigned DstFP = getFPReg(MI->getOperand(0));
-
-    // Move existing stack elements up to reflect reality.
-    assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
-    if (StackTop) {
-      std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
-      for (unsigned i = 0; i != NumFPRegs; ++i)
-        ++RegMap[i];
-    }
-    ++StackTop;
-
-    // DstFP is the new bottom of the stack.
-    Stack[0] = DstFP;
-    RegMap[DstFP] = 0;
-
-    // DstFP will be killed by processBasicBlock if this was a dead def.
-    break;
-  }
-
    case TargetOpcode::INLINEASM: {
      // The inline asm MachineInstr currently only *uses* FP registers for the
      // 'f' constraint.  These should be turned into the current ST(x) register
@@ -1472,19 +1375,30 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
      // only tell clobbers from defs by looking at the asm descriptor.
      unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
      unsigned NumOps = 0;
+    SmallSet<unsigned, 1> FRegIdx;
+    unsigned RCID;
+
      for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
           i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
        unsigned Flags = MI->getOperand(i).getImm();
+
        NumOps = InlineAsm::getNumOperandRegisters(Flags);
        if (NumOps != 1)
          continue;
        const MachineOperand &MO = MI->getOperand(i + 1);
        if (!MO.isReg())
          continue;
-      unsigned STReg = MO.getReg() - X86::ST0;
+      unsigned STReg = MO.getReg() - X86::FP0;
        if (STReg >= 8)
          continue;
  
+      // If the flag has a register class constraint, this must be an operand
+      // with constraint "f". Record its index and continue.
+      if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+        FRegIdx.insert(i + 1);
+        continue;
+      }
+
        switch (InlineAsm::getKind(Flags)) {
        case InlineAsm::Kind_RegUse:
          STUses |= (1u << STReg);
@@ -1527,71 +1441,42 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
      DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
                   << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
  
-    // Scan the instruction for FP uses corresponding to "f" constraints.
-    // Collect FP registers to kill afer the instruction.
-    // Always kill all the scratch regs.
+#ifndef NDEBUG
+    // If any input operand uses constraint "f", all output register
+    // constraints must be early-clobber defs.
+    for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+      if (FRegIdx.count(I)) {
+        assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+               "Operands with constraint \"f\" cannot overlap with defs");
+      }
+#endif
+
+    // Collect all FP registers (register operands with constraints "t", "u",
+    // and "f") to kill afer the instruction.
      unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
-    unsigned FPUsed = 0;
      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
        MachineOperand &Op = MI->getOperand(i);
        if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
          continue;
-      if (!Op.isUse())
-        MI->emitError("illegal \"f\" output constraint");
        unsigned FPReg = getFPReg(Op);
-      FPUsed |= 1U << FPReg;
  
        // If we kill this operand, make sure to pop it from the stack after the
        // asm.  We just remember it for now, and pop them all off at the end in
        // a batch.
-      if (Op.isKill())
+      if (Op.isUse() && Op.isKill())
          FPKills |= 1U << FPReg;
      }
  
-    // The popped inputs will be killed by the instruction, so duplicate them
-    // if the FP register needs to be live after the instruction, or if it is
-    // used in the instruction itself. We effectively treat the popped inputs
-    // as early clobbers.
-    for (unsigned i = 0; i < NumSTPopped; ++i) {
-      if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
-        continue;
-      unsigned SR = getScratchReg();
-      duplicateToTop(PendingST[i], SR, I);
-      DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
-                   << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
-      PendingST[i] = SR;
-    }
-
-    // Make sure we have a unique live register for every fixed use. Some of
-    // them could be undef uses, and we need to emit LD_F0 instructions.
-    for (unsigned i = 0; i < NumSTUses; ++i) {
-      if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
-        // Check for shared assignments.
-        for (unsigned j = 0; j < i; ++j) {
-          if (PendingST[j] != PendingST[i])
-            continue;
-          // STi and STj are inn the same register, create a copy.
-          unsigned SR = getScratchReg();
-          duplicateToTop(PendingST[i], SR, I);
-          DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
-                       << unsigned(PendingST[i])
-                       << " to avoid collision with ST" << j << '\n');
-          PendingST[i] = SR;
-        }
-        continue;
-      }
-      unsigned SR = getScratchReg();
-      DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
-      BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
-      pushReg(SR);
-      PendingST[i] = SR;
-      if (NumPendingSTs == i)
-        ++NumPendingSTs;
-    }
-    assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+    // Do not include registers that are implicitly popped by defs/clobbers.
+    FPKills &= ~(STDefs | STClobbers);
  
      // Now we can rearrange the live registers to match what was requested.
-    shuffleStackTop(PendingST, NumPendingSTs, I);
+    unsigned char STUsesArray[8];
+
+    for (unsigned I = 0; I < NumSTUses; ++I)
+      STUsesArray[I] = I;
+
+    shuffleStackTop(STUsesArray, NumSTUses, I);
      DEBUG({dbgs() << "Before asm: "; dumpStack();});
  
      // With the stack layout fixed, rewrite the FP registers.
@@ -1599,36 +1484,22 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
        MachineOperand &Op = MI->getOperand(i);
        if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
          continue;
+
        unsigned FPReg = getFPReg(Op);
-      Op.setReg(getSTReg(FPReg));
+
+      if (FRegIdx.count(i))
+        // Operand with constraint "f".
+        Op.setReg(getSTReg(FPReg));
+      else
+        // Operand with a single register class constraint ("t" or "u").
+        Op.setReg(X86::ST0 + FPReg);
      }
  
      // Simulate the inline asm popping its inputs and pushing its outputs.
      StackTop -= NumSTPopped;
  
-    // Hold the fixed output registers in scratch FP registers. They will be
-    // transferred to real FP registers by copies.
-    NumPendingSTs = 0;
-    for (unsigned i = 0; i < NumSTDefs; ++i) {
-      unsigned SR = getScratchReg();
-      pushReg(SR);
-      FPKills &= ~(1u << SR);
-    }
      for (unsigned i = 0; i < NumSTDefs; ++i)
-      PendingST[NumPendingSTs++] = getStackEntry(i);
-    DEBUG({dbgs() << "After asm: "; dumpStack();});
-
-    // If any of the ST defs were dead, pop them immediately. Our caller only
-    // handles dead FP defs.
-    MachineBasicBlock::iterator InsertPt = MI;
-    for (unsigned i = 0; STDefs & (1u << i); ++i) {
-      if (!(STDeadDefs & (1u << i)))
-        continue;
-      freeStackSlotAfter(InsertPt, PendingST[i]);
-      PendingST[i] = NumFPRegs;
-    }
-    while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
-      --NumPendingSTs;
+      pushReg(NumSTDefs - i - 1);
  
      // If this asm kills any FP registers (is the last use of them) we must
      // explicitly emit pop instructions for them.  Do this now after the asm has
@@ -1640,9 +1511,10 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
      while (FPKills) {
        unsigned FPReg = countTrailingZeros(FPKills);
        if (isLive(FPReg))
-        freeStackSlotAfter(InsertPt, FPReg);
+        freeStackSlotAfter(I, FPReg);
        FPKills &= ~(1U << FPReg);
      }
+
      // Don't delete the inline asm!
      return;
    }
@@ -1738,7 +1610,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
  
        // Duplicate the TOS so that we return it twice.  Just pick some other FPx
        // register to hold it.
-      unsigned NewReg = getScratchReg();
+      unsigned NewReg = ScratchFPReg;
        duplicateToTop(FirstFPRegOp, NewReg, MI);
        FirstFPRegOp = NewReg;
      }
@@ -1771,3 +1643,41 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
    } else
      --I;
  }
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+  const TargetRegisterInfo *TRI = MBB.getParent()->getTarget()
+    .getRegisterInfo();
+  LivePhysRegs LPR(TRI);
+
+  LPR.addLiveOuts(&MBB);
+
+  for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+       I != E; ++I) {
+    BitVector Defs(8);
+    SmallVector<MachineOperand *, 2> Uses;
+    MachineInstr &MI = *I;
+
+    for (auto &MO : I->operands()) {
+      if (!MO.isReg())
+        continue;
+
+      unsigned Reg = MO.getReg() - X86::FP0;
+
+      if (Reg >= 8)
+        continue;
+
+      if (MO.isDef()) {
+        Defs.set(Reg);
+        if (!LPR.contains(MO.getReg()))
+          MO.setIsDead();
+      } else
+        Uses.push_back(&MO);
+    }
+
+    for (auto *MO : Uses)
+      if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+        MO->setIsKill();
+
+    LPR.stepBackward(MI);
+  }
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index dabfdb60796a3d0473f78b5968de444800b2a106..d2c410240da696443ba32fcb0b7bc971663d10be 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1968,8 +1968,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
  
      // Returns in ST0/ST1 are handled specially: these are pushed as operands to
      // the RET instruction and handled by the FP Stackifier.
-    if (VA.getLocReg() == X86::ST0 ||
-        VA.getLocReg() == X86::ST1) {
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
        // If this is a copy from an xmm register to ST(0), use an FPExtend to
        // change the value to the FP stack register class.
        if (isScalarFPTypeInSSEReg(VA.getValVT()))
@@ -2107,33 +2107,21 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
        report_fatal_error("SSE register return with SSE disabled");
      }
  
-    SDValue Val;
-
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a CopyFromReg is not good enough - the copy instruction may be eliminated
-    // if the return value is not used. We use the FpPOP_RETVAL instruction
-    // instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
-      SDValue Ops[] = { Chain, InFlag };
-      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
-                                         MVT::Other, MVT::Glue, Ops), 1);
-      Val = Chain.getValue(0);
-
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register.
-      if (CopyVT != VA.getValVT())
-        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
-                          // This truncation won't change the value.
-                          DAG.getIntPtrConstant(1));
-    } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                 CopyVT, InFlag).getValue(1);
-      Val = Chain.getValue(0);
-    }
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT()))
+      CopyVT = MVT::f80;
+
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               CopyVT, InFlag).getValue(1);
+    SDValue Val = Chain.getValue(0);
+
+    if (CopyVT != VA.getValVT())
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1));
+
      InFlag = Chain.getValue(2);
      InVals.push_back(Val);
    }
@@ -3291,7 +3279,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
        CCValAssign &VA = RVLocs[i];
-      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
          return false;
      }
    }
@@ -23078,14 +23066,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
          Constraint[5] == ')' &&
          Constraint[6] == '}') {
  
-      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.first = X86::FP0+Constraint[4]-'0';
        Res.second = &X86::RFP80RegClass;
        return Res;
      }
  
      // GCC allows "st(0)" to be called just plain "st".
      if (StringRef("{st}").equals_lower(Constraint)) {
-      Res.first = X86::ST0;
+      Res.first = X86::FP0;
        Res.second = &X86::RFP80RegClass;
        return Res;
      }
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td

index ca4f608a6b80d4892a7f49365c28738957f135aa..fb69255d5ef3b6cc9267457672571aea9ace099c 100644 (file)
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -407,7 +407,8 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
  // All calls clobber the non-callee saved registers. ESP is marked as
  // a use to prevent stack-pointer assignments that appear immediately
  // before calls from potentially appearing dead.
-let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
@@ -426,7 +427,8 @@ def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
  // a use to prevent stack-pointer assignments that appear immediately
  // before calls from potentially appearing dead.
  let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
-            FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td

index 4ad7b7e32a0ad42e90768b9214fcedd85379fa76..d9f173e1451a72d750ccbae67fbffe65c25cdf68 100644 (file)
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -114,9 +114,6 @@ let usesCustomInserter = 1 in {  // Expanded after instruction selection.
  // a pattern) and the FPI instruction should have emission info (e.g. opcode
  // encoding and asm printing info).
  
-// Pseudo Instruction for FP stack return values.
-def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
-
  // FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
  // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
  // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td

index 0da98637496e7e73d25492edd10c22ba42d7df56..311a7178149aaf25ab30fce6b0e37361ecc04da9 100644 (file)
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -166,6 +166,7 @@ def FP3 : X86Reg<"fp3", 0>;
  def FP4 : X86Reg<"fp4", 0>;
  def FP5 : X86Reg<"fp5", 0>;
  def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
  
  // XMM Registers, used by the various SSE instruction set extensions.
  def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
@@ -234,22 +235,18 @@ let SubRegIndices = [sub_ymm] in {
    def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
    def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
  
-class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
-  let Aliases = A;
-}
-
  // Floating point stack registers. These don't map one-to-one to the FP
  // pseudo registers, but we still mark them as aliasing FP registers. That
  // way both kinds can be live without exceeding the stack depth. ST registers
  // are only live around inline assembly.
-def ST0 : STRegister<"st(0)", 0, []>,    DwarfRegNum<[33, 12, 11]>;
-def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
-def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
-def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
-def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
-def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
-def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
-def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
  
  // Floating-point status word
  def FPSW : X86Reg<"fpsw", 0>;
diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll

index 91c477baaa51f48dafa8c7aa1f35139993f61bfd..bb3778a28116cb7d96606407a055e048aae09792 100644 (file)
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -340,3 +340,65 @@ entry:
    %0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
    ret i32 %0
  }
+
+; <rdar://problem/16952634>
+; X87 stackifier asserted when there was an ST register defined by an
+; inline-asm instruction and the ST register was live across another
+; inline-asm instruction.
+;
+; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; %FP0<def> = COPY %ST0
+
+; CHECK-LABEL: _test_live_st
+; CHECK: ## InlineAsm Start
+; CHECK: frndint
+; CHECK: ## InlineAsm End
+; CHECK: ## InlineAsm Start
+; CHECK: fldcw
+; CHECK: ## InlineAsm End
+
+%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
+%struct.anon1 = type { i32, i32, i32 }
+%struct.anon2 = type { i32, i32, i32, i32 }
+
+@fpu = external global %struct.fpu_t, align 16
+
+; Function Attrs: ssp
+define void @test_live_st(i32 %a1) {
+entry:
+  %0 = load x86_fp80* undef, align 16
+  %cond = icmp eq i32 %a1, 1
+  br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit
+
+sw.bb4.i:
+  %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0)
+  call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef)
+  br label %_Z5tointRKe.exit
+
+_Z5tointRKe.exit:
+  %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ]
+  %conv.i1814 = fptosi x86_fp80 %result.0.i to i32
+  %conv626 = sitofp i32 %conv.i1814 to x86_fp80
+  store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1)
+  br label %return
+
+return:
+  ret void
+}
+
+; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
+;
+; CHECK-LABEL: _test_operand_rewrite
+; CHECK: ## InlineAsm Start
+; CHECK: foo %st(0), %st(1)
+; CHECK: ## InlineAsm End
+
+define double @test_operand_rewrite() {
+entry:
+  %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { double, double } %0, 0
+  %asmresult1 = extractvalue { double, double } %0, 1
+  %sub = fsub double %asmresult, %asmresult1
+  ret double %sub
+}
author	Akira Hatanaka <ahatanaka@apple.com>
	Fri, 1 Aug 2014 22:19:41 +0000 (22:19 +0000)
committer	Akira Hatanaka <ahatanaka@apple.com>
	Fri, 1 Aug 2014 22:19:41 +0000 (22:19 +0000)
lib/Target/X86/X86CallingConv.td		patch \| blob \| history
lib/Target/X86/X86FastISel.cpp		patch \| blob \| history
lib/Target/X86/X86FloatingPoint.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86InstrCompiler.td		patch \| blob \| history
lib/Target/X86/X86InstrFPStack.td		patch \| blob \| history
lib/Target/X86/X86RegisterInfo.td		patch \| blob \| history
test/CodeGen/X86/inline-asm-fpstack.ll		patch \| blob \| history