[X86] Convert esp-relative movs of function arguments into pushes, step 1

author Michael Kuperstein <michael.m.kuperstein@intel.com>

Tue, 9 Dec 2014 06:10:44 +0000 (06:10 +0000)

committer Michael Kuperstein <michael.m.kuperstein@intel.com>

Tue, 9 Dec 2014 06:10:44 +0000 (06:10 +0000)
author Michael Kuperstein <michael.m.kuperstein@intel.com>
Tue, 9 Dec 2014 06:10:44 +0000 (06:10 +0000)
committer Michael Kuperstein <michael.m.kuperstein@intel.com>
Tue, 9 Dec 2014 06:10:44 +0000 (06:10 +0000)
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp

index e7f9c11dbcfc0a5f84aa845b4ac6606a6bd1353e..a3a3dce87fa4dac1c5895ad08fde4047253a4ead 100644 (file)
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -93,6 +93,15 @@ static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
    return X86::AND32ri;
  }
  
+/// Pick the PUSH-immediate opcode used to push \p Imm on a 32-bit target.
+/// PUSH32i8 is chosen when isInt<8>(Imm) holds, PUSHi32 otherwise.
+static unsigned getPUSHiOpcode(bool IsLP64, int64_t Imm) {
+  // We don't support LP64 for now.
+  assert(!IsLP64);
+
+  return isInt<8>(Imm) ? X86::PUSH32i8 : X86::PUSHi32;
+}
+
  static unsigned getLEArOpcode(unsigned IsLP64) {
    return IsLP64 ? X86::LEA64r : X86::LEA32r;
  }
@@ -1802,6 +1811,103 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
  #endif
  }
  
+/// Try to turn the esp-relative MOV32mi/MOV32mr argument stores starting at
+/// \p I into PUSHes before the call; the part of \p Amount not covered by the
+/// pushes still gets a SUB. Returns true on success, false with MBB untouched.
+bool X86FrameLowering::
+convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, uint64_t Amount) const {
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+    MF.getSubtarget().getRegisterInfo());
+  unsigned StackPtr = RegInfo.getStackRegister();
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
+  // instructions, that push a sequence of 32-bit values onto the stack, with
+  // no gaps. Test the iterator first so we never dereference MBB.end().
+  std::map<int64_t, MachineBasicBlock::iterator> MovMap;
+  for (; I != MBB.end(); ++I) {
+    int Opcode = I->getOpcode();
+    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+      break;
+
+    // We only want movs of the form:
+    // movl imm/r32, k(%esp)
+    // If we run into something else, bail
+    // Note that AddrBaseReg may, counterintuitively, not be a register...
+    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return false;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+
+    // We don't want to consider the unaligned case.
+    if (StackDisp % 4)
+      return false;
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (!MovMap.insert(std::make_pair(StackDisp, I)).second)
+      return false;
+  }
+
+  // We now expect the end of the sequence - a call and a stack adjust.
+  if (I == MBB.end() || !I->isCall())
+    return false;
+  MachineBasicBlock::iterator Call = I;
+  if (++I == MBB.end() || I->getOpcode() != TII.getCallFrameDestroyOpcode())
+    return false;
+
+  // Now, go through the map, and see that we don't have any gaps, but only
+  // a series of 32-bit MOVs. Since std::map provides ordered iteration, the
+  // original order of the MOVs doesn't matter.
+  int64_t ExpectedDist = 0;
+  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;
+       ++MMI, ExpectedDist += 4)
+    if (MMI->first != ExpectedDist)
+      return false;
+
+  // Bail if the pushes would exceed Amount, or the subtraction would wrap.
+  if (static_cast<uint64_t>(ExpectedDist) > Amount)
+    return false;
+
+  // Ok, everything looks fine. Do the transformation.
+  DebugLoc DL = I->getDebugLoc();
+
+  // It's possible the original stack adjustment amount was larger than
+  // that done by the pushes. If so, we still need a SUB.
+  Amount -= ExpectedDist;
+  if (Amount) {
+    MachineInstr* Sub = BuildMI(MBB, Call, DL,
+                          TII.get(getSUBriOpcode(false, Amount)), StackPtr)
+                  .addReg(StackPtr).addImm(Amount);
+    Sub->getOperand(3).setIsDead();
+  }
+
+  // Now, iterate through the map in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
+  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
+    MachineBasicBlock::iterator MOV = MMI->second;
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    if (MOV->getOpcode() == X86::MOV32mi) {
+      int64_t Val = PushOp.getImm();
+      BuildMI(MBB, Call, DL, TII.get(getPUSHiOpcode(false, Val)))
+        .addImm(Val);
+    } else {
+      BuildMI(MBB, Call, DL, TII.get(X86::PUSH32r))
+        .addReg(PushOp.getReg());
+    }
+    MBB.erase(MOV);
+  }
+
+  return true;
+}
+
  void X86FrameLowering::
  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I) const {
@@ -1809,21 +1915,20 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
    const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
                                         MF.getSubtarget().getRegisterInfo());
    unsigned StackPtr = RegInfo.getStackRegister();
-  bool reseveCallFrame = hasReservedCallFrame(MF);
+  bool reserveCallFrame = hasReservedCallFrame(MF);
    int Opcode = I->getOpcode();
    bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
    const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
    bool IsLP64 = STI.isTarget64BitLP64();
    DebugLoc DL = I->getDebugLoc();
-  uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
    uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
    I = MBB.erase(I);
  
-  if (!reseveCallFrame) {
+  if (!reserveCallFrame) {
      // If the stack pointer can be changed after prologue, turn the
      // adjcallstackup instruction into a 'sub ESP, <amt>' and the
      // adjcallstackdown instruction into 'add ESP, <amt>'
-    // TODO: consider using push / pop instead of sub + store / add
      if (Amount == 0)
        return;
  
@@ -1838,6 +1943,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
  
      MachineInstr *New = nullptr;
      if (Opcode == TII.getCallFrameSetupOpcode()) {
+      // Try to convert movs to the stack into pushes.
+      // We currently only look for a pattern that appears in 32-bit
+      // calling conventions.
+      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
+        return;
+
        New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
                      StackPtr)
          .addReg(StackPtr)
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h

index 2ee71159c190b197d79e7dd05b09473a0756cdcb..ee0ee227cad862058d126b205489f3cd9b50a225 100644 (file)
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -76,6 +76,16 @@ public:
    void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                   MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI) const override;
+
+private:
+  /// convertArgMovsToPushes - Try to convert a call sequence that uses sub
+  /// and mov instructions to put the arguments onto the stack into a series
+  /// of pushes. \p I is where the scan for movs starts and \p Amount is the
+  /// stack adjustment for the call. Returns true if the transformation succeeded.
+  bool convertArgMovsToPushes(MachineFunction &MF, 
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I, 
+                              uint64_t Amount) const;
  };
  
  } // End llvm namespace
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll

index 95defc83db1f3d4f172c1da84407ff73012bd946..bd9806943920bb0392adef5e32cc7626d218e504 100644 (file)
--- a/test/CodeGen/X86/force-align-stack-alloca.ll
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -33,14 +33,14 @@ define i64 @g(i32 %i) nounwind {
  ; CHECK-NOT:         {{[^ ,]*}}, %esp
  ;
  ; Next we set up the memset call, and then undo it.
-; CHECK:      subl   $32, %esp
+; CHECK:      subl   $20, %esp
  ; CHECK-NOT:         {{[^ ,]*}}, %esp
  ; CHECK:      calll  memset
  ; CHECK-NEXT: addl   $32, %esp
  ; CHECK-NOT:         {{[^ ,]*}}, %esp
  ;
  ; Next we set up the call to 'f'.
-; CHECK:      subl   $32, %esp
+; CHECK:      subl   $28, %esp
  ; CHECK-NOT:         {{[^ ,]*}}, %esp
  ; CHECK:      calll  f
  ; CHECK-NEXT: addl   $32, %esp
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll

index 7cfa929135786d5649dff2253d8f3d91a4f5b56b..b1781d30f91364f08f72018d8ffa4a68de19e61f 100644 (file)
--- a/test/CodeGen/X86/inalloca-ctor.ll
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -17,16 +17,16 @@ entry:
  ; CHECK: movl %esp,
    call void @Foo_ctor(%Foo* %c)
  ; CHECK: leal 12(%{{.*}}),
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
  ; CHECK: addl $4, %esp
    %b = getelementptr %frame* %args, i32 0, i32 1
    store i32 42, i32* %b
  ; CHECK: movl $42,
    %a = getelementptr %frame* %args, i32 0, i32 0
    call void @Foo_ctor(%Foo* %a)
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
  ; CHECK: addl $4, %esp
    call void @f(%frame* inalloca %args)
  ; CHECK: calll   _f
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll

index 6cff9ac0640c6e0b53538cf368f73c955d3dfdc4..b56f24d996286c66f8977f6e324278e3c1b3120c 100644 (file)
--- a/test/CodeGen/X86/inalloca-invoke.ll
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -37,7 +37,7 @@ blah:
  invoke.cont:
    call void @begin(%Iter* sret %beg)
  
-; CHECK:  movl %[[beg]],
+; CHECK:  pushl %[[beg]]
  ; CHECK:  calll _begin
  
    invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args)
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll

index 54f97d99a9c7450a85e24636138eb4dafbdb86fc..e5b07e262c7b78db4ffd9dd488522eef3ce3dc82 100644 (file)
--- a/test/CodeGen/X86/inalloca-stdcall.ll
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -19,7 +19,7 @@ define void @g() {
    call x86_stdcallcc void @f(%Foo* inalloca %b)
  ; CHECK: calll   _f@8
  ; CHECK-NOT: %esp
-; CHECK: subl $4, %esp
+; CHECK: pushl
  ; CHECK: calll   _i@4
    call x86_stdcallcc void @i(i32 0)
    ret void
diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll

index dd7f3964020414e4dbc924fac5439bf4a6d44caa..9a6de3dd1d9226754297d061a1db5a99e41bbfd2 100644 (file)
--- a/test/CodeGen/X86/mem-intrin-base-reg.ll
+++ b/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -63,7 +63,7 @@ spill_vectors:
  ; CHECK-LABEL: _memcpy_vla_vector:
  ; CHECK: andl $-16, %esp
  ; CHECK: movl %esp, %esi
-; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: pushl $128
  ; CHECK: calll _memcpy
  ; CHECK: calll __chkstk
  
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll

new file mode 100644 (file)

index 0000000..6a848b7
--- /dev/null
+++ b/test/CodeGen/X86/movtopush.ll
@@ -0,0 +1,97 @@
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL\r
+; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED \r
+declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)\r
+declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)\r
+\r
+; Here, we should have a reserved frame, so we don't expect pushes\r
+; NORMAL-LABEL: test1\r
+; NORMAL: subl    $16, %esp\r
+; NORMAL-NEXT: movl    $4, 12(%esp)\r
+; NORMAL-NEXT: movl    $3, 8(%esp)\r
+; NORMAL-NEXT: movl    $2, 4(%esp)\r
+; NORMAL-NEXT: movl    $1, (%esp)\r
+; NORMAL-NEXT: call\r
+define void @test1() {\r
+entry:\r
+  call void @good(i32 1, i32 2, i32 3, i32 4)\r
+  ret void\r
+}\r
+\r
+; Here, we expect a sequence of 4 immediate pushes\r
+; NORMAL-LABEL: test2\r
+; NORMAL-NOT: subl {{.*}} %esp\r
+; NORMAL: pushl   $4\r
+; NORMAL-NEXT: pushl   $3\r
+; NORMAL-NEXT: pushl   $2\r
+; NORMAL-NEXT: pushl   $1\r
+; NORMAL-NEXT: call\r
+define void @test2(i32 %k) {\r
+entry:\r
+  %a = alloca i32, i32 %k\r
+  call void @good(i32 1, i32 2, i32 3, i32 4)\r
+  ret void\r
+}\r
+\r
+; Again, we expect a sequence of 4 immediate pushes\r
+; Checks that we generate the right pushes for >8bit immediates\r
+; NORMAL-LABEL: test2b\r
+; NORMAL-NOT: subl {{.*}} %esp\r
+; NORMAL: pushl   $4096\r
+; NORMAL-NEXT: pushl   $3072\r
+; NORMAL-NEXT: pushl   $2048\r
+; NORMAL-NEXT: pushl   $1024\r
+; NORMAL-NEXT: call\r
+define void @test2b(i32 %k) {\r
+entry:\r
+  %a = alloca i32, i32 %k\r
+  call void @good(i32 1024, i32 2048, i32 3072, i32 4096)\r
+  ret void\r
+}\r
+\r
+; The first argument is a register, so the last push should push a register\r
+; NORMAL-LABEL: test3\r
+; NORMAL-NOT: subl {{.*}} %esp\r
+; NORMAL: pushl   $4\r
+; NORMAL-NEXT: pushl   $3\r
+; NORMAL-NEXT: pushl   $2\r
+; NORMAL-NEXT: pushl   %e{{..}}\r
+; NORMAL-NEXT: call\r
+define void @test3(i32 %k) {\r
+entry:\r
+  %a = alloca i32, i32 %k\r
+  call void @good(i32 %k, i32 2, i32 3, i32 4)\r
+  ret void\r
+}\r
+\r
+; We don't support weird calling conventions\r
+; NORMAL-LABEL: test4\r
+; NORMAL: subl    $12, %esp\r
+; NORMAL-NEXT: movl    $4, 8(%esp)\r
+; NORMAL-NEXT: movl    $3, 4(%esp)\r
+; NORMAL-NEXT: movl    $1, (%esp)\r
+; NORMAL-NEXT: movl    $2, %eax\r
+; NORMAL-NEXT: call\r
+define void @test4(i32 %k) {\r
+entry:\r
+  %a = alloca i32, i32 %k\r
+  call void @inreg(i32 1, i32 2, i32 3, i32 4)\r
+  ret void\r
+}\r
+\r
+; Check that additional alignment is added when the pushes\r
+; don't add up to the required alignment.\r
+; ALIGNED-LABEL: test5\r
+; ALIGNED: subl    $16, %esp\r
+; ALIGNED-NEXT: pushl   $4\r
+; ALIGNED-NEXT: pushl   $3\r
+; ALIGNED-NEXT: pushl   $2\r
+; ALIGNED-NEXT: pushl   $1\r
+; ALIGNED-NEXT: call\r
+define void @test5(i32 %k) {\r
+entry:\r
+  %a = alloca i32, i32 %k\r
+  call void @good(i32 1, i32 2, i32 3, i32 4)\r
+  ret void\r
+}\r
+\r
+\r
author	Michael Kuperstein <michael.m.kuperstein@intel.com>
	Tue, 9 Dec 2014 06:10:44 +0000 (06:10 +0000)
committer	Michael Kuperstein <michael.m.kuperstein@intel.com>
	Tue, 9 Dec 2014 06:10:44 +0000 (06:10 +0000)
lib/Target/X86/X86FrameLowering.cpp		patch \| blob \| history
lib/Target/X86/X86FrameLowering.h		patch \| blob \| history
test/CodeGen/X86/force-align-stack-alloca.ll		patch \| blob \| history
test/CodeGen/X86/inalloca-ctor.ll		patch \| blob \| history
test/CodeGen/X86/inalloca-invoke.ll		patch \| blob \| history
test/CodeGen/X86/inalloca-stdcall.ll		patch \| blob \| history
test/CodeGen/X86/mem-intrin-base-reg.ll		patch \| blob \| history
test/CodeGen/X86/movtopush.ll	[new file with mode: 0644]	patch \| blob