ARM: Fix and re-enable load/store optimizer for Thumb1.

author Moritz Roth <moritz.roth@arm.com>

Fri, 15 Aug 2014 17:00:30 +0000 (17:00 +0000)

committer Moritz Roth <moritz.roth@arm.com>

Fri, 15 Aug 2014 17:00:30 +0000 (17:00 +0000)
author Moritz Roth <moritz.roth@arm.com>
Fri, 15 Aug 2014 17:00:30 +0000 (17:00 +0000)
committer Moritz Roth <moritz.roth@arm.com>
Fri, 15 Aug 2014 17:00:30 +0000 (17:00 +0000)
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp

index c9de2a7d670cdcb66aac30feaa1cd86dc9255efe..2d9223fe7e4b0661b0a90a2c9e94ae8a6f7317c3 100644 (file)
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -97,10 +97,6 @@ namespace {
      void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
                            const MemOpQueue &MemOps, unsigned DefReg,
                            unsigned RangeBegin, unsigned RangeEnd);
-    void UpdateBaseRegUses(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI,
-                           DebugLoc dl, unsigned Base, unsigned WordOffset,
-                           ARMCC::CondCodes Pred, unsigned PredReg);
      bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                    int Offset, unsigned Base, bool BaseKill, int Opcode,
                    ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
@@ -311,101 +307,6 @@ static bool isi32Store(unsigned Opc) {
    return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
  }
  
-static unsigned getImmScale(unsigned Opc) {
-  switch (Opc) {
-  default: llvm_unreachable("Unhandled opcode!");
-  case ARM::tLDRi:
-  case ARM::tSTRi:
-    return 1;
-  case ARM::tLDRHi:
-  case ARM::tSTRHi:
-    return 2;
-  case ARM::tLDRBi:
-  case ARM::tSTRBi:
-    return 4;
-  }
-}
-
-/// Update future uses of the base register with the offset introduced
-/// due to writeback. This function only works on Thumb1.
-void
-ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   DebugLoc dl, unsigned Base,
-                                   unsigned WordOffset,
-                                   ARMCC::CondCodes Pred, unsigned PredReg) {
-  assert(isThumb1 && "Can only update base register uses for Thumb1!");
-
-  // Start updating any instructions with immediate offsets. Insert a sub before
-  // the first non-updateable instruction (if any).
-  for (; MBBI != MBB.end(); ++MBBI) {
-    if (MBBI->readsRegister(Base)) {
-      unsigned Opc = MBBI->getOpcode();
-      int Offset;
-      bool InsertSub = false;
-
-      if (Opc == ARM::tLDRi  || Opc == ARM::tSTRi  ||
-          Opc == ARM::tLDRHi || Opc == ARM::tSTRHi ||
-          Opc == ARM::tLDRBi || Opc == ARM::tSTRBi) {
-        // Loads and stores with immediate offsets can be updated, but only if
-        // the new offset isn't negative.
-        // The MachineOperand containing the offset immediate is the last one
-        // before predicates.
-        MachineOperand &MO =
-          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
-        // The offsets are scaled by 1, 2 or 4 depending on the Opcode
-        Offset = MO.getImm() - WordOffset * getImmScale(Opc);
-        if (Offset >= 0)
-          MO.setImm(Offset);
-        else
-          InsertSub = true;
-
-      } else if (Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) {
-        // SUB/ADD using this register. Merge it with the update.
-        // If the merged offset is too large, insert a new sub instead.
-        MachineOperand &MO =
-          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
-        Offset = (Opc == ARM::tSUBi8) ?
-          MO.getImm() + WordOffset * 4 :
-          MO.getImm() - WordOffset * 4 ;
-        if (TL->isLegalAddImmediate(Offset)) {
-          MO.setImm(Offset);
-          // The base register has now been reset, so exit early.
-          return;
-        } else {
-          InsertSub = true;
-        }
-
-      } else {
-        // Can't update the instruction.
-        InsertSub = true;
-      }
-
-      if (InsertSub) {
-        // An instruction above couldn't be updated, so insert a sub.
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base))
-          .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
-          .addImm(Pred).addReg(PredReg);
-        return;
-      }
-    }
-
-    if (MBBI->killsRegister(Base))
-      // Register got killed. Stop updating.
-      return;
-  }
-
-  // The end of the block was reached. This means register liveness escapes the
-  // block, and it's necessary to insert a sub before the last instruction.
-  if (MBB.succ_size() > 0)
-    // But only insert the SUB if there is actually a successor block.
-    // FIXME: Check more carefully if register is live at this point, e.g. by
-    // also examining the successor block's register liveness information.
-    AddDefaultT1CC(BuildMI(MBB, --MBBI, dl, TII->get(ARM::tSUBi8), Base))
-      .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
-      .addImm(Pred).addReg(PredReg);
-}
-
  /// MergeOps - Create and insert a LDM or STM with Base as base register and
  /// registers in Regs as the register operands that would be loaded / stored.
  /// It returns true if the transformation is done.
@@ -512,6 +413,14 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
          break;
        }
  
+  // If the merged instruction has writeback and the base register is not killed
+  // it's not safe to do the merge on Thumb1. This is because resetting the base
+  // register writeback by inserting a SUBS sets the condition flags.
+  // FIXME: Try something clever here to see if resetting the base register can
+  // be avoided, e.g. by updating a later ADD/SUB of the base register with the
+  // writeback.
+  if (isThumb1 && Writeback && !BaseKill) return false;
+
    MachineInstrBuilder MIB;
  
    if (Writeback) {
@@ -524,12 +433,6 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
      // Thumb1: we might need to set base writeback when building the MI.
      MIB.addReg(Base, getDefRegState(true))
         .addReg(Base, getKillRegState(BaseKill));
-
-    // The base isn't dead after a merged instruction with writeback. Update
-    // future uses of the base with the added offset (if possible), or reset
-    // the base register as necessary.
-    if (!BaseKill)
-      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
    } else {
      // No writeback, simply build the MachineInstr.
      MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
@@ -1740,12 +1643,6 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
    isThumb2 = AFI->isThumb2Function();
    isThumb1 = AFI->isThumbFunction() && !isThumb2;
  
-  // FIXME: Temporarily disabling for Thumb-1 due to miscompiles
-  if (isThumb1) {
-    delete RS;
-    return false;
-  }
-
    bool Modified = false;
    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
         ++MFI) {
diff --git a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll

index ae663697ebeb440a10f224bc39641458d2cd32a9..18e17766313b327c5716d8cadea09e8e8f3188f0 100644 (file)
--- a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
+++ b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
@@ -1,12 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
  
  define void @foo(i32* %A) #0 {
  entry:
  ; CHECK-LABEL: foo:
  ; CHECK: push {r7, lr}
-; CHECK: ldm [[REG0:r[0-9]]]!,
-; CHECK-NEXT: subs [[REG0]]
+; CHECK: ldr
+; CHECK-NEXT: ldr
  ; CHECK-NEXT: bl
    %0 = load i32* %A, align 4
    %arrayidx1 = getelementptr inbounds i32* %A, i32 1
diff --git a/test/CodeGen/Thumb/dyn-stackalloc.ll b/test/CodeGen/Thumb/dyn-stackalloc.ll

index 6c6de55347a4e8495e5676ce40e27427f0630d8b..706a21350d05f8d243ac9dc07195021abfdf7257 100644 (file)
--- a/test/CodeGen/Thumb/dyn-stackalloc.ll
+++ b/test/CodeGen/Thumb/dyn-stackalloc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
  
         %struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
         %struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
diff --git a/test/CodeGen/Thumb/ldm-merge-call.ll b/test/CodeGen/Thumb/ldm-merge-call.ll

new file mode 100644 (file)

index 0000000..febc96b
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-merge-call.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+; Function Attrs: nounwind optsize
+define void @foo(i32* nocapture readonly %A) #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: ldm r[[BASE:[0-9]]]!,
+; CHECK-NEXT: mov r[[BASE]],
+  %0 = load i32* %A, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %call = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  %call2 = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  ret void
+}
+
+; Function Attrs: optsize
+declare i32 @bar(i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind optsize }
diff --git a/test/CodeGen/Thumb/ldm-merge-struct.ll b/test/CodeGen/Thumb/ldm-merge-struct.ll

new file mode 100644 (file)

index 0000000..2f732e0
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-merge-struct.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+%struct.S = type { i32, i32 }
+
+@s = common global %struct.S zeroinitializer, align 4
+
+define i32 @f() {
+entry:
+; CHECK-LABEL: f:
+; CHECK: ldm r[[BASE:[0-9]]],
+; CHECK-NEXT-NOT: subs r[[BASE]]
+  %0 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 0), align 4
+  %1 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 1), align 4
+  %cmp = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  ret i32 %cond
+}
diff --git a/test/CodeGen/Thumb/stm-merge.ll b/test/CodeGen/Thumb/stm-merge.ll

new file mode 100644 (file)

index 0000000..76e71f4
--- /dev/null
+++ b/test/CodeGen/Thumb/stm-merge.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+@d = internal unnamed_addr global i32 0, align 4
+@c = internal global i32* null, align 4
+@e = internal unnamed_addr global i32* null, align 4
+
+; Function Attrs: nounwind optsize
+define void @fn1() #0 {
+entry:
+; CHECK-LABEL: fn1:
+; CHECK: stm r[[BASE:[0-9]]]!, {{.*}}
+; CHECK-NOT: {{.*}} r[[BASE]]
+; CHECK: ldr r[[BASE]], {{.*}}
+  %g = alloca i32, align 4
+  %h = alloca i32, align 4
+  store i32 1, i32* %g, align 4
+  store i32 0, i32* %h, align 4
+  %.pr = load i32* @d, align 4
+  %cmp11 = icmp slt i32 %.pr, 1
+  br i1 %cmp11, label %for.inc.lr.ph, label %for.body5
+
+for.inc.lr.ph:                                    ; preds = %entry
+  store i32 1, i32* @d, align 4
+  br label %for.body5
+
+for.body5:                                        ; preds = %entry, %for.inc.lr.ph, %for.body5
+  %f.010 = phi i32 [ %inc7, %for.body5 ], [ 0, %for.inc.lr.ph ], [ 0, %entry ]
+  store volatile i32* %g, i32** @c, align 4
+  %inc7 = add nsw i32 %f.010, 1
+  %exitcond = icmp eq i32 %inc7, 2
+  br i1 %exitcond, label %for.end8, label %for.body5
+
+for.end8:                                         ; preds = %for.body5
+  store i32* %h, i32** @e, align 4
+  ret void
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb/thumb-ldm.ll b/test/CodeGen/Thumb/thumb-ldm.ll

index 95f3edc9c4c33d296514953747c137765e5b680f..dd98e6f300ea775c21f0bf1ce9569210890d63b5 100644 (file)
--- a/test/CodeGen/Thumb/thumb-ldm.ll
+++ b/test/CodeGen/Thumb/thumb-ldm.ll
@@ -1,5 +1,4 @@
  ; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
  
  @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
  
diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll

index dedc82bf68ce68fe82af78654f382293439dc92f..e1e38739f1d2b953daa38eccbcf747b38b12fbd5 100644 (file)
--- a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
+++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
@@ -1,5 +1,4 @@
  ; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s
-; XFAIL: *
  
  @d = external global [64 x i32]
  @s = external global [64 x i32]
@@ -7,29 +6,19 @@
  ; Function Attrs: nounwind
  define void @t1() #0 {
  entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: subs [[REG0]], #32
-; CHECK-NEXT: ldrb
-; CHECK: subs [[REG1]], #32
-; CHECK-NEXT: strb
-    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 33, i32 4, i1 false)
+; CHECK-LABEL: t1
+; CHECK-NOT: ldm
+; CHECK-NOT: stm
+    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
      ret void
  }
  
  ; Function Attrs: nounwind
  define void @t2() #0 {
  entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: ldrh
-; CHECK: ldrb
-; CHECK: strb
-; CHECK: strh
+; CHECK-LABEL: t2:
+; CHECK-NOT: ldm
+; CHECK-NOT: stm
      tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
      ret void
  }
author	Moritz Roth <moritz.roth@arm.com>
	Fri, 15 Aug 2014 17:00:30 +0000 (17:00 +0000)
committer	Moritz Roth <moritz.roth@arm.com>
	Fri, 15 Aug 2014 17:00:30 +0000 (17:00 +0000)
lib/Target/ARM/ARMLoadStoreOptimizer.cpp		patch \| blob \| history
test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll		patch \| blob \| history
test/CodeGen/Thumb/dyn-stackalloc.ll		patch \| blob \| history
test/CodeGen/Thumb/ldm-merge-call.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/Thumb/ldm-merge-struct.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/Thumb/stm-merge.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/Thumb/thumb-ldm.ll		patch \| blob \| history
test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll		patch \| blob \| history