Modify how the formulae are rated in Loop Strength Reduce.

author Quentin Colombet <qcolombet@apple.com>

Fri, 31 May 2013 17:20:29 +0000 (17:20 +0000)

committer Quentin Colombet <qcolombet@apple.com>

Fri, 31 May 2013 17:20:29 +0000 (17:20 +0000)
author Quentin Colombet <qcolombet@apple.com>
Fri, 31 May 2013 17:20:29 +0000 (17:20 +0000)
committer Quentin Colombet <qcolombet@apple.com>
Fri, 31 May 2013 17:20:29 +0000 (17:20 +0000)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp

index c7344f6bbd76bc10a9e1c1c1f9deae7134bc54fd..ecc96ae0b22c9bb626098d2ced7fe65a27c2a29f 100644 (file)
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -773,6 +773,13 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
    return Changed;
  }
  
+namespace {
+class LSRUse;
+}
+// Check if it is legal to fold 2 base registers.
+static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
+                             const Formula &F);
+
  namespace {
  
  /// Cost - This class is used to measure and compare candidate formulae.
@@ -810,12 +817,14 @@ public:
      return NumRegs == ~0u;
    }
  
-  void RateFormula(const Formula &F,
+  void RateFormula(const TargetTransformInfo &TTI,
+                   const Formula &F,
                     SmallPtrSet<const SCEV *, 16> &Regs,
                     const DenseSet<const SCEV *> &VisitedRegs,
                     const Loop *L,
                     const SmallVectorImpl<int64_t> &Offsets,
                     ScalarEvolution &SE, DominatorTree &DT,
+                   const LSRUse &LU,
                     SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
  
    void print(raw_ostream &OS) const;
@@ -900,12 +909,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
    }
  }
  
-void Cost::RateFormula(const Formula &F,
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+                       const Formula &F,
                         SmallPtrSet<const SCEV *, 16> &Regs,
                         const DenseSet<const SCEV *> &VisitedRegs,
                         const Loop *L,
                         const SmallVectorImpl<int64_t> &Offsets,
                         ScalarEvolution &SE, DominatorTree &DT,
+                       const LSRUse &LU,
                         SmallPtrSet<const SCEV *, 16> *LoserRegs) {
    // Tally up the registers.
    if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -932,7 +943,9 @@ void Cost::RateFormula(const Formula &F,
    // Determine how many (unfolded) adds we'll need inside the loop.
    size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
    if (NumBaseParts > 1)
-    NumBaseAdds += NumBaseParts - 1;
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));
  
    // Tally up the non-zero immediates.
    for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -1359,6 +1372,30 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                      F.BaseOffset, F.HasBaseReg, F.Scale);
  }
  
+static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
+                             const Formula &F) {
+  // If F is used as an Addressing Mode, it may fold one Base plus one
+  // scaled register. If the scaled register is nil, do as if another
+  // element of the base regs is a 1-scaled register.
+  // This is possible if BaseRegs has at least 2 registers.
+
+  // If this is not an address calculation, this is not an addressing mode
+  // use.
+  if (LU.Kind !=  LSRUse::Address)
+    return false;
+
+  // F is already scaled.
+  if (F.Scale != 0)
+    return false;
+
+  // We need to keep one register for the base and one to scale.
+  if (F.BaseRegs.size() < 2)
+    return false;
+
+  return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                    F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
+ }
+
  static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                               LSRUse::KindType Kind, Type *AccessTy,
                               GlobalValue *BaseGV, int64_t BaseOffset,
@@ -3690,7 +3727,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
        // the corresponding bad register from the Regs set.
        Cost CostF;
        Regs.clear();
-      CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
+      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
                          &LoserRegs);
        if (CostF.isLoser()) {
          // During initial formula generation, undesirable formulae are generated
@@ -3726,7 +3763,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
  
          Cost CostBest;
          Regs.clear();
-        CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
+        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
+                             DT, LU);
          if (CostF < CostBest)
            std::swap(F, Best);
          DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -4079,7 +4117,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
      // the current best, prune the search at that point.
      NewCost = CurCost;
      NewRegs = CurRegs;
-    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
+    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
+                        LU);
      if (NewCost < SolutionCost) {
        Workspace.push_back(&F);
        if (Workspace.size() != Uses.size()) {
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll

index a7b036e9b65885d8579d58f15de0e4930627cfca..c33cac2e05a2310d1f3b96332a839b30f2d74985 100644 (file)
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -3,9 +3,8 @@
  ; RUN: not grep movz %t
  ; RUN: not grep sar %t
  ; RUN: not grep shl %t
-; RUN: grep add %t | count 1
-; RUN: grep inc %t | count 4
-; RUN: grep dec %t | count 2
+; RUN: grep add %t | count 5
+; RUN: grep inc %t | count 2
  ; RUN: grep lea %t | count 3
  
  ; Optimize away zext-inreg and sext-inreg on the loop induction
diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll

index 8fbddf8ae4c82c450d22b514831f146397291af5..652eb06225cea032bc3b931df9480ec832f3ef58 100644 (file)
--- a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -10,12 +10,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  ; Verify that nothing uses the "dead" ptrtoint from "undef".
  ; CHECK: @VerifyDiagnosticConsumerTest
  ; CHECK: bb:
-; CHECK: %0 = ptrtoint i8* undef to i64
-; CHECK-NOT: %0
+; "dead" ptrpoint not emitted (or dead code eliminated) with
+; current LSR cost model.
+; CHECK-NOT: = ptrtoint i8* undef to i64
  ; CHECK: .lr.ph
-; CHECK-NOT: %0
-; CHECK: sub i64 %7, %tmp6
-; CHECK-NOT: %0
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
+; CHECK: sub i64 [[TMP]], %tmp6
  ; CHECK: ret void
  define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
  bb:
diff --git a/test/Transforms/LoopStrengthReduce/uglygep.ll b/test/Transforms/LoopStrengthReduce/uglygep.ll

index 8af5cf1dfd726e4404798f1c91233308203238ca..10c77d5f645a715b16fdae7520305bc7a8d60e1a 100644 (file)
--- a/test/Transforms/LoopStrengthReduce/uglygep.ll
+++ b/test/Transforms/LoopStrengthReduce/uglygep.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | not grep uglygep
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
  
  ; LSR shouldn't consider %t8 to be an interesting user of %t6, and it
  ; should be able to form pretty GEPs.
@@ -6,6 +6,7 @@
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
  
  define void @Z4() nounwind {
+; CHECK: define void @Z4
  bb:
    br label %bb3
  
@@ -20,11 +21,26 @@ bb3:                                              ; preds = %bb2, %bb
    %t4 = phi i64 [ %t, %bb2 ], [ 0, %bb ]      ; <i64> [#uses=3]
    br label %bb1
  
+; CHECK: bb10:
+; CHECK-NEXT: %t7 = icmp eq i64 %t4, 0
+; Host %t2 computation outside the loop.
+; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8* undef, i64 %t4
+; CHECK-NEXT: br label %bb14
  bb10:                                             ; preds = %bb9
    %t7 = icmp eq i64 %t4, 0                    ; <i1> [#uses=1]
    %t3 = add i64 %t4, 16                     ; <i64> [#uses=1]
    br label %bb14
  
+; CHECK: bb14:
+; CHECK-NEXT: store i8 undef, i8* [[SCEVGEP]]
+; CHECK-NEXT: %t6 = load float** undef
+; Fold %t3's add within the address.
+; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float* %t6, i64 4
+; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float* [[SCEVGEP1]] to i8*
+; Use the induction variable (%t4) to access the right element
+; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8* [[SCEVGEP2]], i64 %t4
+; CHECK-NEXT: store i8 undef, i8* [[ADDRESS]]
+; CHECK-NEXT: br label %bb14
  bb14:                                             ; preds = %bb14, %bb10
    %t2 = getelementptr inbounds i8* undef, i64 %t4 ; <i8*> [#uses=1]
    store i8 undef, i8* %t2
@@ -36,9 +52,15 @@ bb14:                                             ; preds = %bb14, %bb10
  }
  
  define fastcc void @TransformLine() nounwind {
+; CHECK: @TransformLine
  bb:
    br label %loop0
  
+; CHECK: loop0:
+; Induction variable is initialized to -2.
+; CHECK-NEXT: [[PHIIV:%[^ ]+]] = phi i32 [ [[IVNEXT:%[^ ]+]], %loop0 ], [ -2, %bb ]
+; CHECK-NEXT: [[IVNEXT]] = add i32 [[PHIIV]], 1
+; CHECK-NEXT: br i1 false, label %loop0, label %bb0
  loop0:                                            ; preds = %loop0, %bb
    %i0 = phi i32 [ %i0.next, %loop0 ], [ 0, %bb ]  ; <i32> [#uses=2]
    %i0.next = add i32 %i0, 1                       ; <i32> [#uses=1]
@@ -47,18 +69,52 @@ loop0:                                            ; preds = %loop0, %bb
  bb0:                                              ; preds = %loop0
    br label %loop1
  
+; CHECK: loop1:
+; CHECK-NEXT: %i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ]
+; IVNEXT covers the uses of %i0 and %t0.
+; Therefore, %t0 has been removed.
+; The critical edge has been split.
+; CHECK-NEXT: br i1 false, label %bb2, label %[[LOOP1BB6:.+]]
  loop1:                                            ; preds = %bb5, %bb0
    %i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ]   ; <i32> [#uses=4]
    %t0 = add i32 %i0, %i1                          ; <i32> [#uses=1]
    br i1 false, label %bb2, label %bb6
  
+; CHECK: bb2:
+; Critical edge split.
+; CHECK-NEXT: br i1 true, label %[[BB2BB6:[^,]+]], label %bb5
  bb2:                                              ; preds = %loop1
    br i1 true, label %bb6, label %bb5
  
+; CHECK: bb5:
+; CHECK-NEXT: %i1.next = add i32 %i1, 1
+; CHECK-NEXT: br i1 true, label %[[BB5BB6:[^,]+]], label %loop1
  bb5:                                              ; preds = %bb2
    %i1.next = add i32 %i1, 1                       ; <i32> [#uses=1]
    br i1 true, label %bb6, label %loop1
  
+; bb5 to bb6 split basic block.
+; CHECK: [[BB5BB6]]:
+; CHECK-NEXT: [[INITIALVAL:%[^ ]+]] = add i32 [[IVNEXT]], %i1.next
+; CHECK-NEXT: br label %[[SPLITTOBB6:.+]]
+
+; bb2 to bb6 split basic block.
+; CHECK: [[BB2BB6]]:
+; CHECK-NEXT: br label %[[SPLITTOBB6]]
+
+; Split basic blocks to bb6.
+; CHECK: [[SPLITTOBB6]]:
+; CHECK-NEXT: [[INITP8:%[^ ]+]] = phi i32 [ [[INITIALVAL]], %[[BB5BB6]] ], [ undef, %[[BB2BB6]] ]
+; CHECK-NEXT: [[INITP9:%[^ ]+]] = phi i32 [ undef, %[[BB5BB6]] ], [ %i1, %[[BB2BB6]] ]
+; CHECK-NEXT: br label %bb6
+  
+; CHECK: [[LOOP1BB6]]:
+; CHECK-NEXT: br label %bb6
+
+; CHECK: bb6:
+; CHECK-NEXT: %p8 = phi i32 [ undef, %[[LOOP1BB6]] ], [ [[INITP8]], %[[SPLITTOBB6]] ]
+; CHECK-NEXT: %p9 = phi i32 [ %i1, %[[LOOP1BB6]] ], [ [[INITP9]], %[[SPLITTOBB6]] ]
+; CHECK-NEXT: unreachable
  bb6:                                              ; preds = %bb5, %bb2, %loop1
    %p8 = phi i32 [ %t0, %bb5 ], [ undef, %loop1 ], [ undef, %bb2 ] ; <i32> [#uses=0]
    %p9 = phi i32 [ undef, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; <i32> [#uses=0]
author	Quentin Colombet <qcolombet@apple.com>
	Fri, 31 May 2013 17:20:29 +0000 (17:20 +0000)
committer	Quentin Colombet <qcolombet@apple.com>
	Fri, 31 May 2013 17:20:29 +0000 (17:20 +0000)
lib/Transforms/Scalar/LoopStrengthReduce.cpp		patch \| blob \| history
test/CodeGen/X86/masked-iv-safe.ll		patch \| blob \| history
test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll		patch \| blob \| history
test/Transforms/LoopStrengthReduce/uglygep.ll		patch \| blob \| history