From 455a0bcfcce8a12503a3029d4d7c4469a5b0c74c Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Sun, 8 Nov 2015 08:04:40 +0000 Subject: [PATCH] [PowerPC] Fix LoopPreIncPrep not to depend on SCEV constant simplifications Under most circumstances, if SCEV can simplify X-Y to a constant, then it can also simplify Y-X to a constant. However, there is no guarantee that this is always true, and consensus is not to consider that a correctness bug in SCEV (although it is undesirable). PPCLoopPreIncPrep gathers pointers used to access memory (via loads, stores and prefetches) into buckets, where in each bucket the relative pointer offsets are constant. We used to keep each bucket as a multimap, where SCEV's subtraction operation was used to define the ordering predicate. Instead, use a fixed SCEV base expression for each bucket, record the constant offsets from that base expression, and adjust it later, if desirable, once all pointers have been collected. Doing it this way should be more compile-time efficient than the previous scheme (in addition to making the implementation less sensitive to SCEV simplification quirks). Fixes PR25170. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252417 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 114 ++++++++++++------ .../PowerPC/preincprep-nontrans-crash.ll | 94 +++++++++++++++ 2 files changed, 172 insertions(+), 36 deletions(-) create mode 100644 test/CodeGen/PowerPC/preincprep-nontrans-crash.ll diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index 61c08b75379..9a0d9c2e703 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -101,17 +101,20 @@ FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { } namespace { - struct SCEVLess : std::binary_function - { - SCEVLess(ScalarEvolution *SE) : SE(SE) {} + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - bool operator() (const SCEV *X, const SCEV *Y) const { - const SCEV *Diff = SE->getMinusSCEV(X, Y); - return cast(Diff)->getValue()->getSExtValue() < 0; - } + const SCEVConstant *Offset; + Instruction *Instr; + }; - protected: - ScalarEvolution *SE; + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector Elements; }; } @@ -169,7 +172,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { std::distance(pred_begin(Header), pred_end(Header)); // Collect buckets of comparable addresses used by loads and stores. 
- typedef std::multimap Bucket; SmallVector Buckets; for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) { @@ -212,25 +214,24 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { } bool FoundBucket = false; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) - for (Bucket::iterator K = Buckets[i].begin(), KE = Buckets[i].end(); - K != KE; ++K) { - const SCEV *Diff = SE->getMinusSCEV(K->first, LSCEV); - if (isa(Diff)) { - Buckets[i].insert(std::make_pair(LSCEV, MemI)); - FoundBucket = true; - break; - } + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; } + } if (!FoundBucket) { - Buckets.push_back(Bucket(SCEVLess(SE))); - Buckets[Buckets.size()-1].insert(std::make_pair(LSCEV, MemI)); + if (Buckets.size() == MaxVars) + return MadeChange; + Buckets.push_back(Bucket(LSCEV, MemI)); } } } - if (Buckets.empty() || Buckets.size() > MaxVars) + if (Buckets.empty()) return MadeChange; BasicBlock *LoopPredecessor = L->getLoopPredecessor(); @@ -253,8 +254,45 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // The base address of each bucket is transformed into a phi and the others // are rewritten as offsets of that variable. + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. 
+ for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast(Buckets[i].Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!Buckets[i].Elements[j].Offset || + Buckets[i].Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = Buckets[i].Elements[j].Offset; + Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); + for (auto &E : Buckets[i].Elements) { + if (E.Offset) + E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast(SE->getNegativeSCEV(Offset)); + } + + std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); + break; + } + const SCEVAddRecExpr *BasePtrSCEV = - cast(Buckets[i].begin()->first); + cast(Buckets[i].BaseSCEV); if (!BasePtrSCEV->isAffine()) continue; @@ -262,7 +300,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); - Instruction *MemI = Buckets[i].begin()->second; + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. + Instruction *MemI = Buckets[i].Elements.begin()->Instr; Value *BasePtr = GetPointerOperand(MemI); assert(BasePtr && "No pointer operand"); @@ -327,18 +367,20 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { BasePtr->replaceAllUsesWith(NewBasePtr); RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - Value *LastNewPtr = NewBasePtr; - for (Bucket::iterator I = std::next(Buckets[i].begin()), - IE = Buckets[i].end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->second); + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. 
+ SmallPtrSet NewPtrs; + NewPtrs.insert( NewBasePtr); + + for (auto I = std::next(Buckets[i].Elements.begin()), + IE = Buckets[i].Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); assert(Ptr && "No pointer operand"); - if (Ptr == LastNewPtr) + if (NewPtrs.count(Ptr)) continue; Instruction *RealNewPtr; - const SCEVConstant *Diff = - cast(SE->getMinusSCEV(I->first, BasePtrSCEV)); - if (Diff->isZero()) { + if (!I->Offset || I->Offset->getValue()->isZero()) { RealNewPtr = NewBasePtr; } else { Instruction *PtrIP = dyn_cast(Ptr); @@ -348,11 +390,11 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { else if (isa(PtrIP)) PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); else if (!PtrIP) - PtrIP = I->second; + PtrIP = I->Instr; GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, Diff->getValue(), - I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP); + I8Ty, PtrInc, I->Offset->getValue(), + I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); if (!PtrIP) NewPtr->insertAfter(cast(PtrInc)); NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); @@ -373,7 +415,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { Ptr->replaceAllUsesWith(ReplNewPtr); RecursivelyDeleteTriviallyDeadInstructions(Ptr); - LastNewPtr = RealNewPtr; + NewPtrs.insert(RealNewPtr); } MadeChange = true; diff --git a/test/CodeGen/PowerPC/preincprep-nontrans-crash.ll b/test/CodeGen/PowerPC/preincprep-nontrans-crash.ll new file mode 100644 index 00000000000..cfec302d469 --- /dev/null +++ b/test/CodeGen/PowerPC/preincprep-nontrans-crash.ll @@ -0,0 +1,94 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-p:64:64-i64:64-n32:64" +target triple = "powerpc64le-linux" + +%struct.BSS1.0.9.28.39.43.46.47.54.56.57.64.65.69.71.144 = type <{ [220 x i8] }> + +@.BSS1 = external unnamed_addr global %struct.BSS1.0.9.28.39.43.46.47.54.56.57.64.65.69.71.144, align 32 + +; Function Attrs: noinline nounwind +define void @ety2_() #0 { + +; This test case used to crash 
because the preinc prep pass would assume that +; if X-Y could be simplified to a constant, then so could Y-X. While not +; desirable, we cannot actually make this guarantee. +; CHECK-LABEL: @ety2_ + +L.entry: + %0 = load i32, i32* undef, align 4 + %1 = sext i32 %0 to i64 + %2 = shl nsw i64 %1, 3 + %3 = add nsw i64 %2, 8 + br label %L.LB1_425 + +L.LB1_425: ; preds = %L.LB1_427, %L.entry + %4 = phi i64 [ %21, %L.LB1_427 ], [ undef, %L.entry ] + br i1 undef, label %L.LB1_427, label %L.LB1_816 + +L.LB1_816: ; preds = %L.LB1_425 + switch i32 undef, label %L.LB1_432 [ + i32 30, label %L.LB1_805 + i32 10, label %L.LB1_451 + i32 20, label %L.LB1_451 + ] + +L.LB1_451: ; preds = %L.LB1_816, %L.LB1_816 + unreachable + +L.LB1_432: ; preds = %L.LB1_816 + %.in.31 = lshr i64 %4, 32 + %5 = trunc i64 %.in.31 to i32 + br i1 undef, label %L.LB1_769, label %L.LB1_455 + +L.LB1_455: ; preds = %L.LB1_432 + unreachable + +L.LB1_769: ; preds = %L.LB1_432 + %6 = sext i32 %5 to i64 + %7 = add nsw i64 %6, 2 + %8 = add nsw i64 %6, -1 + %9 = mul i64 %8, %1 + %10 = add i64 %9, %7 + %11 = shl i64 %10, 3 + %12 = getelementptr i8, i8* undef, i64 %11 + %13 = mul nsw i64 %6, %1 + %14 = add i64 %7, %13 + %15 = shl i64 %14, 3 + %16 = getelementptr i8, i8* undef, i64 %15 + br i1 undef, label %L.LB1_662, label %L.LB1_662.prol + +L.LB1_662.prol: ; preds = %L.LB1_662.prol, %L.LB1_769 + %indvars.iv.next20.prol = add nuw nsw i64 undef, 1 + br i1 undef, label %L.LB1_662, label %L.LB1_662.prol + +L.LB1_662: ; preds = %L.LB1_437.2, %L.LB1_662.prol, %L.LB1_769 + %indvars.iv19 = phi i64 [ %indvars.iv.next20.3, %L.LB1_437.2 ], [ 0, %L.LB1_769 ], [ %indvars.iv.next20.prol, %L.LB1_662.prol ] + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %17 = mul i64 %indvars.iv.next20, %3 + %18 = getelementptr i8, i8* %16, i64 %17 + %19 = bitcast i8* %18 to double* + store double 0.000000e+00, double* %19, align 8 + %indvars.iv.next20.1 = add nsw i64 %indvars.iv19, 2 + %20 = mul i64 %indvars.iv.next20.1, %3 + br i1 
undef, label %L.LB1_437.2, label %L.LB1_824.2 + +L.LB1_427: ; preds = %L.LB1_425 + %21 = load i64, i64* bitcast (i8* getelementptr inbounds (%struct.BSS1.0.9.28.39.43.46.47.54.56.57.64.65.69.71.144, %struct.BSS1.0.9.28.39.43.46.47.54.56.57.64.65.69.71.144* @.BSS1, i64 0, i32 0, i64 8) to i64*), align 8 + br label %L.LB1_425 + +L.LB1_805: ; preds = %L.LB1_816 + ret void + +L.LB1_824.2: ; preds = %L.LB1_662 + %22 = getelementptr i8, i8* %12, i64 %20 + %23 = bitcast i8* %22 to double* + store double 0.000000e+00, double* %23, align 8 + br label %L.LB1_437.2 + +L.LB1_437.2: ; preds = %L.LB1_824.2, %L.LB1_662 + %indvars.iv.next20.3 = add nsw i64 %indvars.iv19, 4 + br label %L.LB1_662 +} + +attributes #0 = { noinline nounwind } + -- 2.34.1