Swap loop invariant GEP with loop variant GEP to allow more LICM.

author Lawrence Hu <lawrence@codeaurora.org>

Wed, 23 Sep 2015 19:25:30 +0000 (19:25 +0000)

committer Lawrence Hu <lawrence@codeaurora.org>

Wed, 23 Sep 2015 19:25:30 +0000 (19:25 +0000)
author Lawrence Hu <lawrence@codeaurora.org>
Wed, 23 Sep 2015 19:25:30 +0000 (19:25 +0000)
committer Lawrence Hu <lawrence@codeaurora.org>
Wed, 23 Sep 2015 19:25:30 +0000 (19:25 +0000)
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

index bccd8164ed4ffd5179956656980d1a97286ccc21..44ca2b78b3801aa0ca74d02c31d1e594aedc56d8 100644 (file)
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -157,6 +157,9 @@
  //===----------------------------------------------------------------------===//
  
  #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Constants.h"
@@ -324,7 +327,9 @@ public:
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
      AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
    }
  
    bool doInitialization(Module &M) override {
@@ -395,10 +400,20 @@ private:
    /// Verify F is free of dead code.
    void verifyNoDeadCode(Function &F);
  
+  bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+  // Swap the index operand of two GEP.
+  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+  // Check if it is safe to swap operand of two GEP.
+  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+                            Loop *CurLoop);
+
    const DataLayout *DL;
    DominatorTree *DT;
    ScalarEvolution *SE;
    const TargetMachine *TM;
+
+  LoopInfo *LI;
+  TargetLibraryInfo *TLI;
    /// Whether to lower a GEP with multiple indices into arithmetic operations or
    /// multiple GEPs with a single index.
    bool LowerGEP;
@@ -414,6 +429,8 @@ INITIALIZE_PASS_BEGIN(
  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
  INITIALIZE_PASS_END(
      SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
      "Split GEPs to a variadic base and a constant offset for better CSE", false,
@@ -756,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
    Type *I8PtrTy =
        Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
    Value *ResultPtr = Variadic->getOperand(0);
+  Loop *L = LI->getLoopFor(Variadic->getParent());
+  // Check if the base is not loop invariant or used more than once.
+  bool isSwapCandidate =
+      L && L->isLoopInvariant(ResultPtr) &&
+      !hasMoreThanOneUseInLoop(ResultPtr, L);
+  Value *FirstResult = nullptr;
+
    if (ResultPtr->getType() != I8PtrTy)
      ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
  
@@ -784,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
        // Create an ugly GEP with a single index for each index.
        ResultPtr =
            Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+      if (FirstResult == nullptr)
+        FirstResult = ResultPtr;
      }
    }
  
@@ -792,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
      Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
      ResultPtr =
          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
-  }
+  } else
+    isSwapCandidate = false;
+
+  // If we created a GEP with constant index, and the base is loop invariant,
+  // then we swap the first one with it, so LICM can move constant GEP out
+  // later.
+  GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult);
+  GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
+  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+    swapGEPOperand(FirstGEP, SecondGEP);
+
    if (ResultPtr->getType() != Variadic->getType())
      ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
  
@@ -1036,16 +1072,15 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
  
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
    bool Changed = false;
    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
-    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;)
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
          Changed |= splitGEP(GEP);
-      }
-      // No need to split GEP ConstantExprs because all its indices are constant
-      // already.
-    }
+    // No need to split GEP ConstantExprs because all its indices are constant
+    // already.
    }
  
    Changed |= reuniteExts(F);
@@ -1138,3 +1173,93 @@ void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
      }
    }
  }
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+  if (!FirstGEP || !FirstGEP->hasOneUse())
+    return false;
+
+  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+    return false;
+
+  if (FirstGEP == SecondGEP)
+    return false;
+
+  unsigned FirstNum = FirstGEP->getNumOperands();
+  unsigned SecondNum = SecondGEP->getNumOperands();
+  // Give up if the number of operands are not 2.
+  if (FirstNum != SecondNum || FirstNum != 2)
+    return false;
+
+  Value *FirstBase = FirstGEP->getOperand(0);
+  Value *SecondBase = SecondGEP->getOperand(0);
+  Value *FirstOffset = FirstGEP->getOperand(1);
+  // Give up if the index of the first GEP is loop invariant.
+  if (CurLoop->isLoopInvariant(FirstOffset))
+    return false;
+
+  // Give up if base doesn't have same type.
+  if (FirstBase->getType() != SecondBase->getType())
+    return false;
+
+  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+  // Check if the second operand of first GEP has constant coefficient.
+  // For an example, for the following code,  we won't gain anything by
+  // hoisting the second GEP out because the second GEP can be folded away.
+  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+  //   %67 = shl i64 %scevgep.sum.ur159, 2
+  //   %uglygep160 = getelementptr i8* %65, i64 %67
+  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+  // Skip constant shift instruction which may be generated by Splitting GEPs.
+  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+      dyn_cast<ConstantInt>(FirstOffsetDef->getOperand(1)))
+    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+  // Give up if FirstOffsetDef is an Add or Sub with constant.
+  // Because it may not profitable at all due to constant folding.
+  if (FirstOffsetDef)
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+      unsigned opc = BO->getOpcode();
+      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+          (dyn_cast<ConstantInt>(BO->getOperand(0)) ||
+           dyn_cast<ConstantInt>(BO->getOperand(1))))
+        return false;
+    }
+  return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+  int UsesInLoop = 0;
+  for (User *U : V->users()) {
+    if (Instruction *User = dyn_cast<Instruction>(U))
+      if (L->contains(User))
+        if (++UsesInLoop > 1)
+          return true;
+  }
+  return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+                                                GetElementPtrInst *Second) {
+  Value *Offset1 = First->getOperand(1);
+  Value *Offset2 = Second->getOperand(1);
+  First->setOperand(1, Offset2);
+  Second->setOperand(1, Offset1);
+
+  // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
+  const DataLayout &DAL = First->getModule()->getDataLayout();
+  APInt Offset(DAL.getPointerSizeInBits(
+                   cast<PointerType>(First->getType())->getAddressSpace()),
+               0);
+  Value *NewBase =
+      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+  uint64_t ObjectSize;
+  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+     Offset.ugt(ObjectSize)) {
+    First->setIsInBounds(false);
+    Second->setIsInBounds(false);
+  } else
+    First->setIsInBounds(true);
+}
diff --git a/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll

new file mode 100644 (file)

index 0000000..8427799
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O3 -aarch64-gep-opt=true  -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s
+; REQUIRES: asserts
+target triple = "aarch64--linux-android"
+
+%typeD = type { i32, i32, [256 x i32], [257 x i32] }
+
+; Function Attrs: noreturn nounwind uwtable
+define i32 @test1(%typeD* nocapture %s) {
+entry:
+; CHECK-LABEL: entry:
+; CHECK:    %uglygep = getelementptr i8, i8* %0, i64 1032
+; CHECK:    br label %do.body.i
+
+
+  %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0
+  %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1
+  %.pre = load i32, i32* %tPos, align 4
+  br label %do.body.i
+
+do.body.i:
+; CHECK-LABEL: do.body.i:
+; CHECK:          %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3
+; CHECK-NEXT:     %4 = bitcast i8* %uglygep2 to i32*
+; CHECK-NOT:      %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032
+
+
+  %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ]
+  %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ]
+  %add.i = add nsw i32 %1, %0
+  %shr.i = ashr i32 %add.i, 1
+  %idxprom.i = sext i32 %shr.i to i64
+  %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i
+  %2 = load i32, i32* %arrayidx.i, align 4
+  %cmp.i = icmp sle i32 %2, %.pre
+  %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i
+  %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1
+  %sub.i = sub nsw i32 %na.1.i, %nb.1.i
+  %cmp1.i = icmp eq i32 %sub.i, 1
+  br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge
+
+do.body.i.backedge:
+  %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ]
+  %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ]
+  br label %do.body.i
+
+fooo.exit:                              ; preds = %do.body.i
+  store i32 %nb.1.i, i32* %k0, align 4
+  br label %do.body.i.backedge
+}
+
author	Lawrence Hu <lawrence@codeaurora.org>
	Wed, 23 Sep 2015 19:25:30 +0000 (19:25 +0000)
committer	Lawrence Hu <lawrence@codeaurora.org>
	Wed, 23 Sep 2015 19:25:30 +0000 (19:25 +0000)
lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp		patch \| blob \| history
test/CodeGen/AArch64/aarch64-loop-gep-opt.ll	[new file with mode: 0644]	patch \| blob