From c4aff77ec0ab43d21c93b20070b9edc517b8a2e1 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Fri, 14 Aug 2015 02:02:05 +0000 Subject: [PATCH] [SeparateConstOffsetFromGEP] sext(a)+sext(b) => sext(a+b) when a+b can't sign-overflow. Summary: This patch implements my promised optimization to reunites certain sexts from operands after we extract the constant offset. See the header comment of reuniteExts for its motivation. One key building block that enables this optimization is Bjarke's poison value analysis (D11212). That helps to prove "a +nsw b" can't overflow. Reviewers: broune Subscribers: jholewinski, sanjoy, llvm-commits Differential Revision: http://reviews.llvm.org/D12016 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245003 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 105 +++++++++++++++++- .../NVPTX/split-gep-and-gvn.ll | 38 +++++++ 2 files changed, 138 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index ad86c8c2828..97950caa3de 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -164,6 +165,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Operator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -174,6 +176,7 @@ #include "llvm/IR/IRBuilder.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), @@ -319,6 +322,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.setPreservesCFG(); } @@ -373,15 +377,32 @@ private: /// /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow. + /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting + /// the constant offset. After extraction, it becomes desirable to reunion the + /// distributed sexts. For example, + /// + /// &a[sext(i +nsw (j +nsw 5)] + /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)] + /// => constant extraction &a[sext(i) + sext(j)] + 5 + /// => reunion &a[sext(i +nsw j)] + 5 + bool reuniteExts(Function &F); + /// A helper that reunites sexts in an instruction. + bool reuniteExts(Instruction *I); + /// Find the closest dominator of that is equivalent to . + Instruction *findClosestMatchingDominator(const SCEV *Key, + Instruction *Dominatee); /// Verify F is free of dead code. void verifyNoDeadCode(Function &F); const DataLayout *DL; - const DominatorTree *DT; + DominatorTree *DT; + ScalarEvolution *SE; const TargetMachine *TM; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. bool LowerGEP; + DenseMap> DominatingExprs; }; } // anonymous namespace @@ -391,6 +412,7 @@ INITIALIZE_PASS_BEGIN( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", @@ -891,13 +913,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Clear the inbounds attribute because the new index may be off-bound. // e.g., // - // b = add i64 a, 5 - // addr = gep inbounds float, float* p, i64 b + // b = add i64 a, 5 + // addr = gep inbounds float, float* p, i64 b // // is transformed to: // - // addr2 = gep float, float* p, i64 a ; inbounds removed - // addr = gep inbounds float, float* addr2, i64 5 + // addr2 = gep float, float* p, i64 a ; inbounds removed + // addr = gep inbounds float, float* addr2, i64 5 // // If a is -4, although the old index b is in bounds, the new index a is // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the @@ -1013,6 +1035,7 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { return false; DT = &getAnalysis().getDomTree(); + SE = &getAnalysis(); bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { @@ -1025,12 +1048,84 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { } } + Changed |= reuniteExts(F); + if (VerifyNoDeadCode) verifyNoDeadCode(F); return Changed; } +Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator( + const SCEV *Key, Instruction *Dominatee) { + auto Pos = DominatingExprs.find(Key); + if (Pos == DominatingExprs.end()) + return nullptr; + + auto &Candidates = Pos->second; + // Because we process the basic blocks in pre-order of the dominator tree, a + // candidate that doesn't dominate the current instruction won't dominate any + // future instruction either. Therefore, we pop it out of the stack. This + // optimization makes the algorithm O(n). + while (!Candidates.empty()) { + Instruction *Candidate = Candidates.back(); + if (DT->dominates(Candidate, Dominatee)) + return Candidate; + Candidates.pop_back(); + } + return nullptr; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { + if (!SE->isSCEVable(I->getType())) + return false; + + // Dom: LHS+RHS + // I: sext(LHS)+sext(RHS) + // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom). + // TODO: handle zext + Value *LHS = nullptr, *RHS = nullptr; + if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) || + match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) { + if (LHS->getType() == RHS->getType()) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + if (auto *Dom = findClosestMatchingDominator(Key, I)) { + Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + NewSExt->takeName(I); + I->replaceAllUsesWith(NewSExt); + RecursivelyDeleteTriviallyDeadInstructions(I); + return true; + } + } + } + + // Add I to DominatingExprs if it's an add/sub that can't sign overflow. + if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || + match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { + if (isKnownNotFullPoison(I)) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + DominatingExprs[Key].push_back(I); + } + } + return false; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { + bool Changed = false; + DominatingExprs.clear(); + for (auto Node = GraphTraits::nodes_begin(DT); + Node != GraphTraits::nodes_end(DT); ++Node) { + BasicBlock *BB = Node->getBlock(); + for (auto I = BB->begin(); I != BB->end(); ) { + Instruction *Cur = I++; + Changed |= reuniteExts(Cur); + } + } + return Changed; +} + void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { for (auto &B : F) { for (auto &I : B) { diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll index a0410024f6e..e7b3545839c 100644 --- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -194,3 +194,41 @@ define void @sum_of_array4(i32 %x, i32 %y, float* nocapture %output) { ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i64 1 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i64 32 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i64 33 + + +; The source code is: +; p0 = &input[sext(x + y)]; +; p1 = &input[sext(x + (y + 5))]; +; +; Without reuniting extensions, SeparateConstOffsetFromGEP would emit +; p0 = &input[sext(x + y)]; +; t1 = &input[sext(x) + sext(y)]; +; p1 = &t1[5]; +; +; With reuniting extensions, it merges p0 and t1 and thus emits +; p0 = &input[sext(x + y)]; +; p1 = &p0[5]; +define void @reunion(i32 %x, i32 %y, float* %input) { +; IR-LABEL: @reunion( +; PTX-LABEL: reunion( +entry: + %xy = add nsw i32 %x, %y + %0 = sext i32 %xy to i64 + %p0 = getelementptr inbounds float, float* %input, i64 %0 + %v0 = load float, float* %p0, align 4 +; PTX: ld.f32 %f{{[0-9]+}}, {{\[}}[[p0:%rd[0-9]+]]{{\]}} + call void @use(float %v0) + + %y5 = add nsw i32 %y, 5 + %xy5 = add nsw i32 %x, %y5 + %1 = sext i32 %xy5 to i64 + %p1 = getelementptr inbounds float, float* %input, i64 %1 +; IR: getelementptr inbounds float, float* %p0, i64 5 + %v1 = load float, float* %p1, align 4 +; PTX: ld.f32 %f{{[0-9]+}}, {{\[}}[[p0]]+20{{\]}} + call void @use(float %v1) + + ret void +} + +declare void @use(float) -- 2.34.1