X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FCodeGen%2FCodeGenPrepare.cpp;h=ac8fbbf9c762fd7582809a43c1025b867b7e4051;hp=e043bfbfa270229a527da16d5ce3099ba5e09b4a;hb=70b36591bc9e9f0772c62794bf1a202b1884aaf6;hpb=f3ac16dcb4749fb8d75a0de24d4e6240f1bfdd77 diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index e043bfbfa27..ac8fbbf9c76 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -15,10 +15,16 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -28,23 +34,27 @@ #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Statepoint.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -61,6 +71,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " "computations were sunk"); STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); +STATISTIC(NumAndsAdded, + "Number of and mask instructions added to form ext loads"); +STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); @@ -71,6 +84,10 @@ static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); +static cl::opt + DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false), + cl::desc("Disable GC optimizations in CodeGenPrepare")); + static cl::opt DisableSelectToBranch( "disable-cgp-select2branch", cl::Hidden, cl::init(false), cl::desc("Disable select to branch conversion.")); @@ -91,27 +108,30 @@ static cl::opt StressStoreExtract( "stress-cgp-store-extract", cl::Hidden, cl::init(false), cl::desc("Stress test store(extract) optimizations in CodeGenPrepare")); +static cl::opt DisableExtLdPromotion( + "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), + cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in " + 
"CodeGenPrepare")); + +static cl::opt StressExtLdPromotion( + "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), + cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) " + "optimization in CodeGenPrepare")); + namespace { typedef SmallPtrSet SetOfInstrs; -struct TypeIsSExt { - Type *Ty; - bool IsSExt; - TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {} -}; +typedef PointerIntPair TypeIsSExt; typedef DenseMap InstrToOrigTy; +class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. const TargetMachine *TM; const TargetLowering *TLI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; - DominatorTree *DT; - /// CurInstIterator - As we scan instructions optimizing them, this is the - /// next instruction to optimize. Xforms that can invalidate this should - /// update it. + /// As we scan instructions optimizing them, this is the next instruction + /// to optimize. Transforms that can invalidate this should update it. BasicBlock::iterator CurInstIterator; /// Keeps track of non-local addresses that have been sunk into a block. @@ -119,23 +139,30 @@ typedef DenseMap InstrToOrigTy; /// multiple load/stores of the same address. ValueMap SunkAddrs; - /// Keeps track of all truncates inserted for the current function. - SetOfInstrs InsertedTruncsSet; + /// Keeps track of all instructions inserted for the current function. + SetOfInstrs InsertedInsts; /// Keeps track of the type of the related instruction before their /// promotion for the current function. InstrToOrigTy PromotedInsts; - /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to - /// be updated. + /// True if CFG is modified in any way. bool ModifiedDT; - /// OptSize - True if optimizing for size. + /// True if optimizing for size. bool OptSize; + /// DataLayout for the Function being processed. + const DataLayout *DL; + + // XXX-comment:We need DominatorTree to figure out which instruction to + // taint. 
+ DominatorTree *DT; + public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) { + : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr), + DT(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -144,77 +171,1090 @@ typedef DenseMap InstrToOrigTy; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); } private: - bool EliminateFallThrough(Function &F); - bool EliminateMostlyEmptyBlocks(Function &F); - bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; - void EliminateMostlyEmptyBlock(BasicBlock *BB); - bool OptimizeBlock(BasicBlock &BB); - bool OptimizeInst(Instruction *I); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); - bool OptimizeInlineAsmInst(CallInst *CS); - bool OptimizeCallInst(CallInst *CI); - bool MoveExtToFormExtLoad(Instruction *I); - bool OptimizeExtUses(Instruction *I); - bool OptimizeSelectInst(SelectInst *SI); - bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); - bool OptimizeExtractElementInst(Instruction *Inst); - bool DupRetToEnableTailCallOpts(BasicBlock *BB); - bool PlaceDbgValues(Function &F); + bool eliminateFallThrough(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F); + bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; + void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); + bool optimizeInst(Instruction *I, bool& ModifiedDT); + bool optimizeMemoryInst(Instruction *I, Value *Addr, + Type *AccessTy, unsigned AS); + bool optimizeInlineAsmInst(CallInst *CS); + bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); + bool moveExtToFormExtLoad(Instruction *&I); + bool optimizeExtUses(Instruction *I); + bool optimizeLoadExt(LoadInst *I); + bool optimizeSelectInst(SelectInst *SI); + bool optimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool optimizeSwitchInst(SwitchInst *CI); + bool optimizeExtractElementInst(Instruction *Inst); + bool dupRetToEnableTailCallOpts(BasicBlock *BB); + bool placeDbgValues(Function &F); bool sinkAndCmp(Function &F); + bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, + Instruction *&Inst, + const SmallVectorImpl &Exts, + unsigned CreatedInstCost); bool splitBranchCondition(Function &F); + bool simplifyOffsetableRelocate(Instruction &I); + void stripInvariantGroupMetadata(Instruction &I); }; } char CodeGenPrepare::ID = 0; -INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare", +INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { return new CodeGenPrepare(TM); } +namespace { + +bool StoreAddressDependOnValue(StoreInst* SI, Value* DepVal); +Value* GetUntaintedAddress(Value* CurrentAddress); + +// The depth we trace down a variable to look for its dependence set. +const unsigned kDependenceDepth = 4; + +// Recursively looks for variables that 'Val' depends on at the given depth +// 'Depth', and adds them in 'DepSet'. 
If 'InsertOnlyLeafNodes' is true, only +// inserts the leaf node values; otherwise, all visited nodes are included in +// 'DepSet'. Note that constants will be ignored. +template +void recursivelyFindDependence(SetType* DepSet, Value* Val, + bool InsertOnlyLeafNodes = false, + unsigned Depth = kDependenceDepth) { + if (Val == nullptr) { + return; + } + if (!InsertOnlyLeafNodes && !isa(Val)) { + DepSet->insert(Val); + } + if (Depth == 0) { + // Cannot go deeper. Insert the leaf nodes. + if (InsertOnlyLeafNodes && !isa(Val)) { + DepSet->insert(Val); + } + return; + } + + // Go one step further to explore the dependence of the operands. + Instruction* I = nullptr; + if ((I = dyn_cast(Val))) { + if (isa(I)) { + // A load is considerd the leaf load of the dependence tree. Done. + DepSet->insert(Val); + return; + } else if (I->isBinaryOp()) { + BinaryOperator* I = dyn_cast(Val); + Value *Op0 = I->getOperand(0), *Op1 = I->getOperand(1); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, Depth - 1); + } else if (I->isCast()) { + Value* Op0 = I->getOperand(0); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1); + } else if (I->getOpcode() == Instruction::Select) { + Value* Op0 = I->getOperand(0); + Value* Op1 = I->getOperand(1); + Value* Op2 = I->getOperand(2); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, Op2, InsertOnlyLeafNodes, Depth - 1); + } else if (I->getOpcode() == Instruction::GetElementPtr) { + for (unsigned i = 0; i < I->getNumOperands(); i++) { + recursivelyFindDependence(DepSet, I->getOperand(i), InsertOnlyLeafNodes, + Depth - 1); + } + } else if (I->getOpcode() == Instruction::Store) { + auto* SI = dyn_cast(Val); + recursivelyFindDependence(DepSet, SI->getPointerOperand(), + InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, SI->getValueOperand(), + InsertOnlyLeafNodes, Depth - 1); + } else { + Value* Op0 = nullptr; + Value* Op1 = nullptr; + switch (I->getOpcode()) { + case Instruction::ICmp: + case Instruction::FCmp: { + Op0 = I->getOperand(0); + Op1 = I->getOperand(1); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, + Depth - 1); + recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, + Depth - 1); + break; + } + case Instruction::PHI: { + for (int i = 0; i < I->getNumOperands(); i++) { + auto* op = I->getOperand(i); + if (DepSet->count(op) == 0) { + recursivelyFindDependence(DepSet, I->getOperand(i), + InsertOnlyLeafNodes, Depth - 1); + } + } + break; + } + default: { + // Be conservative. Add it and be done with it. + DepSet->insert(Val); + return; + } + } + } + } else if (isa(Val)) { + // Not interested in constant values. Done. + return; + } else { + // Be conservative. Add it and be done with it. + DepSet->insert(Val); + return; + } +} + +// Helper function to create a Cast instruction. 
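+// The cast opcode is chosen from the type of 'DepVal': SExt for integer
+// values, FPToSI for float/double, PtrToInt for pointers, and a plain BitCast
+// otherwise, so that 'DepVal' can be folded into the integer arithmetic used
+// for tainting.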
+Value* createCast(IRBuilder& Builder, Value* DepVal, + Type* TargetIntegerType) { + Instruction::CastOps CastOp = Instruction::BitCast; + switch (DepVal->getType()->getTypeID()) { + case Type::IntegerTyID: { + CastOp = Instruction::SExt; + break; + } + case Type::FloatTyID: + case Type::DoubleTyID: { + CastOp = Instruction::FPToSI; + break; + } + case Type::PointerTyID: { + CastOp = Instruction::PtrToInt; + break; + } + default: { break; } + } + + return Builder.CreateCast(CastOp, DepVal, TargetIntegerType); +} + +// Given a value, if it's a tainted address, this function returns the +// instruction that ORs the "dependence value" with the "original address". +// Otherwise, returns nullptr. This instruction is the first OR instruction +// where one of its operand is an AND instruction with an operand being 0. +// +// E.g., it returns '%4 = or i32 %3, %2' given 'CurrentAddress' is '%5'. +// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +Instruction* getOrAddress(Value* CurrentAddress) { + // Is it a cast from integer to pointer type. + Instruction* OrAddress = nullptr; + Instruction* AndDep = nullptr; + Instruction* CastToInt = nullptr; + Value* ActualAddress = nullptr; + Constant* ZeroConst = nullptr; + + const Instruction* CastToPtr = dyn_cast(CurrentAddress); + if (CastToPtr && CastToPtr->getOpcode() == Instruction::IntToPtr) { + // Is it an OR instruction: %1 = or %and, %actualAddress. + if ((OrAddress = dyn_cast(CastToPtr->getOperand(0))) && + OrAddress->getOpcode() == Instruction::Or) { + // The first operand should be and AND instruction. + AndDep = dyn_cast(OrAddress->getOperand(0)); + if (AndDep && AndDep->getOpcode() == Instruction::And) { + // Also make sure its first operand of the "AND" is 0, or the "AND" is + // marked explicitly by "NoInstCombine". + if ((ZeroConst = dyn_cast(AndDep->getOperand(1))) && + ZeroConst->isNullValue()) { + return OrAddress; + } + } + } + } + // Looks like it's not been tainted. + return nullptr; +} + +// Given a value, if it's a tainted address, this function returns the +// instruction that taints the "dependence value". Otherwise, returns nullptr. +// This instruction is the last AND instruction where one of its operand is 0. +// E.g., it returns '%3' given 'CurrentAddress' is '%5'. +// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +Instruction* getAndDependence(Value* CurrentAddress) { + // If 'CurrentAddress' is tainted, get the OR instruction. + auto* OrAddress = getOrAddress(CurrentAddress); + if (OrAddress == nullptr) { + return nullptr; + } + + // No need to check the operands. + auto* AndDepInst = dyn_cast(OrAddress->getOperand(0)); + assert(AndDepInst); + return AndDepInst; +} + +// Given a value, if it's a tainted address, this function returns +// the "dependence value", which is the first operand in the AND instruction. +// E.g., it returns '%1' given 'CurrentAddress' is '%5'. 
+// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +Value* getDependence(Value* CurrentAddress) { + auto* AndInst = getAndDependence(CurrentAddress); + if (AndInst == nullptr) { + return nullptr; + } + return AndInst->getOperand(0); +} + +// Given an address that has been tainted, returns the only condition it depends +// on, if any; otherwise, returns nullptr. +Value* getConditionDependence(Value* Address) { + auto* Dep = getDependence(Address); + if (Dep == nullptr) { + // 'Address' has not been dependence-tainted. + return nullptr; + } + + Value* Operand = Dep; + while (true) { + auto* Inst = dyn_cast(Operand); + if (Inst == nullptr) { + // Non-instruction type does not have condition dependence. + return nullptr; + } + if (Inst->getOpcode() == Instruction::ICmp) { + return Inst; + } else { + if (Inst->getNumOperands() != 1) { + return nullptr; + } else { + Operand = Inst->getOperand(0); + } + } + } +} + +// Conservatively decides whether the dependence set of 'Val1' includes the +// dependence set of 'Val2'. If 'ExpandSecondValue' is false, we do not expand +// 'Val2' and use that single value as its dependence set. +// If it returns true, it means the dependence set of 'Val1' includes that of +// 'Val2'; otherwise, it only means we cannot conclusively decide it. +bool dependenceSetInclusion(Value* Val1, Value* Val2, + int Val1ExpandLevel = 2 * kDependenceDepth, + int Val2ExpandLevel = kDependenceDepth) { + typedef SmallSet IncludingSet; + typedef SmallSet IncludedSet; + + IncludingSet DepSet1; + IncludedSet DepSet2; + // Look for more depths for the including set. + recursivelyFindDependence(&DepSet1, Val1, false /*Insert all visited nodes*/, + Val1ExpandLevel); + recursivelyFindDependence(&DepSet2, Val2, true /*Only insert leaf nodes*/, + Val2ExpandLevel); + + auto set_inclusion = [](IncludingSet FullSet, IncludedSet Subset) { + for (auto* Dep : Subset) { + if (0 == FullSet.count(Dep)) { + return false; + } + } + return true; + }; + bool inclusion = set_inclusion(DepSet1, DepSet2); + DEBUG(dbgs() << "[dependenceSetInclusion]: " << inclusion << "\n"); + DEBUG(dbgs() << "Including set for: " << *Val1 << "\n"); + DEBUG(for (const auto* Dep : DepSet1) { dbgs() << "\t\t" << *Dep << "\n"; }); + DEBUG(dbgs() << "Included set for: " << *Val2 << "\n"); + DEBUG(for (const auto* Dep : DepSet2) { dbgs() << "\t\t" << *Dep << "\n"; }); + + return inclusion; +} + +// Recursively iterates through the operands spawned from 'DepVal'. If there +// exists a single value that 'DepVal' only depends on, we call that value the +// root dependence of 'DepVal' and return it. Otherwise, return 'DepVal'. +Value* getRootDependence(Value* DepVal) { + SmallSet DepSet; + for (unsigned depth = kDependenceDepth; depth > 0; --depth) { + recursivelyFindDependence(&DepSet, DepVal, true /*Only insert leaf nodes*/, + depth); + if (DepSet.size() == 1) { + return *DepSet.begin(); + } + DepSet.clear(); + } + return DepVal; +} + +// This function actually taints 'DepVal' to the address to 'SI'. If the +// address +// of 'SI' already depends on whatever 'DepVal' depends on, this function +// doesn't do anything and returns false. Otherwise, returns true. +// +// This effect forces the store and any stores that comes later to depend on +// 'DepVal'. 
For example, we have a condition "cond", and a store instruction +// "s: STORE addr, val". If we want "s" (and any later store) to depend on +// "cond", we do the following: +// %conv = sext i1 %cond to i32 +// %addrVal = ptrtoint i32* %addr to i32 +// %andCond = and i32 conv, 0; +// %orAddr = or i32 %andCond, %addrVal; +// %NewAddr = inttoptr i32 %orAddr to i32*; +// +// This is a more concrete example: +// ------ +// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +bool taintStoreAddress(StoreInst* SI, Value* DepVal) { + // Set the insertion point right after the 'DepVal'. + Instruction* Inst = nullptr; + IRBuilder Builder(SI); + BasicBlock* BB = SI->getParent(); + Value* Address = SI->getPointerOperand(); + Type* TargetIntegerType = + IntegerType::get(Address->getContext(), + BB->getModule()->getDataLayout().getPointerSizeInBits()); + + // Does SI's address already depends on whatever 'DepVal' depends on? + if (StoreAddressDependOnValue(SI, DepVal)) { + return false; + } + + // Figure out if there's a root variable 'DepVal' depends on. For example, we + // can extract "getelementptr inbounds %struct, %struct* %0, i64 0, i32 123" + // to be "%struct* %0" since all other operands are constant. + auto* RootVal = getRootDependence(DepVal); + auto* RootInst = dyn_cast(RootVal); + auto* DepValInst = dyn_cast(DepVal); + if (RootInst && DepValInst && + RootInst->getParent() == DepValInst->getParent()) { + DepVal = RootVal; + } + + // Is this already a dependence-tainted store? + Value* OldDep = getDependence(Address); + if (OldDep) { + // The address of 'SI' has already been tainted. Just need to absorb the + // DepVal to the existing dependence in the address of SI. + Instruction* AndDep = getAndDependence(Address); + IRBuilder Builder(AndDep); + Value* NewDep = nullptr; + if (DepVal->getType() == AndDep->getType()) { + NewDep = Builder.CreateAnd(OldDep, DepVal); + } else { + NewDep = Builder.CreateAnd( + OldDep, createCast(Builder, DepVal, TargetIntegerType)); + } + + auto* NewDepInst = dyn_cast(NewDep); + + // Use the new AND instruction as the dependence + AndDep->setOperand(0, NewDep); + return true; + } + + // SI's address has not been tainted. Now taint it with 'DepVal'. + Value* CastDepToInt = createCast(Builder, DepVal, TargetIntegerType); + Value* PtrToIntCast = Builder.CreatePtrToInt(Address, TargetIntegerType); + Value* AndDepVal = + Builder.CreateAnd(CastDepToInt, ConstantInt::get(TargetIntegerType, 0)); + auto AndInst = dyn_cast(AndDepVal); + // XXX-comment: The original IR InstCombiner would change our and instruction + // to a select and then the back end optimize the condition out. We attach a + // flag to instructions and set it here to inform the InstCombiner to not to + // touch this and instruction at all. + Value* OrAddr = Builder.CreateOr(AndDepVal, PtrToIntCast); + Value* NewAddr = Builder.CreateIntToPtr(OrAddr, Address->getType()); + + DEBUG(dbgs() << "[taintStoreAddress]\n" + << "Original store: " << *SI << '\n'); + SI->setOperand(1, NewAddr); + + // Debug output. 
+ DEBUG(dbgs() << "\tTargetIntegerType: " << *TargetIntegerType << '\n' + << "\tCast dependence value to integer: " << *CastDepToInt + << '\n' + << "\tCast address to integer: " << *PtrToIntCast << '\n' + << "\tAnd dependence value: " << *AndDepVal << '\n' + << "\tOr address: " << *OrAddr << '\n' + << "\tCast or instruction to address: " << *NewAddr << "\n\n"); + + return true; +} + +// Looks for the previous store in the if block --- 'BrBB', which makes the +// speculative store 'StoreToHoist' safe. +Value* getSpeculativeStoreInPrevBB(StoreInst* StoreToHoist, BasicBlock* BrBB) { + assert(StoreToHoist && "StoreToHoist must be a real store"); + + Value* StorePtr = StoreToHoist->getPointerOperand(); + + // Look for a store to the same pointer in BrBB. + for (BasicBlock::reverse_iterator RI = BrBB->rbegin(), RE = BrBB->rend(); + RI != RE; ++RI) { + Instruction* CurI = &*RI; + + StoreInst* SI = dyn_cast(CurI); + // Found the previous store make sure it stores to the same location. + // XXX-update: If the previous store's original untainted address are the + // same as 'StorePtr', we are also good to hoist the store. + if (SI && (SI->getPointerOperand() == StorePtr || + GetUntaintedAddress(SI->getPointerOperand()) == StorePtr)) { + // Found the previous store, return its value operand. + return SI; + } + } + + assert(false && + "We should not reach here since this store is safe to speculate"); +} + +// XXX-comment: Returns true if it changes the code, false otherwise (the branch +// condition already depends on 'DepVal'. +bool taintConditionalBranch(BranchInst* BI, Value* DepVal) { + assert(BI->isConditional()); + auto* Cond = BI->getOperand(0); + if (dependenceSetInclusion(Cond, DepVal)) { + // The dependence/ordering is self-evident. + return false; + } + + IRBuilder Builder(BI); + auto* AndDep = + Builder.CreateAnd(DepVal, ConstantInt::get(DepVal->getType(), 0)); + auto* TruncAndDep = + Builder.CreateTrunc(AndDep, IntegerType::get(DepVal->getContext(), 1)); + auto* OrCond = Builder.CreateOr(TruncAndDep, Cond); + BI->setOperand(0, OrCond); + + // Debug output. + DEBUG(dbgs() << "\tTainted branch condition:\n" << *BI->getParent()); + + return true; +} + +bool ConditionalBranchDependsOnValue(BranchInst* BI, Value* DepVal) { + assert(BI->isConditional()); + auto* Cond = BI->getOperand(0); + return dependenceSetInclusion(Cond, DepVal); +} + +// XXX-update: For a relaxed load 'LI', find the first immediate atomic store or +// the first conditional branch. Returns nullptr if there's no such immediately +// following store/branch instructions, which we can only enforce the load with +// 'acquire'. 'ChainedBB' contains all the blocks chained together with +// unconditional branches from 'BB' to the block with the first store/cond +// branch. +template +Instruction* findFirstStoreCondBranchInst(LoadInst* LI, Vector* ChainedBB) { + // In some situations, relaxed loads can be left as is: + // 1. The relaxed load is used to calculate the address of the immediate + // following store; + // 2. The relaxed load is used as a condition in the immediate following + // condition, and there are no stores in between. This is actually quite + // common. E.g., + // int r1 = x.load(relaxed); + // if (r1 != 0) { + // y.store(1, relaxed); + // } + // However, in this function, we don't deal with them directly. Instead, we + // just find the immediate following store/condition branch and return it. 
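+  // In the example above, the first such instruction after the load is the
+  // conditional branch on 'r1', so that branch is what gets returned.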
+ + assert(ChainedBB != nullptr && "Chained BB should not be nullptr"); + auto* BB = LI->getParent(); + ChainedBB->push_back(BB); + auto BE = BB->end(); + auto BBI = BasicBlock::iterator(LI); + BBI++; + while (true) { + for (; BBI != BE; BBI++) { + auto* Inst = dyn_cast(&*BBI); + if (Inst == nullptr) { + continue; + } + if (Inst->getOpcode() == Instruction::Store) { + return Inst; + } else if (Inst->getOpcode() == Instruction::Br) { + auto* BrInst = dyn_cast(Inst); + if (BrInst->isConditional()) { + return Inst; + } else { + // Reinitialize iterators with the destination of the unconditional + // branch. + BB = BrInst->getSuccessor(0); + ChainedBB->push_back(BB); + BBI = BB->begin(); + BE = BB->end(); + break; + } + } + } + if (BBI == BE) { + return nullptr; + } + } +} + +// XXX-comment: Returns whether the code has been changed. +bool taintMonotonicLoads(const SmallVector& MonotonicLoadInsts) { + bool Changed = false; + for (auto* LI : MonotonicLoadInsts) { + SmallVector ChainedBB; + auto* FirstInst = findFirstStoreCondBranchInst(LI, &ChainedBB); + if (FirstInst == nullptr) { + // We don't seem to be able to taint a following store/conditional branch + // instruction. Simply make it acquire. + DEBUG(dbgs() << "[RelaxedLoad]: Transformed to acquire load\n" + << *LI << "\n"); + LI->setOrdering(Acquire); + Changed = true; + continue; + } + // Taint 'FirstInst', which could be a store or a condition branch + // instruction. + if (FirstInst->getOpcode() == Instruction::Store) { + Changed |= taintStoreAddress(dyn_cast(FirstInst), LI); + } else if (FirstInst->getOpcode() == Instruction::Br) { + Changed |= taintConditionalBranch(dyn_cast(FirstInst), LI); + } else { + assert(false && "findFirstStoreCondBranchInst() should return a " + "store/condition branch instruction"); + } + } + return Changed; +} + +// Inserts a fake conditional branch right after the instruction 'SplitInst', +// and the branch condition is 'Condition'. 'SplitInst' will be placed in the +// newly created block. +void AddFakeConditionalBranch(Instruction* SplitInst, Value* Condition) { + auto* BB = SplitInst->getParent(); + TerminatorInst* ThenTerm = nullptr; + TerminatorInst* ElseTerm = nullptr; + SplitBlockAndInsertIfThenElse(Condition, SplitInst, &ThenTerm, &ElseTerm); + assert(ThenTerm && ElseTerm && + "Then/Else terminators cannot be empty after basic block spliting"); + auto* ThenBB = ThenTerm->getParent(); + auto* ElseBB = ElseTerm->getParent(); + auto* TailBB = ThenBB->getSingleSuccessor(); + assert(TailBB && "Tail block cannot be empty after basic block spliting"); + + ThenBB->disableCanEliminateBlock(); + ThenBB->disableCanEliminateBlock(); + TailBB->disableCanEliminateBlock(); + ThenBB->setName(BB->getName() + "Then.Fake"); + ElseBB->setName(BB->getName() + "Else.Fake"); + DEBUG(dbgs() << "Add fake conditional branch:\n" + << "Then Block:\n" + << *ThenBB << "Else Block:\n" + << *ElseBB << "\n"); +} + +// Returns true if the code is changed, and false otherwise. +void TaintRelaxedLoads(Instruction* UsageInst) { + // For better performance, we can add a "AND X 0" instruction before the + // condition. + auto* BB = UsageInst->getParent(); + auto* InsertPoint = UsageInst->getNextNode(); + while (dyn_cast(InsertPoint)) { + InsertPoint = InsertPoint->getNextNode(); + } + IRBuilder Builder(InsertPoint); + // First thing is to cast 'UsageInst' to an integer type if necessary. 
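+  // The sequence emitted below looks roughly like this:
+  //   %cast = ... %UsageInst to iN    ; only when the type is not integer
+  //   %and  = and iN %cast, 0
+  //   %fake = icmp ne iN %and, 0      ; always false, but keeps the dependence
+  //   br i1 %fake, label %Then.Fake, label %Else.Fake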
+ Value* AndTarget = nullptr; + if (IntegerType::classof(UsageInst->getType())) { + AndTarget = UsageInst; + } else { + Type* TargetIntegerType = IntegerType::get( + UsageInst->getContext(), + BB->getModule()->getDataLayout().getPointerSizeInBits()); + AndTarget = createCast(Builder, UsageInst, TargetIntegerType); + } + + auto* AndZero = dyn_cast( + Builder.CreateAnd(AndTarget, Constant::getNullValue(AndTarget->getType()))); + auto* FakeCondition = dyn_cast(Builder.CreateICmp( + CmpInst::ICMP_NE, AndZero, Constant::getNullValue(AndTarget->getType()))); + AddFakeConditionalBranch(FakeCondition->getNextNode(), FakeCondition); +} + +// XXX-comment: Finds the appropriate Value derived from an atomic load. +// 'ChainedBB' contains all the blocks chained together with unconditional +// branches from LI's parent BB to the block with the first store/cond branch. +// If we don't find any, it means 'LI' is not used at all (which should not +// happen in practice). We can simply set 'LI' to be acquire just to be safe. +template +Instruction* findMostRecentDependenceUsage(LoadInst* LI, Instruction* LaterInst, + Vector* ChainedBB, + DominatorTree* DT) { + typedef SmallSet UsageSet; + typedef DenseMap> UsageMap; + assert(ChainedBB->size() >= 1 && "ChainedBB must have >=1 size"); + // Mapping from basic block in 'ChainedBB' to the set of dependence usage of + // 'LI' in each block. + UsageMap usage_map; + auto* LoadBB = LI->getParent(); + usage_map[LoadBB] = make_unique(); + usage_map[LoadBB]->insert(LI); + + for (auto* BB : *ChainedBB) { + if (usage_map[BB] == nullptr) { + usage_map[BB] = make_unique(); + } + auto& usage_set = usage_map[BB]; + if (usage_set->size() == 0) { + // The value has not been used. + return nullptr; + } + // Calculate the usage in the current BB first. + std::list bb_usage_list; + std::copy(usage_set->begin(), usage_set->end(), + std::back_inserter(bb_usage_list)); + for (auto list_iter = bb_usage_list.begin(); + list_iter != bb_usage_list.end(); list_iter++) { + auto* val = *list_iter; + for (auto* U : val->users()) { + Instruction* Inst = nullptr; + if (!(Inst = dyn_cast(U))) { + continue; + } + assert(Inst && "Usage value must be an instruction"); + auto iter = + std::find(ChainedBB->begin(), ChainedBB->end(), Inst->getParent()); + if (iter == ChainedBB->end()) { + // Only care about usage within ChainedBB. + continue; + } + auto* UsageBB = *iter; + if (UsageBB == BB) { + // Current BB. + if (!usage_set->count(Inst)) { + bb_usage_list.push_back(Inst); + usage_set->insert(Inst); + } + } else { + // A following BB. + if (usage_map[UsageBB] == nullptr) { + usage_map[UsageBB] = make_unique(); + } + usage_map[UsageBB]->insert(Inst); + } + } + } + } + + // Pick one usage that is in LaterInst's block and that dominates 'LaterInst'. + auto* LaterBB = LaterInst->getParent(); + auto& usage_set = usage_map[LaterBB]; + Instruction* usage_inst = nullptr; + for (auto* inst : *usage_set) { + if (DT->dominates(inst, LaterInst)) { + usage_inst = inst; + break; + } + } + + assert(usage_inst && "The usage instruction in the same block but after the " + "later instruction"); + return usage_inst; +} + +// XXX-comment: Returns whether the code has been changed. 
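+// For each relaxed (monotonic) load, if the first following store or
+// conditional branch already depends on the load, nothing is done. Otherwise
+// the store address is tainted with the load (or with a dominating use of it),
+// or a fake conditional branch is inserted; when neither is possible, the load
+// is simply upgraded to acquire.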
+bool AddFakeConditionalBranchAfterMonotonicLoads( + const SmallVector& MonotonicLoadInsts, DominatorTree* DT) { + bool Changed = false; + for (auto* LI : MonotonicLoadInsts) { + SmallVector ChainedBB; + auto* FirstInst = findFirstStoreCondBranchInst(LI, &ChainedBB); + if (FirstInst != nullptr) { + if (FirstInst->getOpcode() == Instruction::Store) { + if (StoreAddressDependOnValue(dyn_cast(FirstInst), LI)) { + continue; + } + } else if (FirstInst->getOpcode() == Instruction::Br) { + if (ConditionalBranchDependsOnValue(dyn_cast(FirstInst), + LI)) { + continue; + } + } else { + dbgs() << "FirstInst=" << *FirstInst << "\n"; + assert(false && "findFirstStoreCondBranchInst() should return a " + "store/condition branch instruction"); + } + } + + // We really need to process the relaxed load now. + StoreInst* SI = nullptr;; + if (FirstInst && (SI = dyn_cast(FirstInst))) { + // For immediately coming stores, taint the address of the store. + if (SI->getParent() == LI->getParent() || DT->dominates(LI, SI)) { + Changed |= taintStoreAddress(SI, LI); + } else { + auto* Inst = + findMostRecentDependenceUsage(LI, FirstInst, &ChainedBB, DT); + if (!Inst) { + LI->setOrdering(Acquire); + Changed = true; + } else { + Changed |= taintStoreAddress(SI, Inst); + } + } + } else { + // No upcoming branch + if (!FirstInst) { + TaintRelaxedLoads(LI); + Changed = true; + } else { + // For immediately coming branch, directly add a fake branch. + if (FirstInst->getParent() == LI->getParent() || + DT->dominates(LI, FirstInst)) { + TaintRelaxedLoads(LI); + Changed = true; + } else { + auto* Inst = + findMostRecentDependenceUsage(LI, FirstInst, &ChainedBB, DT); + if (Inst) { + TaintRelaxedLoads(Inst); + } else { + LI->setOrdering(Acquire); + } + Changed = true; + } + } + } + } + return Changed; +} + +/**** Implementations of public methods for dependence tainting ****/ +Value* GetUntaintedAddress(Value* CurrentAddress) { + auto* OrAddress = getOrAddress(CurrentAddress); + if (OrAddress == nullptr) { + // Is it tainted by a select instruction? + auto* Inst = dyn_cast(CurrentAddress); + if (nullptr != Inst && Inst->getOpcode() == Instruction::Select) { + // A selection instruction. + if (Inst->getOperand(1) == Inst->getOperand(2)) { + return Inst->getOperand(1); + } + } + + return CurrentAddress; + } + Value* ActualAddress = nullptr; + + auto* CastToInt = dyn_cast(OrAddress->getOperand(1)); + if (CastToInt && CastToInt->getOpcode() == Instruction::PtrToInt) { + return CastToInt->getOperand(0); + } else { + // This should be a IntToPtr constant expression. + ConstantExpr* PtrToIntExpr = + dyn_cast(OrAddress->getOperand(1)); + if (PtrToIntExpr && PtrToIntExpr->getOpcode() == Instruction::PtrToInt) { + return PtrToIntExpr->getOperand(0); + } + } + + // Looks like it's not been dependence-tainted. Returns itself. 
+ return CurrentAddress; +} + +MemoryLocation GetUntaintedMemoryLocation(StoreInst* SI) { + AAMDNodes AATags; + SI->getAAMetadata(AATags); + const auto& DL = SI->getModule()->getDataLayout(); + const auto* OriginalAddr = GetUntaintedAddress(SI->getPointerOperand()); + DEBUG(if (OriginalAddr != SI->getPointerOperand()) { + dbgs() << "[GetUntaintedMemoryLocation]\n" + << "Storing address: " << *SI->getPointerOperand() + << "\nUntainted address: " << *OriginalAddr << "\n"; + }); + return MemoryLocation(OriginalAddr, + DL.getTypeStoreSize(SI->getValueOperand()->getType()), + AATags); +} + +bool TaintDependenceToStore(StoreInst* SI, Value* DepVal) { + if (dependenceSetInclusion(SI, DepVal)) { + return false; + } + + bool tainted = taintStoreAddress(SI, DepVal); + assert(tainted); + return tainted; +} + +bool TaintDependenceToStoreAddress(StoreInst* SI, Value* DepVal) { + if (dependenceSetInclusion(SI->getPointerOperand(), DepVal)) { + return false; + } + + bool tainted = taintStoreAddress(SI, DepVal); + assert(tainted); + return tainted; +} + +bool CompressTaintedStore(BasicBlock* BB) { + // This function looks for windows of adajcent stores in 'BB' that satisfy the + // following condition (and then do optimization): + // *Addr(d1) = v1, d1 is a condition and is the only dependence the store's + // address depends on && Dep(v1) includes Dep(d1); + // *Addr(d2) = v2, d2 is a condition and is the only dependnece the store's + // address depends on && Dep(v2) includes Dep(d2) && + // Dep(d2) includes Dep(d1); + // ... + // *Addr(dN) = vN, dN is a condition and is the only dependence the store's + // address depends on && Dep(dN) includes Dep(d"N-1"). + // + // As a result, Dep(dN) includes [Dep(d1) V ... V Dep(d"N-1")], so we can + // safely transform the above to the following. In between these stores, we + // can omit untainted stores to the same address 'Addr' since they internally + // have dependence on the previous stores on the same address. + // => + // *Addr = v1 + // *Addr = v2 + // *Addr(d3) = v3 + for (auto BI = BB->begin(), BE = BB->end(); BI != BE; BI++) { + // Look for the first store in such a window of adajacent stores. + auto* FirstSI = dyn_cast(&*BI); + if (!FirstSI) { + continue; + } + + // The first store in the window must be tainted. + auto* UntaintedAddress = GetUntaintedAddress(FirstSI->getPointerOperand()); + if (UntaintedAddress == FirstSI->getPointerOperand()) { + continue; + } + + // The first store's address must directly depend on and only depend on a + // condition. + auto* FirstSIDepCond = getConditionDependence(FirstSI->getPointerOperand()); + if (nullptr == FirstSIDepCond) { + continue; + } + + // Dep(first store's storing value) includes Dep(tainted dependence). + if (!dependenceSetInclusion(FirstSI->getValueOperand(), FirstSIDepCond)) { + continue; + } + + // Look for subsequent stores to the same address that satisfy the condition + // of "compressing the dependence". + SmallVector AdajacentStores; + AdajacentStores.push_back(FirstSI); + auto BII = BasicBlock::iterator(FirstSI); + for (BII++; BII != BE; BII++) { + auto* CurrSI = dyn_cast(&*BII); + if (!CurrSI) { + if (BII->mayHaveSideEffects()) { + // Be conservative. Instructions with side effects are similar to + // stores. + break; + } + continue; + } + + auto* OrigAddress = GetUntaintedAddress(CurrSI->getPointerOperand()); + auto* CurrSIDepCond = getConditionDependence(CurrSI->getPointerOperand()); + // All other stores must satisfy either: + // A. 
'CurrSI' is an untainted store to the same address, or + // B. the combination of the following 5 subconditions: + // 1. Tainted; + // 2. Untainted address is the same as the group's address; + // 3. The address is tainted with a sole value which is a condition; + // 4. The storing value depends on the condition in 3. + // 5. The condition in 3 depends on the previous stores dependence + // condition. + + // Condition A. Should ignore this store directly. + if (OrigAddress == CurrSI->getPointerOperand() && + OrigAddress == UntaintedAddress) { + continue; + } + // Check condition B. + Value* Cond = nullptr; + if (OrigAddress == CurrSI->getPointerOperand() || + OrigAddress != UntaintedAddress || CurrSIDepCond == nullptr || + !dependenceSetInclusion(CurrSI->getValueOperand(), CurrSIDepCond)) { + // Check condition 1, 2, 3 & 4. + break; + } + + // Check condition 5. + StoreInst* PrevSI = AdajacentStores[AdajacentStores.size() - 1]; + auto* PrevSIDepCond = getConditionDependence(PrevSI->getPointerOperand()); + assert(PrevSIDepCond && + "Store in the group must already depend on a condtion"); + if (!dependenceSetInclusion(CurrSIDepCond, PrevSIDepCond)) { + break; + } + + AdajacentStores.push_back(CurrSI); + } + + if (AdajacentStores.size() == 1) { + // The outer loop should keep looking from the next store. + continue; + } + + // Now we have such a group of tainted stores to the same address. + DEBUG(dbgs() << "[CompressTaintedStore]\n"); + DEBUG(dbgs() << "Original BB\n"); + DEBUG(dbgs() << *BB << '\n'); + auto* LastSI = AdajacentStores[AdajacentStores.size() - 1]; + for (unsigned i = 0; i < AdajacentStores.size() - 1; ++i) { + auto* SI = AdajacentStores[i]; + + // Use the original address for stores before the last one. + SI->setOperand(1, UntaintedAddress); + + DEBUG(dbgs() << "Store address has been reversed: " << *SI << '\n';); + } + // XXX-comment: Try to make the last store use fewer registers. + // If LastSI's storing value is a select based on the condition with which + // its address is tainted, transform the tainted address to a select + // instruction, as follows: + // r1 = Select Cond ? A : B + // r2 = Cond & 0 + // r3 = Addr | r2 + // *r3 = r1 + // ==> + // r1 = Select Cond ? A : B + // r2 = Select Cond ? Addr : Addr + // *r2 = r1 + // The idea is that both Select instructions depend on the same condition, + // so hopefully the backend can generate two cmov instructions for them (and + // this saves the number of registers needed). + auto* LastSIDep = getConditionDependence(LastSI->getPointerOperand()); + auto* LastSIValue = dyn_cast(LastSI->getValueOperand()); + if (LastSIValue && LastSIValue->getOpcode() == Instruction::Select && + LastSIValue->getOperand(0) == LastSIDep) { + // XXX-comment: Maybe it's better for us to just leave it as an and/or + // dependence pattern. + // /* + IRBuilder Builder(LastSI); + auto* Address = + Builder.CreateSelect(LastSIDep, UntaintedAddress, UntaintedAddress); + LastSI->setOperand(1, Address); + DEBUG(dbgs() << "The last store becomes :" << *LastSI << "\n\n";); + // */ + } + } + + return true; +} + +bool PassDependenceToStore(Value* OldAddress, StoreInst* NewStore) { + Value* OldDep = getDependence(OldAddress); + // Return false when there's no dependence to pass from the OldAddress. + if (!OldDep) { + return false; + } + + // No need to pass the dependence to NewStore's address if it already depends + // on whatever 'OldAddress' depends on. 
+ if (StoreAddressDependOnValue(NewStore, OldDep)) { + return false; + } + return taintStoreAddress(NewStore, OldAddress); +} + +SmallSet FindDependence(Value* Val) { + SmallSet DepSet; + recursivelyFindDependence(&DepSet, Val, true /*Only insert leaf nodes*/); + return DepSet; +} + +bool StoreAddressDependOnValue(StoreInst* SI, Value* DepVal) { + return dependenceSetInclusion(SI->getPointerOperand(), DepVal); +} + +bool StoreDependOnValue(StoreInst* SI, Value* Dep) { + return dependenceSetInclusion(SI, Dep); +} + +} // namespace + + + bool CodeGenPrepare::runOnFunction(Function &F) { + bool EverMadeChange = false; + if (skipOptnoneFunction(F)) return false; - bool EverMadeChange = false; + DL = &F.getParent()->getDataLayout(); + // Clear per function information. - InsertedTruncsSet.clear(); + InsertedInsts.clear(); PromotedInsts.clear(); ModifiedDT = false; if (TM) - TLI = TM->getSubtargetImpl()->getTargetLowering(); - TLInfo = &getAnalysis(); - TTI = &getAnalysis(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize); + TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + TLInfo = &getAnalysis().getTLI(); + TTI = &getAnalysis().getTTI(F); + DT = &getAnalysis().getDomTree(); + OptSize = F.optForSize(); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { const DenseMap &BypassWidths = TLI->getBypassSlowDivWidths(); - for (Function::iterator I = F.begin(); I != F.end(); I++) - EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); + BasicBlock* BB = &*F.begin(); + while (BB != nullptr) { + // bypassSlowDivision may create new BBs, but we don't want to reapply the + // optimization to those blocks. + BasicBlock* Next = BB->getNextNode(); + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + BB = Next; + } } // Eliminate blocks that contain only PHI nodes and an // unconditional branch. - EverMadeChange |= EliminateMostlyEmptyBlocks(F); + EverMadeChange |= eliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. 
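  // placeDbgValues moves each llvm.dbg.value intrinsic to sit right after the
  // definition of the value it describes, so isel can still match it.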
- EverMadeChange |= PlaceDbgValues(F); + EverMadeChange |= placeDbgValues(F); // If there is a mask, compare against zero, and branch that can be combined // into a single target instruction, push the mask and compare into branch @@ -229,8 +1269,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) { while (MadeChange) { MadeChange = false; for (Function::iterator I = F.begin(); I != F.end(); ) { - BasicBlock *BB = I++; - MadeChange |= OptimizeBlock(*BB); + BasicBlock *BB = &*I++; + bool ModifiedDTOnIteration = false; + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + if (ModifiedDTOnIteration) + break; } EverMadeChange |= MadeChange; } @@ -240,9 +1285,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (!DisableBranchOpts) { MadeChange = false; SmallPtrSet WorkList; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - SmallVector Successors(succ_begin(BB), succ_end(BB)); - MadeChange |= ConstantFoldTerminator(BB, true); + for (BasicBlock &BB : F) { + SmallVector Successors(succ_begin(&BB), succ_end(&BB)); + MadeChange |= ConstantFoldTerminator(&BB, true); if (!MadeChange) continue; for (SmallVectorImpl::iterator @@ -269,27 +1314,55 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. if (EverMadeChange || MadeChange) - MadeChange |= EliminateFallThrough(F); + MadeChange |= eliminateFallThrough(F); - if (MadeChange) - ModifiedDT = true; EverMadeChange |= MadeChange; } - if (ModifiedDT && DT) - DT->recalculate(F); + if (!DisableGCOpts) { + SmallVector Statepoints; + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (isStatepoint(I)) + Statepoints.push_back(&I); + for (auto &I : Statepoints) + EverMadeChange |= simplifyOffsetableRelocate(*I); + } + + // XXX-comment: Delay dealing with relaxed loads in this function to avoid + // further changes done by other passes (e.g., SimplifyCFG). + // Collect all the relaxed loads. + SmallVector MonotonicLoadInsts; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { + if (I->isAtomic()) { + switch (I->getOpcode()) { + case Instruction::Load: { + auto* LI = dyn_cast(&*I); + if (LI->getOrdering() == Monotonic) { + MonotonicLoadInsts.push_back(LI); + } + break; + } + default: { + break; + } + } + } + } + EverMadeChange |= + AddFakeConditionalBranchAfterMonotonicLoads(MonotonicLoadInsts, DT); return EverMadeChange; } -/// EliminateFallThrough - Merge basic blocks which are connected -/// by a single edge, where one of the basic blocks has a single successor -/// pointing to the other basic block, which has a single predecessor. -bool CodeGenPrepare::EliminateFallThrough(Function &F) { +/// Merge basic blocks which are connected by a single edge, where one of the +/// basic blocks has a single successor pointing to the other basic block, +/// which has a single predecessor. +bool CodeGenPrepare::eliminateFallThrough(Function &F) { bool Changed = false; // Scan all of the blocks in the function, except for the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; // If the destination block has a single pred, then this is a trivial // edge, just collapse it. 
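    // E.g., when the predecessor ends in an unconditional "br label %BB" and
    // %BB has no other predecessors, the two blocks can be folded into one.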
BasicBlock *SinglePred = BB->getSinglePredecessor(); @@ -304,29 +1377,27 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { // Remember if SinglePred was the entry block of the function. // If so, we will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(BB, this); + MergeBasicBlockIntoOnlyPred(BB, nullptr); if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); // We have erased a block. Update the iterator. - I = BB; + I = BB->getIterator(); } } return Changed; } -/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, -/// debug info directives, and an unconditional branch. Passes before isel -/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for -/// isel. Start by eliminating these blocks so we can split them the way we -/// want them. -bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { +/// Eliminate blocks that contain only PHI nodes, debug info directives, and an +/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split +/// edges in ways that are non-optimal for isel. Start by eliminating these +/// blocks so we can split them the way we want them. +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { bool MadeChange = false; // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { - BasicBlock *BB = I++; - + BasicBlock *BB = &*I++; // If this block doesn't end with an uncond branch, ignore it. BranchInst *BI = dyn_cast(BB->getTerminator()); if (!BI || !BI->isUnconditional()) @@ -334,7 +1405,7 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { // If the instruction before the branch (skipping debug info) isn't a phi // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI; + BasicBlock::iterator BBI = BI->getIterator(); if (BBI != BB->begin()) { --BBI; while (isa(BBI)) { @@ -351,19 +1422,19 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { if (DestBB == BB) continue; - if (!CanMergeBlocks(BB, DestBB)) + if (!canMergeBlocks(BB, DestBB)) continue; - EliminateMostlyEmptyBlock(BB); + eliminateMostlyEmptyBlock(BB); MadeChange = true; } return MadeChange; } -/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a -/// single uncond branch between them, and BB contains no other non-phi +/// Return true if we can merge BB into DestBB if there is a single +/// unconditional branch between them, and BB contains no other non-phi /// instructions. -bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, +bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const { // We only want to eliminate blocks whose phi nodes are used by phi nodes in // the successor. If there are more complex condition (e.g. preheaders), @@ -374,7 +1445,7 @@ bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, const Instruction *UI = cast(U); if (UI->getParent() != DestBB || !isa(UI)) return false; - // If User is inside DestBB block and it is a PHINode then check + // IfUser is inside DestBB block and it is a PHINode then check // incoming value. If incoming value is not from BB then this is // a complex condition (e.g. preheaders) we want to avoid here. 
if (UI->getParent() == DestBB) { @@ -429,9 +1500,9 @@ bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, } -/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and -/// an unconditional branch in it. -void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { +/// Eliminate a basic block that has only phi's and an unconditional branch in +/// it. +void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { BranchInst *BI = cast(BB->getTerminator()); BasicBlock *DestBB = BI->getSuccessor(0); @@ -444,7 +1515,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // Remember if SinglePred was the entry block of the function. If so, we // will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(DestBB, this); + MergeBasicBlockIntoOnlyPred(DestBB, nullptr); if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); @@ -486,19 +1557,190 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // The PHIs are now updated, change everything that refers to BB to use // DestBB and remove BB. BB->replaceAllUsesWith(DestBB); - if (DT && !ModifiedDT) { - BasicBlock *BBIDom = DT->getNode(BB)->getIDom()->getBlock(); - BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock(); - BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom); - DT->changeImmediateDominator(DestBB, NewIDom); - DT->eraseNode(BB); - } BB->eraseFromParent(); ++NumBlocksElim; DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); } +// Computes a map of base pointer relocation instructions to corresponding +// derived pointer relocation instructions given a vector of all relocate calls +static void computeBaseDerivedRelocateMap( + const SmallVectorImpl &AllRelocateCalls, + DenseMap> + &RelocateInstMap) { + // Collect information in two maps: one primarily for locating the base object + // while filling the second map; the second map is the final structure holding + // a mapping between Base and corresponding Derived relocate calls + DenseMap, GCRelocateInst *> RelocateIdxMap; + for (auto *ThisRelocate : AllRelocateCalls) { + auto K = std::make_pair(ThisRelocate->getBasePtrIndex(), + ThisRelocate->getDerivedPtrIndex()); + RelocateIdxMap.insert(std::make_pair(K, ThisRelocate)); + } + for (auto &Item : RelocateIdxMap) { + std::pair Key = Item.first; + if (Key.first == Key.second) + // Base relocation: nothing to insert + continue; + + GCRelocateInst *I = Item.second; + auto BaseKey = std::make_pair(Key.first, Key.first); + + // We're iterating over RelocateIdxMap so we cannot modify it. + auto MaybeBase = RelocateIdxMap.find(BaseKey); + if (MaybeBase == RelocateIdxMap.end()) + // TODO: We might want to insert a new base object relocate and gep off + // that, if there are enough derived object relocates. 
+ continue; + + RelocateInstMap[MaybeBase->second].push_back(I); + } +} + +// Accepts a GEP and extracts the operands into a vector provided they're all +// small integer constants +static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, + SmallVectorImpl &OffsetV) { + for (unsigned i = 1; i < GEP->getNumOperands(); i++) { + // Only accept small constant integer operands + auto Op = dyn_cast(GEP->getOperand(i)); + if (!Op || Op->getZExtValue() > 20) + return false; + } + + for (unsigned i = 1; i < GEP->getNumOperands(); i++) + OffsetV.push_back(GEP->getOperand(i)); + return true; +} + +// Takes a RelocatedBase (base pointer relocation instruction) and Targets to +// replace, computes a replacement, and affects it. +static bool +simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, + const SmallVectorImpl &Targets) { + bool MadeChange = false; + for (GCRelocateInst *ToReplace : Targets) { + assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() && + "Not relocating a derived object of the original base object"); + if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) { + // A duplicate relocate call. TODO: coalesce duplicates. + continue; + } + + if (RelocatedBase->getParent() != ToReplace->getParent()) { + // Base and derived relocates are in different basic blocks. + // In this case transform is only valid when base dominates derived + // relocate. However it would be too expensive to check dominance + // for each such relocate, so we skip the whole transformation. + continue; + } + + Value *Base = ToReplace->getBasePtr(); + auto Derived = dyn_cast(ToReplace->getDerivedPtr()); + if (!Derived || Derived->getPointerOperand() != Base) + continue; + + SmallVector OffsetV; + if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV)) + continue; + + // Create a Builder and replace the target callsite with a gep + assert(RelocatedBase->getNextNode() && "Should always have one since it's not a terminator"); + + // Insert after RelocatedBase + IRBuilder<> Builder(RelocatedBase->getNextNode()); + Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); + + // If gc_relocate does not match the actual type, cast it to the right type. + // In theory, there must be a bitcast after gc_relocate if the type does not + // match, and we should reuse it to get the derived pointer. But it could be + // cases like this: + // bb1: + // ... + // %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...) + // br label %merge + // + // bb2: + // ... + // %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...) + // br label %merge + // + // merge: + // %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ] + // %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)* + // + // In this case, we can not find the bitcast any more. So we insert a new bitcast + // no matter there is already one or not. In this way, we can handle all cases, and + // the extra bitcast should be optimized away in later passes. + Value *ActualRelocatedBase = RelocatedBase; + if (RelocatedBase->getType() != Base->getType()) { + ActualRelocatedBase = + Builder.CreateBitCast(RelocatedBase, Base->getType()); + } + Value *Replacement = Builder.CreateGEP( + Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV)); + Replacement->takeName(ToReplace); + // If the newly generated derived pointer's type does not match the original derived + // pointer's type, cast the new derived pointer to match it. Same reasoning as above. 
+ Value *ActualReplacement = Replacement; + if (Replacement->getType() != ToReplace->getType()) { + ActualReplacement = + Builder.CreateBitCast(Replacement, ToReplace->getType()); + } + ToReplace->replaceAllUsesWith(ActualReplacement); + ToReplace->eraseFromParent(); + + MadeChange = true; + } + return MadeChange; +} + +// Turns this: +// +// %base = ... +// %ptr = gep %base + 15 +// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) +// %base' = relocate(%tok, i32 4, i32 4) +// %ptr' = relocate(%tok, i32 4, i32 5) +// %val = load %ptr' +// +// into this: +// +// %base = ... +// %ptr = gep %base + 15 +// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) +// %base' = gc.relocate(%tok, i32 4, i32 4) +// %ptr' = gep %base' + 15 +// %val = load %ptr' +bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { + bool MadeChange = false; + SmallVector AllRelocateCalls; + + for (auto *U : I.users()) + if (GCRelocateInst *Relocate = dyn_cast(U)) + // Collect all the relocate calls associated with a statepoint + AllRelocateCalls.push_back(Relocate); + + // We need atleast one base pointer relocation + one derived pointer + // relocation to mangle + if (AllRelocateCalls.size() < 2) + return false; + + // RelocateInstMap is a mapping from the base relocate instruction to the + // corresponding derived relocate instructions + DenseMap> RelocateInstMap; + computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); + if (RelocateInstMap.empty()) + return false; + + for (auto &Item : RelocateInstMap) + // Item.first is the RelocatedBase to offset against + // Item.second is the vector of Targets to replace + MadeChange = simplifyRelocatesOffABase(Item.first, Item.second); + return MadeChange; +} + /// SinkCast - Sink the specified cast instruction into its user blocks static bool SinkCast(CastInst *CI) { BasicBlock *DefBB = CI->getParent(); @@ -522,6 +1764,12 @@ static bool SinkCast(CastInst *CI) { // Preincrement use iterator so we don't invalidate it. ++UI; + // If the block selected to receive the cast is an EH pad that does not + // allow non-PHI instructions before the terminator, we can't sink the + // cast. + if (UserBB->getTerminator()->isEHPad()) + continue; + // If this user is in the same block as the cast, don't change the cast. if (UserBB == DefBB) continue; @@ -530,14 +1778,14 @@ static bool SinkCast(CastInst *CI) { if (!InsertedCast) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCast = - CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", - InsertPt); - MadeChange = true; + assert(InsertPt != UserBB->end()); + InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), + CI->getType(), "", &*InsertPt); } // Replace a use of the cast with a use of the new cast. TheUse = InsertedCast; + MadeChange = true; ++NumCastUses; } @@ -550,17 +1798,17 @@ static bool SinkCast(CastInst *CI) { return MadeChange; } -/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop -/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC), -/// sink it into user blocks to reduce the number of virtual -/// registers that must be created and coalesced. +/// If the specified cast instruction is a noop copy (e.g. it's casting from +/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to +/// reduce the number of virtual registers that must be created and coalesced. /// /// Return true if any changes are made. 
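/// For illustration (hypothetical IR), a pointer-to-pointer bitcast whose only
/// use lives in another block:
///
///   entry:
///     %c = bitcast i8* %p to i32*
///     br label %use
///   use:
///     %v = load i32, i32* %c
///
/// is rewritten so the cast sits next to its use, and the original copy goes
/// away once it has no users left:
///
///   entry:
///     br label %use
///   use:
///     %c.sunk = bitcast i8* %p to i32*
///     %v = load i32, i32* %c.sunk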
/// -static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ +static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, + const DataLayout &DL) { // If this is a noop copy, - EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(CI->getType()); + EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, CI->getType()); // This is an fp<->int conversion? if (SrcVT.isInteger() != DstVT.isInteger()) @@ -587,16 +1835,63 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ return SinkCast(CI); } -/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce -/// the number of virtual registers that must be created and coalesced. This is -/// a clear win except on targets with multiple condition code registers -/// (PowerPC), where it might lose; some adjustment may be wanted there. +/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if +/// possible. +/// +/// Return true if any changes were made. +static bool CombineUAddWithOverflow(CmpInst *CI) { + Value *A, *B; + Instruction *AddI; + if (!match(CI, + m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI)))) + return false; + + Type *Ty = AddI->getType(); + if (!isa(Ty)) + return false; + + // We don't want to move around uses of condition values this late, so we we + // check if it is legal to create the call to the intrinsic in the basic + // block containing the icmp: + + if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse()) + return false; + +#ifndef NDEBUG + // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption + // for now: + if (AddI->hasOneUse()) + assert(*AddI->user_begin() == CI && "expected!"); +#endif + + Module *M = CI->getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty); + + auto *InsertPt = AddI->hasOneUse() ? CI : AddI; + + auto *UAddWithOverflow = + CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt); + auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt); + auto *Overflow = + ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt); + + CI->replaceAllUsesWith(Overflow); + AddI->replaceAllUsesWith(UAdd); + CI->eraseFromParent(); + AddI->eraseFromParent(); + return true; +} + +/// Sink the given CmpInst into user blocks to reduce the number of virtual +/// registers that must be created and coalesced. This is a clear win except on +/// targets with multiple condition code registers (PowerPC), where it might +/// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. -static bool OptimizeCmpExpression(CmpInst *CI) { +static bool SinkCmpExpression(CmpInst *CI) { BasicBlock *DefBB = CI->getParent(); - /// InsertedCmp - Only insert a cmp in each block once. + /// Only insert a cmp in each block once. DenseMap InsertedCmps; bool MadeChange = false; @@ -623,27 +1918,39 @@ static bool OptimizeCmpExpression(CmpInst *CI) { if (!InsertedCmp) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + assert(InsertPt != UserBB->end()); InsertedCmp = - CmpInst::Create(CI->getOpcode(), - CI->getPredicate(), CI->getOperand(0), - CI->getOperand(1), "", InsertPt); - MadeChange = true; + CmpInst::Create(CI->getOpcode(), CI->getPredicate(), + CI->getOperand(0), CI->getOperand(1), "", &*InsertPt); } // Replace a use of the cmp with a use of the new cmp. 
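      // (Only this use is redirected to the block-local copy; remaining uses
      //  of the original cmp are visited on later iterations, and uses in the
      //  defining block are intentionally left alone.)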
TheUse = InsertedCmp; + MadeChange = true; ++NumCmpUses; } // If we removed all uses, nuke the cmp. - if (CI->use_empty()) + if (CI->use_empty()) { CI->eraseFromParent(); + MadeChange = true; + } return MadeChange; } -/// isExtractBitsCandidateUse - Check if the candidates could -/// be combined with shift instruction, which includes: +static bool OptimizeCmpExpression(CmpInst *CI) { + if (SinkCmpExpression(CI)) + return true; + + if (CombineUAddWithOverflow(CI)) + return true; + + return false; +} + +/// Check if the candidates could be combined with a shift instruction, which +/// includes: /// 1. Truncate instruction /// 2. And instruction and the imm is a mask of the low bits: /// imm & (imm+1) == 0 @@ -661,12 +1968,11 @@ static bool isExtractBitsCandidateUse(Instruction *User) { return true; } -/// SinkShiftAndTruncate - sink both shift and truncate instruction -/// to the use of truncate's BB. +/// Sink both shift and truncate instruction to the use of truncate's BB. static bool SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, DenseMap &InsertedShifts, - const TargetLowering &TLI) { + const TargetLowering &TLI, const DataLayout &DL) { BasicBlock *UserBB = User->getParent(); DenseMap InsertedTruncs; TruncInst *TruncI = dyn_cast(User); @@ -692,7 +1998,7 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, // approximation; some nodes' legality is determined by the // operand or other means. There's no good way to find out though. if (TLI.isOperationLegalOrCustom( - ISDOpcode, TLI.getValueType(TruncUser->getType(), true))) + ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true))) continue; // Don't bother for PHI nodes. @@ -709,20 +2015,22 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, if (!InsertedShift && !InsertedTrunc) { BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); + assert(InsertPt != TruncUserBB->end()); // Sink the shift if (ShiftI->getOpcode() == Instruction::AShr) - InsertedShift = - BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); else - InsertedShift = - BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); // Sink the trunc BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); TruncInsertPt++; + assert(TruncInsertPt != TruncUserBB->end()); InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, - TruncI->getType(), "", TruncInsertPt); + TruncI->getType(), "", &*TruncInsertPt); MadeChange = true; @@ -732,10 +2040,10 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, return MadeChange; } -/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if -/// the uses could potentially be combined with this shift instruction and -/// generate BitExtract instruction. It will only be applied if the architecture -/// supports BitExtract instruction. Here is an example: +/// Sink the shift *right* instruction into user blocks if the uses could +/// potentially be combined with this shift instruction and generate BitExtract +/// instruction. It will only be applied if the architecture supports BitExtract +/// instruction. 
Here is an example: /// BB1: /// %x.extract.shift = lshr i64 %arg1, 32 /// BB2: @@ -750,13 +2058,14 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, /// instruction. /// Return true if any changes are made. static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, - const TargetLowering &TLI) { + const TargetLowering &TLI, + const DataLayout &DL) { BasicBlock *DefBB = ShiftI->getParent(); /// Only insert instructions in each block once. DenseMap InsertedShifts; - bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType())); + bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType())); bool MadeChange = false; for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end(); @@ -793,9 +2102,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, if (isa(User) && shiftIsLegal // If the type of the truncate is legal, no trucate will be // introduced in other basic blocks. - && (!TLI.isTypeLegal(TLI.getValueType(User->getType())))) + && + (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))) MadeChange = - SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI); + SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL); continue; } @@ -804,13 +2114,14 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, if (!InsertedShift) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + assert(InsertPt != UserBB->end()); if (ShiftI->getOpcode() == Instruction::AShr) - InsertedShift = - BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); else - InsertedShift = - BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); MadeChange = true; } @@ -819,30 +2130,606 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, TheUse = InsertedShift; } - // If we removed all uses, nuke the shift. - if (ShiftI->use_empty()) - ShiftI->eraseFromParent(); + // If we removed all uses, nuke the shift. 
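  // (All out-of-block uses were redirected to per-block shifts above, so the
  //  original shift is frequently dead at this point.)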
+ if (ShiftI->use_empty()) + ShiftI->eraseFromParent(); + + return MadeChange; +} + +// Translate a masked load intrinsic like +// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, +// <16 x i1> %mask, <16 x i32> %passthru) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// %1 = bitcast i8* %addr to i32* +// %2 = extractelement <16 x i1> %mask, i32 0 +// %3 = icmp eq i1 %2, true +// br i1 %3, label %cond.load, label %else +// +//cond.load: ; preds = %0 +// %4 = getelementptr i32* %1, i32 0 +// %5 = load i32* %4 +// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 +// br label %else +// +//else: ; preds = %0, %cond.load +// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] +// %7 = extractelement <16 x i1> %mask, i32 1 +// %8 = icmp eq i1 %7, true +// br i1 %8, label %cond.load1, label %else2 +// +//cond.load1: ; preds = %else +// %9 = getelementptr i32* %1, i32 1 +// %10 = load i32* %9 +// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 +// br label %else2 +// +//else2: ; preds = %else, %cond.load1 +// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] +// %12 = extractelement <16 x i1> %mask, i32 2 +// %13 = icmp eq i1 %12, true +// br i1 %13, label %cond.load4, label %else5 +// +static void ScalarizeMaskedLoad(CallInst *CI) { + Value *Ptr = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + unsigned AlignVal = cast(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast(CI->getType()); + assert(VecType && "Unexpected return type of masked load intrinsic"); + + Type *EltTy = CI->getType()->getVectorElementType(); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa(Mask) && + cast(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. 
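  // (For example, a <16 x i32> masked load annotated with align 64 is emitted
  //  as scalar i32 loads, so the usable per-element alignment becomes
  //  min(64, 32/8) = 4.)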
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); + // Bitcast %addr fron i8* to EltTy* + Type *NewPtrType = + EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); + Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + + if (isa(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] + // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %to_load = icmp eq i1 %mask_1, true + // br i1 %to_load, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked store intrinsic, like +// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, +// <16 x i1> %mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set +// +// %1 = bitcast i8* %addr to i32* +// %2 = extractelement <16 x i1> %mask, i32 0 +// %3 = icmp eq i1 %2, true +// br i1 %3, label %cond.store, label %else +// +// cond.store: ; preds = %0 +// %4 = extractelement <16 x i32> %val, i32 0 +// %5 = getelementptr i32* %1, i32 0 +// store i32 %4, i32* %5 +// br label %else +// +// else: ; preds = %0, %cond.store +// %6 = extractelement <16 x i1> %mask, i32 1 +// %7 = icmp eq i1 %6, true +// br i1 %7, 
label %cond.store1, label %else2 +// +// cond.store1: ; preds = %else +// %8 = extractelement <16 x i32> %val, i32 1 +// %9 = getelementptr i32* %1, i32 1 +// store i32 %8, i32* %9 +// br label %else2 +// . . . +static void ScalarizeMaskedStore(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + unsigned AlignVal = cast(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast(Src->getType()); + assert(VecType && "Unexpected data type in masked store intrinsic"); + + Type *EltTy = VecType->getElementType(); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa(Mask) && + cast(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Builder.CreateAlignedStore(Src, Ptr, AlignVal); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. + AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); + // Bitcast %addr fron i8* to EltTy* + Type *NewPtrType = + EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); + Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + + if (isa(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + } + CI->eraseFromParent(); + return; + } + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %to_store = icmp eq i1 %mask_1, true + // br i1 %to_store, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1)); + + // Create "cond" block + // + // %OneElt = extractelement <16 x i32> %Src, i32 Idx + // %EltAddr = getelementptr i32* %1, i32 0 + // %store i32 %OneElt, i32* %EltAddr + // + BasicBlock *CondBlock = + IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +// Translate a masked gather intrinsic like +// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, +// <16 x i1> %Mask, <16 x i32> %Src) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> %Mask, 
i32 0 +// % ToLoad0 = icmp eq i1 % Mask0, true +// br i1 % ToLoad0, label %cond.load, label %else +// +// cond.load: +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// % Load0 = load i32, i32* % Ptr0, align 4 +// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 +// br label %else +// +// else: +// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] +// % Mask1 = extractelement <16 x i1> %Mask, i32 1 +// % ToLoad1 = icmp eq i1 % Mask1, true +// br i1 % ToLoad1, label %cond.load1, label %else2 +// +// cond.load1: +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// % Load1 = load i32, i32* % Ptr1, align 4 +// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 +// br label %else2 +// . . . +// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void ScalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx), + "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + 
Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked scatter intrinsic, like +// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, +// <16 x i1> %Mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set. +// +// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> % Mask, i32 0 +// % ToStore0 = icmp eq i1 % Mask0, true +// br i1 %ToStore0, label %cond.store, label %else +// +// cond.store: +// % Elt0 = extractelement <16 x i32> %Src, i32 0 +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// store i32 %Elt0, i32* % Ptr0, align 4 +// br label %else +// +// else: +// % Mask1 = extractelement <16 x i1> % Mask, i32 1 +// % ToStore1 = icmp eq i1 % Mask1, true +// br i1 % ToStore1, label %cond.store1, label %else2 +// +// cond.store1: +// % Elt1 = extractelement <16 x i32> %Src, i32 1 +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// store i32 % Elt1, i32* % Ptr1, align 4 +// br label %else2 +// . . . +static void ScalarizeMaskedScatter(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptrs = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + assert(isa(Src->getType()) && + "Unexpected data type in masked scatter intrinsic"); + assert(isa(Ptrs->getType()) && + isa(Ptrs->getType()->getVectorElementType()) && + "Vector of pointers is expected in masked scatter intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + unsigned AlignVal = cast(Alignment)->getZExtValue(); + unsigned VectorWidth = Src->getType()->getVectorNumElements(); + + // Shorten the way if the mask is a vector of constants. 
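  // (With a compile-time-constant mask no control flow is needed: a scalar
  //  store is emitted only for each lane whose mask element is a non-zero
  //  constant, and the remaining lanes are skipped entirely.)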
+ bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = + Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +/// If counting leading or trailing zeros is an expensive operation and a zero +/// input is defined, add a check for zero to avoid calling the intrinsic. +/// +/// We want to transform: +/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) +/// +/// into: +/// entry: +/// %cmpz = icmp eq i64 %A, 0 +/// br i1 %cmpz, label %cond.end, label %cond.false +/// cond.false: +/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) +/// br label %cond.end +/// cond.end: +/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] +/// +/// If the transform is performed, return true and set ModifiedDT to true. +static bool despeculateCountZeros(IntrinsicInst *CountZeros, + const TargetLowering *TLI, + const DataLayout *DL, + bool &ModifiedDT) { + if (!TLI || !DL) + return false; + + // If a zero input is undefined, it doesn't make sense to despeculate that. + if (match(CountZeros->getOperand(1), m_One())) + return false; - return MadeChange; -} + // If it's cheap to speculate, there's nothing to do. + auto IntrinsicID = CountZeros->getIntrinsicID(); + if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || + (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) + return false; -namespace { -class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls { -protected: - void replaceCall(Value *With) override { - CI->replaceAllUsesWith(With); - CI->eraseFromParent(); - } - bool isFoldable(unsigned SizeCIOp, unsigned, bool) const override { - if (ConstantInt *SizeCI = - dyn_cast(CI->getArgOperand(SizeCIOp))) - return SizeCI->isAllOnesValue(); + // Only handle legal scalar cases. Anything else requires too much work. 
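  // (For instance, on a target whose largest legal integer type is 64 bits,
  //  an i64 cttz/ctlz is despeculated here, while vector or i128 counts are
  //  left untouched.)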
+ Type *Ty = CountZeros->getType(); + unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); + if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize()) return false; - } -}; -} // end anonymous namespace -bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { + // The intrinsic will be sunk behind a compare against zero and branch. + BasicBlock *StartBlock = CountZeros->getParent(); + BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); + + // Create another block after the count zero intrinsic. A PHI will be added + // in this block to select the result of the intrinsic or the bit-width + // constant if the input to the intrinsic is zero. + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); + BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); + + // Set up a builder to create a compare, conditional branch, and PHI. + IRBuilder<> Builder(CountZeros->getContext()); + Builder.SetInsertPoint(StartBlock->getTerminator()); + Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); + + // Replace the unconditional branch that was created by the first split with + // a compare against zero and a conditional branch. + Value *Zero = Constant::getNullValue(Ty); + Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); + Builder.CreateCondBr(Cmp, EndBlock, CallBlock); + StartBlock->getTerminator()->eraseFromParent(); + + // Create a PHI in the end block to select either the output of the intrinsic + // or the bit width of the operand. + Builder.SetInsertPoint(&EndBlock->front()); + PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); + CountZeros->replaceAllUsesWith(PN); + Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); + PN->addIncoming(BitWidth, StartBlock); + PN->addIncoming(CountZeros, CallBlock); + + // We are explicitly handling the zero case, so we can set the intrinsic's + // undefined zero argument to 'true'. This will also prevent reprocessing the + // intrinsic; we only despeculate when a zero input is defined. + CountZeros->setArgOperand(1, Builder.getTrue()); + ModifiedDT = true; + return true; +} + +bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. @@ -858,62 +2745,169 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { return true; } // Sink address computing for memory operands into the block. - if (OptimizeInlineAsmInst(CI)) + if (optimizeInlineAsmInst(CI)) return true; } - // Lower all uses of llvm.objectsize.* - IntrinsicInst *II = dyn_cast(CI); - if (II && II->getIntrinsicID() == Intrinsic::objectsize) { - bool Min = (cast(II->getArgOperand(1))->getZExtValue() == 1); - Type *ReturnTy = CI->getType(); - Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - - // Substituting this can cause recursive simplifications, which can - // invalidate our iterator. Use a WeakVH to hold onto it in case this - // happens. - WeakVH IterHandle(CurInstIterator); - - replaceAndRecursivelySimplify(CI, RetVal, - TLI ? TLI->getDataLayout() : nullptr, - TLInfo, ModifiedDT ? nullptr : DT); - - // If the iterator instruction was recursively deleted, start over at the - // start of the block. 
- if (IterHandle != CurInstIterator) { - CurInstIterator = BB->begin(); - SunkAddrs.clear(); + // Align the pointer arguments to this call if the target thinks it's a good + // idea + unsigned MinSize, PrefAlign; + if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { + for (auto &Arg : CI->arg_operands()) { + // We want to align both objects whose address is used directly and + // objects whose address is used in casts and GEPs, though it only makes + // sense for GEPs if the offset is a multiple of the desired alignment and + // if size - offset meets the size threshold. + if (!Arg->getType()->isPointerTy()) + continue; + APInt Offset(DL->getPointerSizeInBits( + cast(Arg->getType())->getAddressSpace()), + 0); + Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); + uint64_t Offset2 = Offset.getLimitedValue(); + if ((Offset2 & (PrefAlign-1)) != 0) + continue; + AllocaInst *AI; + if ((AI = dyn_cast(Val)) && AI->getAlignment() < PrefAlign && + DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) + AI->setAlignment(PrefAlign); + // Global variables can only be aligned if they are defined in this + // object (i.e. they are uniquely initialized in this object), and + // over-aligning global variables that have an explicit section is + // forbidden. + GlobalVariable *GV; + if ((GV = dyn_cast(Val)) && GV->canIncreaseAlignment() && + GV->getAlignment() < PrefAlign && + DL->getTypeAllocSize(GV->getType()->getElementType()) >= + MinSize + Offset2) + GV->setAlignment(PrefAlign); + } + // If this is a memcpy (or similar) then we may be able to improve the + // alignment + if (MemIntrinsic *MI = dyn_cast(CI)) { + unsigned Align = getKnownAlignment(MI->getDest(), *DL); + if (MemTransferInst *MTI = dyn_cast(MI)) + Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL)); + if (Align > MI->getAlignment()) + MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align)); } - return true; } - if (II && TLI) { - SmallVector PtrOps; - Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) - while (!PtrOps.empty()) - if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) - return true; + IntrinsicInst *II = dyn_cast(CI); + if (II) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::objectsize: { + // Lower all uses of llvm.objectsize.* + bool Min = (cast(II->getArgOperand(1))->getZExtValue() == 1); + Type *ReturnTy = CI->getType(); + Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); + + // Substituting this can cause recursive simplifications, which can + // invalidate our iterator. Use a WeakVH to hold onto it in case this + // happens. + WeakVH IterHandle(&*CurInstIterator); + + replaceAndRecursivelySimplify(CI, RetVal, + TLInfo, nullptr); + + // If the iterator instruction was recursively deleted, start over at the + // start of the block. 
+ if (IterHandle != CurInstIterator.getNodePtrUnchecked()) { + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } + return true; + } + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType())) { + ScalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { + ScalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + ScalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + ScalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: { + ZExtInst *ExtVal = dyn_cast(CI->getArgOperand(0)); + if (!ExtVal || !ExtVal->hasOneUse() || + ExtVal->getParent() == CI->getParent()) + return false; + // Sink a zext feeding stlxr/stxr before it, so it can be folded into it. + ExtVal->moveBefore(CI); + // Mark this instruction as "inserted by CGP", so that other + // optimizations don't touch it. + InsertedInsts.insert(ExtVal); + return true; + } + case Intrinsic::invariant_group_barrier: + II->replaceAllUsesWith(II->getArgOperand(0)); + II->eraseFromParent(); + return true; + + case Intrinsic::cttz: + case Intrinsic::ctlz: + // If counting zeros is expensive, try to avoid it. + return despeculateCountZeros(II, TLI, DL, ModifiedDT); + } + + if (TLI) { + // Unknown address space. + // TODO: Target hook to pick which address space the intrinsic cares + // about? + unsigned AddrSpace = ~0u; + SmallVector PtrOps; + Type *AccessTy; + if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) + while (!PtrOps.empty()) + if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) + return true; + } } // From here on out we're working with named functions. if (!CI->getCalledFunction()) return false; - // We'll need DataLayout from here on out. - const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr; - if (!TD) return false; - // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls - // that have the default "don't know" as the objectsize. Anything else - // should be left alone. - CodeGenPrepareFortifiedLibCalls Simplifier; - return Simplifier.fold(CI, TD, TLInfo); + // to fortified library functions (e.g. __memcpy_chk) that have the default + // "don't know" as the objectsize. Anything else should be left alone. + FortifiedLibCallSimplifier Simplifier(TLInfo, true); + if (Value *V = Simplifier.optimizeCall(CI)) { + CI->replaceAllUsesWith(V); + CI->eraseFromParent(); + return true; + } + return false; } -/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return -/// instructions to the predecessor to enable tail call optimizations. The -/// case it is currently looking for is: +/// Look for opportunities to duplicate return instructions to the predecessor +/// to enable tail call optimizations. 
The case it is currently looking for is: /// @code /// bb0: /// %tmp0 = tail call i32 @f0() @@ -942,7 +2936,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 /// @endcode -bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { +bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { if (!TLI) return false; @@ -1061,7 +3055,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { namespace { -/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode +/// This is an extended version of TargetLowering::AddrMode /// which holds actual Value*'s for register values. struct ExtAddrMode : public TargetLowering::AddrMode { Value *BaseReg; @@ -1173,10 +3167,10 @@ class TypePromotionTransaction { public: /// \brief Record the position of \p Inst. InsertionHandler(Instruction *Inst) { - BasicBlock::iterator It = Inst; + BasicBlock::iterator It = Inst->getIterator(); HasPrevInstruction = (It != (Inst->getParent()->begin())); if (HasPrevInstruction) - Point.PrevInst = --It; + Point.PrevInst = &*--It; else Point.BB = Inst->getParent(); } @@ -1188,7 +3182,7 @@ class TypePromotionTransaction { Inst->removeFromParent(); Inst->insertAfter(Point.PrevInst); } else { - Instruction *Position = Point.BB->getFirstInsertionPt(); + Instruction *Position = &*Point.BB->getFirstInsertionPt(); if (Inst->getParent()) Inst->moveBefore(Position); else @@ -1261,7 +3255,7 @@ class TypePromotionTransaction { Value *Val = Inst->getOperand(It); OriginalValues.push_back(Val); // Set a dummy one. - // We could use OperandSetter here, but that would implied an overhead + // We could use OperandSetter here, but that would imply an overhead // that we are not willing to pay. Inst->setOperand(It, UndefValue::get(Val->getType())); } @@ -1435,7 +3429,7 @@ class TypePromotionTransaction { Inst->removeFromParent(); } - ~InstructionRemover() { delete Replacer; } + ~InstructionRemover() override { delete Replacer; } /// \brief Really remove the instruction. void commit() override { delete Inst; } @@ -1565,86 +3559,91 @@ void TypePromotionTransaction::rollback( /// This encapsulates the logic for matching the target-legal addressing modes. class AddressingModeMatcher { SmallVectorImpl &AddrModeInsts; + const TargetMachine &TM; const TargetLowering &TLI; + const DataLayout &DL; /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and /// the memory instruction that we're computing this address for. Type *AccessTy; + unsigned AddrSpace; Instruction *MemoryInst; - /// AddrMode - This is the addressing mode that we're building up. This is + /// This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; - /// The truncate instruction inserted by other CodeGenPrepare optimizations. - const SetOfInstrs &InsertedTruncs; + /// The instructions inserted by other CodeGenPrepare optimizations. + const SetOfInstrs &InsertedInsts; /// A map from the instructions to their type before promotion. InstrToOrigTy &PromotedInsts; /// The ongoing transaction where every action should be registered. TypePromotionTransaction &TPT; - /// IgnoreProfitability - This is set to true when we should not do - /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode - /// always returns true. + /// This is set to true when we should not do profitability checks. + /// When true, IsProfitableToFoldIntoAddressingMode always returns true. 
bool IgnoreProfitability; - AddressingModeMatcher(SmallVectorImpl &AMI, - const TargetLowering &T, Type *AT, + AddressingModeMatcher(SmallVectorImpl &AMI, + const TargetMachine &TM, Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, - const SetOfInstrs &InsertedTruncs, + const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) - : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM), - InsertedTruncs(InsertedTruncs), PromotedInsts(PromotedInsts), TPT(TPT) { + : AddrModeInsts(AMI), TM(TM), + TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) + ->getTargetLowering()), + DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), + MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), + PromotedInsts(PromotedInsts), TPT(TPT) { IgnoreProfitability = false; } public: - /// Match - Find the maximal addressing mode that a load/store of V can fold, + /// Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. - /// \p InsertedTruncs The truncate instruction inserted by other - /// CodeGenPrepare + /// \p InsertedInsts The instructions inserted by other CodeGenPrepare /// optimizations. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p The ongoing transaction where every action should be registered. - static ExtAddrMode Match(Value *V, Type *AccessTy, + static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, SmallVectorImpl &AddrModeInsts, - const TargetLowering &TLI, - const SetOfInstrs &InsertedTruncs, + const TargetMachine &TM, + const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) { ExtAddrMode Result; - bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, - MemoryInst, Result, InsertedTruncs, - PromotedInsts, TPT).MatchAddr(V, 0); + bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, + MemoryInst, Result, InsertedInsts, + PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; } private: - bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); - bool MatchAddr(Value *V, unsigned Depth); - bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, + bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); + bool matchAddr(Value *V, unsigned Depth); + bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, bool *MovedAway = nullptr); - bool IsProfitableToFoldIntoAddressingMode(Instruction *I, + bool isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter); - bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); - bool IsPromotionProfitable(unsigned MatchedSize, unsigned SizeWithPromotion, + bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); + bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const; }; -/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. +/// Try adding ScaleReg*Scale to the current addressing mode. /// Return true and update AddrMode if this addr mode is legal for the target, /// false if not. 
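/// For illustration (hypothetical IR): with BaseReg already holding %base, an
/// index computed as
///   %scaled = mul i64 %idx, 4
/// can be folded with ScaleReg = %idx and Scale = 4, provided the target
/// reports base + reg*4 as a legal addressing mode for the access type.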
-bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, +bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth) { // If Scale is 1, then this is the same as adding ScaleReg to the addressing // mode. Just process that directly. if (Scale == 1) - return MatchAddr(ScaleReg, Depth); + return matchAddr(ScaleReg, Depth); // If the scale is 0, it takes nothing to add this. if (Scale == 0) @@ -1663,7 +3662,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, TestAddrMode.ScaledReg = ScaleReg; // If the new address isn't legal, bail out. - if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) + if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) return false; // It was legal, so commit it. @@ -1680,7 +3679,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // If this addressing mode is legal, commit it and remember that we folded // this instruction. - if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { + if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) { AddrModeInsts.push_back(cast(ScaleReg)); AddrMode = TestAddrMode; return true; @@ -1691,9 +3690,9 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, return true; } -/// MightBeFoldableInst - This is a little filter, which returns true if an -/// addressing computation involving I might be folded into a load/store -/// accessing it. This doesn't need to be perfect, but needs to accept at least +/// This is a little filter, which returns true if an addressing computation +/// involving I might be folded into a load/store accessing it. +/// This doesn't need to be perfect, but needs to accept at least /// the set of instructions that MatchOperationAddr can. static bool MightBeFoldableInst(Instruction *I) { switch (I->getOpcode()) { @@ -1722,6 +3721,24 @@ static bool MightBeFoldableInst(Instruction *I) { } } +/// \brief Check whether or not \p Val is a legal instruction for \p TLI. +/// \note \p Val is assumed to be the product of some type promotion. +/// Therefore if \p Val has an undefined state in \p TLI, this is assumed +/// to be legal, as the non-promoted value would have had the same state. +static bool isPromotedInstructionLegal(const TargetLowering &TLI, + const DataLayout &DL, Value *Val) { + Instruction *PromotedInst = dyn_cast(Val); + if (!PromotedInst) + return false; + int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); + // If the ISDOpcode is undefined, it was undefined before the promotion. + if (!ISDOpcode) + return true; + // Otherwise, check if the promoted instruction is legal or not. + return TLI.isOperationLegalOrCustom( + ISDOpcode, TLI.getValueType(DL, PromotedInst->getType())); +} + /// \brief Hepler class to perform type promotion. class TypePromotionHelper { /// \brief Utility function to check whether or not a sign or zero extension @@ -1741,66 +3758,79 @@ class TypePromotionHelper { /// \brief Utility function to determine if \p OpIdx should be promoted when /// promoting \p Inst. static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { - if (isa(Inst) && OpIdx == 0) - return false; - return true; + return !(isa(Inst) && OpIdx == 0); } /// \brief Utility function to promote the operand of \p Ext when this /// operand is a promotable trunc or sext or zext. /// \p PromotedInsts maps the instructions to their type before promotion. 
- /// \p CreatedInsts[out] contains how many non-free instructions have been + /// \p CreatedInstsCost[out] contains the cost of all instructions /// created to promote the operand of Ext. + /// Newly added extensions are inserted in \p Exts. + /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. /// \return The promoted value which is used instead of Ext. - static Value *promoteOperandForTruncAndAnyExt(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts); + static Value *promoteOperandForTruncAndAnyExt( + Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, const TargetLowering &TLI); /// \brief Utility function to promote the operand of \p Ext when this /// operand is promotable and is not a supported trunc or sext. /// \p PromotedInsts maps the instructions to their type before promotion. - /// \p CreatedInsts[out] contains how many non-free instructions have been + /// \p CreatedInstsCost[out] contains the cost of all the instructions /// created to promote the operand of Ext. + /// Newly added extensions are inserted in \p Exts. + /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. /// \return The promoted value which is used instead of Ext. static Value *promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts, bool IsSExt); + unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, + const TargetLowering &TLI, bool IsSExt); /// \see promoteOperandForOther. - static Value *signExtendOperandForOther(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts) { - return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, true); + static Value *signExtendOperandForOther( + Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, const TargetLowering &TLI) { + return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, + Exts, Truncs, TLI, true); } /// \see promoteOperandForOther. - static Value *zeroExtendOperandForOther(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts) { - return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, false); + static Value *zeroExtendOperandForOther( + Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, const TargetLowering &TLI) { + return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, + Exts, Truncs, TLI, false); } public: /// Type for the utility function that promotes the operand of Ext. typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts); + unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, + const TargetLowering &TLI); /// \brief Given a sign/zero extend instruction \p Ext, return the approriate /// action to promote the operand of \p Ext instead of using Ext. /// \return NULL if no promotable action is possible with the current /// sign extension. 
- /// \p InsertedTruncs keeps track of all the truncate instructions inserted by - /// the others CodeGenPrepare optimizations. This information is important + /// \p InsertedInsts keeps track of all the instructions inserted by the + /// other CodeGenPrepare optimizations. This information is important /// because we do not want to promote these instructions as CodeGenPrepare /// will reinsert them later. Thus creating an infinite loop: create/remove. /// \p PromotedInsts maps the instructions to their type before promotion. - static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedTruncs, + static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts, const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts); }; @@ -1809,6 +3839,12 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, Type *ConsideredExtType, const InstrToOrigTy &PromotedInsts, bool IsSExt) { + // The promotion helper does not know how to deal with vector types yet. + // To be able to fix that, we would need to fix the places where we + // statically extend, e.g., constants and such. + if (Inst->getType()->isVectorTy()) + return false; + // We can always get through zext. if (isa(Inst)) return true; @@ -1832,10 +3868,10 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, Value *OpndVal = Inst->getOperand(0); // Check if we can use this operand in the extension. - // If the type is larger than the result type of the extension, - // we cannot. - if (OpndVal->getType()->getIntegerBitWidth() > - ConsideredExtType->getIntegerBitWidth()) + // If the type is larger than the result type of the extension, we cannot. + if (!OpndVal->getType()->isIntegerTy() || + OpndVal->getType()->getIntegerBitWidth() > + ConsideredExtType->getIntegerBitWidth()) return false; // If the operand of the truncate is not an instruction, we will not have @@ -1851,22 +3887,20 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // #1 get the type of the operand and check the kind of the extended bits. const Type *OpndType; InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); - if (It != PromotedInsts.end() && It->second.IsSExt == IsSExt) - OpndType = It->second.Ty; + if (It != PromotedInsts.end() && It->second.getInt() == IsSExt) + OpndType = It->second.getPointer(); else if ((IsSExt && isa(Opnd)) || (!IsSExt && isa(Opnd))) OpndType = Opnd->getOperand(0)->getType(); else return false; - // #2 check that the truncate just drop extended bits. - if (Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth()) - return true; - - return false; + // #2 check that the truncate just drops extended bits. + return Inst->getType()->getIntegerBitWidth() >= + OpndType->getIntegerBitWidth(); } TypePromotionHelper::Action TypePromotionHelper::getAction( - Instruction *Ext, const SetOfInstrs &InsertedTruncs, + Instruction *Ext, const SetOfInstrs &InsertedInsts, const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) { assert((isa(Ext) || isa(Ext)) && "Unexpected instruction type"); @@ -1882,7 +3916,7 @@ TypePromotionHelper::Action TypePromotionHelper::getAction( // Do not promote if the operand has been added by codegenprepare. // Otherwise, it means we are undoing an optimization that is likely to be // redone, thus causing potential infinite loop. - if (isa(ExtOpnd) && InsertedTruncs.count(ExtOpnd)) + if (isa(ExtOpnd) && InsertedInsts.count(ExtOpnd)) return nullptr; // SExt or Trunc instructions. 
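// For illustration (hypothetical IR): the rewrite this promotion machinery
// enables hoists an extension above a promotable operation so the ext can
// later be folded into the load, e.g.
//
//   %ld  = load i8, i8* %p
//   %add = add nsw i8 %ld, 1
//   %res = sext i8 %add to i64
//
// becomes
//
//   %ld   = load i8, i8* %p
//   %ld.e = sext i8 %ld to i64
//   %res  = add nsw i64 %ld.e, 1
//
// and is only kept when the promoted operation stays legal for the target and
// the newly created extensions are no more costly than the ones they replace.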
@@ -1900,14 +3934,18 @@ TypePromotionHelper::Action TypePromotionHelper::getAction( Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( llvm::Instruction *SExt, TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts) { + InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, const TargetLowering &TLI) { // By construction, the operand of SExt is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *SExtOpnd = cast(SExt->getOperand(0)); Value *ExtVal = SExt; + bool HasMergedNonFreeExt = false; if (isa(SExtOpnd)) { // Replace s|zext(zext(opnd)) // => zext(opnd). + HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd); Value *ZExt = TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType()); TPT.replaceAllUsesWith(SExt, ZExt); @@ -1918,7 +3956,7 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( // => z|sext(opnd). TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); } - CreatedInsts = 0; + CreatedInstsCost = 0; // Remove dead code. if (SExtOpnd->use_empty()) @@ -1926,8 +3964,14 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( // Check if the extension is still needed. Instruction *ExtInst = dyn_cast(ExtVal); - if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) + if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) { + if (ExtInst) { + if (Exts) + Exts->push_back(ExtInst); + CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt; + } return ExtVal; + } // At this point we have: ext ty opnd to ty. // Reassign the uses of ExtInst to the opnd and remove ExtInst. @@ -1938,11 +3982,14 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( Value *TypePromotionHelper::promoteOperandForOther( Instruction *Ext, TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, bool IsSExt) { + InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, + SmallVectorImpl *Exts, + SmallVectorImpl *Truncs, const TargetLowering &TLI, + bool IsSExt) { // By construction, the operand of Ext is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *ExtOpnd = cast(Ext->getOperand(0)); - CreatedInsts = 0; + CreatedInstsCost = 0; if (!ExtOpnd->hasOneUse()) { // ExtOpnd will be promoted. // All its uses, but Ext, will need to use a truncated value of the @@ -1953,10 +4000,12 @@ Value *TypePromotionHelper::promoteOperandForOther( ITrunc->removeFromParent(); // Insert it just after the definition. ITrunc->insertAfter(ExtOpnd); + if (Truncs) + Truncs->push_back(ITrunc); } TPT.replaceAllUsesWith(ExtOpnd, Trunc); - // Restore the operand of Ext (which has been replace by the previous call + // Restore the operand of Ext (which has been replaced by the previous call // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. TPT.setOperand(Ext, 0, ExtOpnd); } @@ -2008,17 +4057,22 @@ Value *TypePromotionHelper::promoteOperandForOther( if (!ExtForOpnd) { // If yes, create a new one. DEBUG(dbgs() << "More operands to ext\n"); - ExtForOpnd = - cast(IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType()) - : TPT.createZExt(Ext, Opnd, Ext->getType())); - ++CreatedInsts; + Value *ValForExtOpnd = IsSExt ? 
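Illustrative sketch (not part of the patch): the s|zext(zext(opnd)) => zext(opnd) rewrite above is value-preserving because the zero-extended intermediate always has a clear sign bit. A small standalone C++ check of the equivalence, using unsigned char / unsigned int / long long as stand-ins for i8 / i32 / i64:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t  c = 0xAB;
      uint32_t z = c;                          // zext i8 -> i32
      int64_t  s = static_cast<int64_t>(z);    // sext i32 -> i64; the sign bit of z is known zero
      int64_t  direct = static_cast<int64_t>(static_cast<uint64_t>(c));  // zext i8 -> i64
      assert(s == direct);                     // sext(zext(c)) == zext(c)
      return 0;
    }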
TPT.createSExt(Ext, Opnd, Ext->getType()) + : TPT.createZExt(Ext, Opnd, Ext->getType()); + if (!isa(ValForExtOpnd)) { + TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); + continue; + } + ExtForOpnd = cast(ValForExtOpnd); } - + if (Exts) + Exts->push_back(ExtForOpnd); TPT.setOperand(ExtForOpnd, 0, Opnd); // Move the sign extension before the insertion point. TPT.moveBefore(ExtForOpnd, ExtOpnd); TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd); + CreatedInstsCost += !TLI.isExtFree(ExtForOpnd); // If more sext are required, new instructions will have to be created. ExtForOpnd = nullptr; } @@ -2029,43 +4083,33 @@ Value *TypePromotionHelper::promoteOperandForOther( return ExtOpnd; } -/// IsPromotionProfitable - Check whether or not promoting an instruction -/// to a wider type was profitable. -/// \p MatchedSize gives the number of instructions that have been matched -/// in the addressing mode after the promotion was applied. -/// \p SizeWithPromotion gives the number of created instructions for -/// the promotion plus the number of instructions that have been -/// matched in the addressing mode before the promotion. +/// Check whether or not promoting an instruction to a wider type is profitable. +/// \p NewCost gives the cost of extension instructions created by the +/// promotion. +/// \p OldCost gives the cost of extension instructions before the promotion +/// plus the number of instructions that have been +/// matched in the addressing mode the promotion. /// \p PromotedOperand is the value that has been promoted. /// \return True if the promotion is profitable, false otherwise. -bool -AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize, - unsigned SizeWithPromotion, - Value *PromotedOperand) const { - // We folded less instructions than what we created to promote the operand. +bool AddressingModeMatcher::isPromotionProfitable( + unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const { + DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n'); + // The cost of the new extensions is greater than the cost of the + // old extension plus what we folded. // This is not profitable. - if (MatchedSize < SizeWithPromotion) + if (NewCost > OldCost) return false; - if (MatchedSize > SizeWithPromotion) + if (NewCost < OldCost) return true; // The promotion is neutral but it may help folding the sign extension in // loads for instance. // Check that we did not create an illegal instruction. - Instruction *PromotedInst = dyn_cast(PromotedOperand); - if (!PromotedInst) - return false; - int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); - // If the ISDOpcode is undefined, it was undefined before the promotion. - if (!ISDOpcode) - return true; - // Otherwise, check if the promoted instruction is legal or not. - return TLI.isOperationLegalOrCustom( - ISDOpcode, TLI.getValueType(PromotedInst->getType())); + return isPromotedInstructionLegal(TLI, DL, PromotedOperand); } -/// MatchOperationAddr - Given an instruction or constant expr, see if we can -/// fold the operation into the addressing mode. If so, update the addressing -/// mode and return true, otherwise return false without modifying AddrMode. +/// Given an instruction or constant expr, see if we can fold the operation +/// into the addressing mode. If so, update the addressing mode and return +/// true, otherwise return false without modifying AddrMode. 
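Illustrative sketch (not part of the patch): the profitability rule above compares the extension cost created by the promotion against the extension cost it removed plus what it let the addressing mode absorb, and falls back to a legality check on a tie. A minimal standalone model of that decision; isPromotedInstLegal is a hypothetical stand-in for the target legality query:

    #include <iostream>

    // Hypothetical stand-in for asking the target whether the promoted
    // instruction is legal or custom-lowered.
    static bool isPromotedInstLegal() { return true; }

    // NewCost: cost of the extensions created by the promotion.
    // OldCost: cost of the removed extension plus the instructions folded
    //          into the addressing mode thanks to the promotion.
    static bool isPromotionProfitable(unsigned NewCost, unsigned OldCost) {
      if (NewCost > OldCost)
        return false;            // we created more than we saved
      if (NewCost < OldCost)
        return true;             // net win
      // Neutral: keep it only if it does not introduce an illegal
      // instruction; it may still help fold the extension into a load later.
      return isPromotedInstLegal();
    }

    int main() {
      std::cout << isPromotionProfitable(2, 1) << ' '    // 0: not profitable
                << isPromotionProfitable(1, 2) << ' '    // 1: profitable
                << isPromotionProfitable(1, 1) << '\n';  // legality decides the tie
      return 0;
    }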
/// If \p MovedAway is not NULL, it contains the information of whether or /// not AddrInst has to be folded into the addressing mode on success. /// If \p MovedAway == true, \p AddrInst will not be part of the addressing @@ -2074,7 +4118,7 @@ AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize, /// This state can happen when AddrInst is a sext, since it may be moved away. /// Therefore, AddrInst may not be valid when MovedAway is true and it must /// not be referenced anymore. -bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, +bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth, bool *MovedAway) { // Avoid exponential behavior on extremely deep expression trees. @@ -2087,15 +4131,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, switch (Opcode) { case Instruction::PtrToInt: // PtrToInt is always a noop, as we know that the int type is pointer sized. - return MatchAddr(AddrInst->getOperand(0), Depth); - case Instruction::IntToPtr: + return matchAddr(AddrInst->getOperand(0), Depth); + case Instruction::IntToPtr: { + auto AS = AddrInst->getType()->getPointerAddressSpace(); + auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS)); // This inttoptr is a no-op if the integer type is pointer sized. - if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace())) - return MatchAddr(AddrInst->getOperand(0), Depth); + if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy) + return matchAddr(AddrInst->getOperand(0), Depth); return false; + } case Instruction::BitCast: - case Instruction::AddrSpaceCast: // BitCast is always a noop, and we can handle it as long as it is // int->int or pointer->pointer (we don't want int<->fp or something). if ((AddrInst->getOperand(0)->getType()->isPointerTy() || @@ -2104,8 +4149,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // and we don't want to mess around with them. Assume it knows what it // is doing. AddrInst->getOperand(0)->getType() != AddrInst->getType()) - return MatchAddr(AddrInst->getOperand(0), Depth); + return matchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::AddrSpaceCast: { + unsigned SrcAS + = AddrInst->getOperand(0)->getType()->getPointerAddressSpace(); + unsigned DestAS = AddrInst->getType()->getPointerAddressSpace(); + if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) + return matchAddr(AddrInst->getOperand(0), Depth); return false; + } case Instruction::Add: { // Check to see if we can merge in the RHS then the LHS. If so, we win. ExtAddrMode BackupAddrMode = AddrMode; @@ -2117,8 +4170,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - if (MatchAddr(AddrInst->getOperand(1), Depth+1) && - MatchAddr(AddrInst->getOperand(0), Depth+1)) + if (matchAddr(AddrInst->getOperand(1), Depth+1) && + matchAddr(AddrInst->getOperand(0), Depth+1)) return true; // Restore the old addr mode info. @@ -2127,8 +4180,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, TPT.rollback(LastKnownGood); // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. 
- if (MatchAddr(AddrInst->getOperand(0), Depth+1) && - MatchAddr(AddrInst->getOperand(1), Depth+1)) + if (matchAddr(AddrInst->getOperand(0), Depth+1) && + matchAddr(AddrInst->getOperand(1), Depth+1)) return true; // Otherwise we definitely can't merge the ADD in. @@ -2150,7 +4203,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, if (Opcode == Instruction::Shl) Scale = 1LL << Scale; - return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); + return matchScaledValue(AddrInst->getOperand(0), Scale, Depth); } case Instruction::GetElementPtr: { // Scan the GEP. We check it if it contains constant offsets and at most @@ -2159,16 +4212,15 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, unsigned VariableScale = 0; int64_t ConstantOffset = 0; - const DataLayout *TD = TLI.getDataLayout(); gep_type_iterator GTI = gep_type_begin(AddrInst); for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { if (StructType *STy = dyn_cast(*GTI)) { - const StructLayout *SL = TD->getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); unsigned Idx = cast(AddrInst->getOperand(i))->getZExtValue(); ConstantOffset += SL->getElementOffset(Idx); } else { - uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); + uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); if (ConstantInt *CI = dyn_cast(AddrInst->getOperand(i))) { ConstantOffset += CI->getSExtValue()*TypeSize; } else if (TypeSize) { // Scales of zero don't do anything. @@ -2187,9 +4239,10 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // just add it to the disp field and check validity. if (VariableOperand == -1) { AddrMode.BaseOffs += ConstantOffset; - if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ + if (ConstantOffset == 0 || + TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) { // Check to see if we can fold the base pointer in too. - if (MatchAddr(AddrInst->getOperand(0), Depth+1)) + if (matchAddr(AddrInst->getOperand(0), Depth+1)) return true; } AddrMode.BaseOffs -= ConstantOffset; @@ -2204,7 +4257,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, AddrMode.BaseOffs += ConstantOffset; // Match the base operand of the GEP. - if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { + if (!matchAddr(AddrInst->getOperand(0), Depth+1)) { // If it couldn't be matched, just stuff the value in a register. if (AddrMode.HasBaseReg) { AddrMode = BackupAddrMode; @@ -2216,7 +4269,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, } // Match the remaining variable portion of the GEP. - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, + if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, Depth)) { // If it couldn't be matched, try stuffing the base into a register // instead of matching it, and retrying the match of the scale. @@ -2227,7 +4280,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, AddrMode.HasBaseReg = true; AddrMode.BaseReg = AddrInst->getOperand(0); AddrMode.BaseOffs += ConstantOffset; - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), + if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, Depth)) { // If even that didn't work, bail. 
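Illustrative sketch (not part of the patch): the GetElementPtr case above folds all constant indices into a single byte offset and tolerates at most one variable index, whose element size becomes the scale. A simplified standalone model of that bookkeeping over a flat index list; ElemSize plays the role of DL.getTypeAllocSize, and struct-field offsets (the StructLayout path) are elided for brevity:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct GEPIndex {
      bool     IsConstant;  // constant index operand?
      int64_t  Value;       // the constant value (ignored if !IsConstant)
      uint64_t ElemSize;    // byte size of the indexed element type
    };

    // Returns false if more than one variable index is seen (the matcher bails).
    static bool accumulate(const std::vector<GEPIndex> &Idxs, int64_t &ConstOff,
                           uint64_t &VarScale, bool &HasVar) {
      ConstOff = 0; VarScale = 0; HasVar = false;
      for (const GEPIndex &I : Idxs) {
        if (I.IsConstant) {
          ConstOff += I.Value * static_cast<int64_t>(I.ElemSize);
        } else if (I.ElemSize) {     // scales of zero do nothing
          if (HasVar) return false;  // a second variable index: give up
          HasVar = true;
          VarScale = I.ElemSize;
        }
      }
      return true;
    }

    int main() {
      // Model of: gep [4 x i32]* %base, i64 2, i64 %i  ->  offset 32, scale 4.
      std::vector<GEPIndex> Idxs = {{true, 2, 16}, {false, 0, 4}};
      int64_t Off; uint64_t Scale; bool HasVar;
      if (accumulate(Idxs, Off, Scale, HasVar))
        std::cout << "offset=" << Off << " scale=" << Scale << '\n';  // offset=32 scale=4
      return 0;
    }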
AddrMode = BackupAddrMode; @@ -2247,14 +4300,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // Try to move this ext out of the way of the addressing mode. // Ask for a method for doing so. TypePromotionHelper::Action TPH = - TypePromotionHelper::getAction(Ext, InsertedTruncs, TLI, PromotedInsts); + TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts); if (!TPH) return false; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - unsigned CreatedInsts = 0; - Value *PromotedOperand = TPH(Ext, TPT, PromotedInsts, CreatedInsts); + unsigned CreatedInstsCost = 0; + unsigned ExtCost = !TLI.isExtFree(Ext); + Value *PromotedOperand = + TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI); // SExt has been moved away. // Thus either it will be rematched later in the recursive calls or it is // gone. Anyway, we must not fold it into the addressing mode at this point. @@ -2275,8 +4330,13 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); - if (!MatchAddr(PromotedOperand, Depth) || - !IsPromotionProfitable(AddrModeInsts.size(), OldSize + CreatedInsts, + if (!matchAddr(PromotedOperand, Depth) || + // The total of the new cost is equal to the cost of the created + // instructions. + // The total of the old cost is equal to the cost of the extension plus + // what we have saved in the addressing mode. + !isPromotionProfitable(CreatedInstsCost, + ExtCost + (AddrModeInsts.size() - OldSize), PromotedOperand)) { AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); @@ -2290,12 +4350,12 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, return false; } -/// MatchAddr - If we can, try to add the value of 'Addr' into the current -/// addressing mode. If Addr can't be added to AddrMode this returns false and -/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type -/// or intptr_t for the target. +/// If we can, try to add the value of 'Addr' into the current addressing mode. +/// If Addr can't be added to AddrMode this returns false and leaves AddrMode +/// unmodified. This assumes that Addr is either a pointer type or intptr_t +/// for the target. /// -bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { +bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { // Start a transaction at this point that we will rollback if the matching // fails. TypePromotionTransaction::ConstRestorationPt LastKnownGood = @@ -2303,14 +4363,14 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { if (ConstantInt *CI = dyn_cast(Addr)) { // Fold in immediates if legal for the target. AddrMode.BaseOffs += CI->getSExtValue(); - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.BaseOffs -= CI->getSExtValue(); } else if (GlobalValue *GV = dyn_cast(Addr)) { // If this is a global variable, try to fold it into the addressing mode. if (!AddrMode.BaseGV) { AddrMode.BaseGV = GV; - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.BaseGV = nullptr; } @@ -2320,8 +4380,8 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { // Check to see if it is possible to fold this operation. 
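Illustrative sketch (not part of the patch): the extension handling above, like the rest of the matcher, leans on TypePromotionTransaction's take-a-restoration-point / rollback-on-failure discipline. A minimal standalone model of that pattern, recording undo callbacks and replaying them in reverse when a speculative match does not pay off; the class name and API are simplifications, not the real LLVM interface:

    #include <functional>
    #include <iostream>
    #include <vector>

    class MiniTransaction {
      std::vector<std::function<void()>> Undo;   // one undo action per change
    public:
      using RestorationPt = size_t;
      RestorationPt getRestorationPoint() const { return Undo.size(); }
      void record(std::function<void()> U) { Undo.push_back(std::move(U)); }
      void rollback(RestorationPt Pt) {          // undo everything after Pt, newest first
        while (Undo.size() > Pt) { Undo.back()(); Undo.pop_back(); }
      }
      void commit() { Undo.clear(); }            // keep all changes
    };

    int main() {
      int X = 1;
      MiniTransaction TPT;
      auto Pt = TPT.getRestorationPoint();
      int Old = X; X = 42;                       // speculative change
      TPT.record([&X, Old] { X = Old; });
      bool Profitable = false;                   // pretend the match failed
      if (!Profitable) TPT.rollback(Pt);
      std::cout << X << '\n';                    // 1: the change was undone
      return 0;
    }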
bool MovedAway = false; - if (MatchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { - // This instruction may have been move away. If so, there is nothing + if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { + // This instruction may have been moved away. If so, there is nothing // to check here. if (MovedAway) return true; @@ -2329,7 +4389,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { // *profitable* to do so. We use a simple cost model to avoid increasing // register pressure too much. if (I->hasOneUse() || - IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { + isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { AddrModeInsts.push_back(I); return true; } @@ -2341,7 +4401,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { TPT.rollback(LastKnownGood); } } else if (ConstantExpr *CE = dyn_cast(Addr)) { - if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) + if (matchOperationAddr(CE, CE->getOpcode(), Depth)) return true; TPT.rollback(LastKnownGood); } else if (isa(Addr)) { @@ -2354,7 +4414,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { AddrMode.HasBaseReg = true; AddrMode.BaseReg = Addr; // Still check for legality in case the target supports [imm] but not [i+r]. - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.HasBaseReg = false; AddrMode.BaseReg = nullptr; @@ -2364,7 +4424,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { if (AddrMode.Scale == 0) { AddrMode.Scale = 1; AddrMode.ScaledReg = Addr; - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.Scale = 0; AddrMode.ScaledReg = nullptr; @@ -2374,17 +4434,21 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { return false; } -/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified -/// inline asm call are due to memory operands. If so, return true, otherwise -/// return false. +/// Check to see if all uses of OpVal by the specified inline asm call are due +/// to memory operands. If so, return true, otherwise return false. static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetLowering &TLI) { - TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); + const TargetMachine &TM) { + const Function *F = CI->getParent()->getParent(); + const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); + const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); + TargetLowering::AsmOperandInfoVector TargetConstraints = + TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, + ImmutableCallSite(CI)); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, SDValue()); + TLI->ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! @@ -2397,13 +4461,13 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, return true; } -/// FindAllMemoryUses - Recursively walk all the uses of I until we find a -/// memory use. If we find an obviously non-foldable instruction, return true. 
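Illustrative sketch (not part of the patch): matchAddr above greedily tries to absorb Addr into the [BaseGV + BaseReg + Scale*ScaledReg + BaseOffs] form, undoing each attempt the target rejects. A simplified standalone model of one such probe, with a hypothetical legality predicate standing in for TLI.isLegalAddressingMode:

    #include <cstdint>
    #include <iostream>

    struct MiniAddrMode {
      int64_t BaseOffs = 0;
      bool    HasBaseReg = false;
      int64_t Scale = 0;           // 0 means "no scaled register yet"
    };

    // Hypothetical target rule: small signed offsets, scales of 1, 2, 4 or 8.
    static bool isLegal(const MiniAddrMode &AM) {
      bool ScaleOK = AM.Scale == 0 || AM.Scale == 1 || AM.Scale == 2 ||
                     AM.Scale == 4 || AM.Scale == 8;
      return ScaleOK && AM.BaseOffs >= -4096 && AM.BaseOffs < 4096;
    }

    // Try to fold a constant offset; restore the mode if the target says no,
    // exactly as matchAddr does with BaseOffs.
    static bool matchConstant(MiniAddrMode &AM, int64_t C) {
      AM.BaseOffs += C;
      if (isLegal(AM)) return true;
      AM.BaseOffs -= C;
      return false;
    }

    int main() {
      MiniAddrMode AM;
      bool A = matchConstant(AM, 16);        // folded into BaseOffs
      bool B = matchConstant(AM, 1 << 20);   // too large: rolled back
      std::cout << A << ' ' << B << ' ' << AM.BaseOffs << '\n';  // 1 0 16
      return 0;
    }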
+/// Recursively walk all the uses of I until we find a memory use. +/// If we find an obviously non-foldable instruction, return true. /// Add the ultimately found memory instructions to MemoryUses. -static bool FindAllMemoryUses(Instruction *I, - SmallVectorImpl > &MemoryUses, - SmallPtrSetImpl &ConsideredInsts, - const TargetLowering &TLI) { +static bool FindAllMemoryUses( + Instruction *I, + SmallVectorImpl> &MemoryUses, + SmallPtrSetImpl &ConsideredInsts, const TargetMachine &TM) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -2433,23 +4497,23 @@ static bool FindAllMemoryUses(Instruction *I, if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. - if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) + if (!IsOperandAMemoryOperand(CI, IA, I, TM)) return true; continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) return true; } return false; } -/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at -/// the use site that we're folding it into. If so, there is no cost to -/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values -/// that we know are live at the instruction already. -bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, +/// Return true if Val is already known to be live at the use site that we're +/// folding it into. If so, there is no cost to include it in the addressing +/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the +/// instruction already. +bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, Value *KnownLive2) { // If Val is either of the known-live values, we know it is live! if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) @@ -2471,11 +4535,11 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, return Val->isUsedInBasicBlock(MemoryInst->getParent()); } -/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing -/// mode of the machine to fold the specified instruction into a load or store -/// that ultimately uses it. However, the specified instruction has multiple -/// uses. Given this, it may actually increase register pressure to fold it -/// into the load. For example, consider this code: +/// It is possible for the addressing mode of the machine to fold the specified +/// instruction into a load or store that ultimately uses it. +/// However, the specified instruction has multiple uses. +/// Given this, it may actually increase register pressure to fold it +/// into the load. For example, consider this code: /// /// X = ... /// Y = X+1 @@ -2493,7 +4557,7 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, /// X was live across 'load Z' for other reasons, we actually *would* want to /// fold the addressing mode in the Z case. This would make Y die earlier. bool AddressingModeMatcher:: -IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, +isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; @@ -2510,9 +4574,9 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. 
- if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) BaseReg = nullptr; - if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) ScaledReg = nullptr; // If folding this instruction (and it's subexprs) didn't extend any live @@ -2526,7 +4590,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // uses. SmallVector, 16> MemoryUses; SmallPtrSet ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) return false; // Has a non-memory, non-foldable use! // Now that we know that all uses of this instruction are part of a chain of @@ -2541,9 +4605,11 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // Get the access type of this use. If the use isn't a pointer, we don't // know what it accesses. Value *Address = User->getOperand(OpNo); - if (!Address->getType()->isPointerTy()) + PointerType *AddrTy = dyn_cast(Address->getType()); + if (!AddrTy) return false; - Type *AddressAccessTy = Address->getType()->getPointerElementType(); + Type *AddressAccessTy = AddrTy->getElementType(); + unsigned AS = AddrTy->getAddressSpace(); // Do a match against the root of this address, ignoring profitability. This // will tell us if the addressing mode for the memory operation will @@ -2551,11 +4617,11 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode Result; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, - MemoryInst, Result, InsertedTruncs, + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, + MemoryInst, Result, InsertedInsts, PromotedInsts, TPT); Matcher.IgnoreProfitability = true; - bool Success = Matcher.MatchAddr(Address, 0); + bool Success = Matcher.matchAddr(Address, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); // The match was to check the profitability, the changes made are not @@ -2576,7 +4642,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, } // end anonymous namespace -/// IsNonLocalValue - Return true if the specified values are defined in a +/// Return true if the specified values are defined in a /// different basic block than BB. static bool IsNonLocalValue(Value *V, BasicBlock *BB) { if (Instruction *I = dyn_cast(V)) @@ -2584,17 +4650,16 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { return false; } -/// OptimizeMemoryInst - Load and Store Instructions often have -/// addressing modes that can do significant amounts of computation. As such, -/// instruction selection will try to get the load or store to do as much -/// computation as possible for the program. The problem is that isel can only -/// see within a single block. As such, we sink as much legal addressing mode -/// stuff into the block as possible. +/// Load and Store Instructions often have addressing modes that can do +/// significant amounts of computation. As such, instruction selection will try +/// to get the load or store to do as much computation as possible for the +/// program. The problem is that isel can only see within a single block. As +/// such, we sink as much legal addressing mode work into the block as possible. 
/// /// This method is used to optimize both load/store and inline asms with memory /// operands. -bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, - Type *AccessTy) { +bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, + Type *AccessTy, unsigned AddrSpace) { Value *Repl = Addr; // Try to collapse single-value PHI nodes. This is necessary to undo @@ -2626,16 +4691,16 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast(V)) { - for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) - worklist.push_back(P->getIncomingValue(i)); + for (Value *IncValue : P->incoming_values()) + worklist.push_back(IncValue); continue; } // For non-PHIs, determine the addressing mode being computed. SmallVector NewAddrModeInsts; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI, InsertedTruncsSet, - PromotedInsts, TPT); + V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, + InsertedInsts, PromotedInsts, TPT); // This check is broken into two cases with very similar code to avoid using // getNumUses() as much as possible. Some values have a lot of uses, so @@ -2709,13 +4774,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); - } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && - TM && TM->getSubtarget().useAA())) { + } else if (AddrSinkUsingGEPs || + (!AddrSinkUsingGEPs.getNumOccurrences() && TM && + TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) + ->useAA())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); - Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); + Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); Value *ResultPtr = nullptr, *ResultIndex = nullptr; // First, find the pointer. @@ -2761,7 +4828,8 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, return false; } else { Type *I8PtrTy = - Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); + Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); + Type *I8Ty = Builder.getInt8Ty(); // Start with the base register. Do this first so that subsequent address // matching finds it last, which will prevent it from trying to match it @@ -2813,7 +4881,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // SDAG consecutive load/store merging. 
if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - ResultPtr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr"); + ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } ResultIndex = V; @@ -2824,7 +4892,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - SunkAddr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr"); + SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } if (SunkAddr->getType() != Addr->getType()) @@ -2833,7 +4901,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); - Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); + Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); Value *Result = nullptr; // Start with the base register. Do this first so that subsequent address @@ -2911,12 +4979,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (Repl->use_empty()) { // This can cause recursive deletion, which can invalidate our iterator. // Use a WeakVH to hold onto it in case this happens. - WeakVH IterHandle(CurInstIterator); + WeakVH IterHandle(&*CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); - if (IterHandle != CurInstIterator) { + if (IterHandle != CurInstIterator.getNodePtrUnchecked()) { // If the iterator instruction was recursively deleted, start over at the // start of the block. CurInstIterator = BB->begin(); @@ -2927,14 +4995,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, return true; } -/// OptimizeInlineAsmInst - If there are any memory operands, use -/// OptimizeMemoryInst to sink their address computing into the block when -/// possible / profitable. -bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { +/// If there are any memory operands, use OptimizeMemoryInst to sink their +/// address computing into the block when possible / profitable. +bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; - TargetLowering::AsmOperandInfoVector - TargetConstraints = TLI->ParseConstraints(CS); + const TargetRegisterInfo *TRI = + TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo(); + TargetLowering::AsmOperandInfoVector TargetConstraints = + TLI->ParseConstraints(*DL, TRI, CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; @@ -2945,7 +5014,7 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.isIndirect) { Value *OpVal = CS->getArgOperand(ArgNo++); - MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType()); + MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u); } else if (OpInfo.Type == InlineAsm::isInput) ArgNo++; } @@ -2953,28 +5022,188 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { return MadeChange; } -/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same -/// basic block as the load, unless conditions are unfavorable. This allows -/// SelectionDAG to fold the extend into the load. 
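Illustrative sketch (not part of the patch): optimizeMemoryInst above re-materializes the matched addressing mode next to the memory instruction as base + scale*index + offset (as a GEP over i8 on the GEP-based path), so instruction selection sees the whole computation in one block. A rough source-level C++ analogue of sinking the address computation to its use; the "unrelated work" comments stand in for the intervening basic blocks:

    #include <cstdint>
    #include <iostream>

    // Before: the address is computed far from the load (think: another block).
    static int before(const int *Base, int64_t I) {
      const int *Addr = Base + 4 * I + 7;   // address computed "elsewhere"
      // ... unrelated work ...
      return *Addr;
    }

    // After: the equivalent of %sunkaddr -- the same base + scale*index + offset
    // computation rebuilt immediately before the memory access.
    static int after(const int *Base, int64_t I) {
      // ... unrelated work ...
      const int *SunkAddr = Base + 4 * I + 7;
      return *SunkAddr;
    }

    int main() {
      int Buf[64];
      for (int J = 0; J < 64; ++J) Buf[J] = J;
      std::cout << before(Buf, 2) << ' ' << after(Buf, 2) << '\n';  // 15 15
      return 0;
    }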
+/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// sign extensions. +static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { + assert(!Inst->use_empty() && "Input must have at least one use"); + const Instruction *FirstUser = cast(*Inst->user_begin()); + bool IsSExt = isa(FirstUser); + Type *ExtTy = FirstUser->getType(); + for (const User *U : Inst->users()) { + const Instruction *UI = cast(U); + if ((IsSExt && !isa(UI)) || (!IsSExt && !isa(UI))) + return false; + Type *CurTy = UI->getType(); + // Same input and output types: Same instruction after CSE. + if (CurTy == ExtTy) + continue; + + // If IsSExt is true, we are in this situation: + // a = Inst + // b = sext ty1 a to ty2 + // c = sext ty1 a to ty3 + // Assuming ty2 is shorter than ty3, this could be turned into: + // a = Inst + // b = sext ty1 a to ty2 + // c = sext ty2 b to ty3 + // However, the last sext is not free. + if (IsSExt) + return false; + + // This is a ZExt, maybe this is free to extend from one type to another. + // In that case, we would not account for a different use. + Type *NarrowTy; + Type *LargeTy; + if (ExtTy->getScalarType()->getIntegerBitWidth() > + CurTy->getScalarType()->getIntegerBitWidth()) { + NarrowTy = CurTy; + LargeTy = ExtTy; + } else { + NarrowTy = ExtTy; + LargeTy = CurTy; + } + + if (!TLI.isZExtFree(NarrowTy, LargeTy)) + return false; + } + // All uses are the same or can be derived from one another for free. + return true; +} + +/// \brief Try to form ExtLd by promoting \p Exts until they reach a +/// load instruction. +/// If an ext(load) can be formed, it is returned via \p LI for the load +/// and \p Inst for the extension. +/// Otherwise LI == nullptr and Inst == nullptr. +/// When some promotion happened, \p TPT contains the proper state to +/// revert them. +/// +/// \return true when promoting was necessary to expose the ext(load) +/// opportunity, false otherwise. +/// +/// Example: +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +/// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \encode +/// Thanks to the promotion, we can match zext(load i32*) to i64. +bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, + LoadInst *&LI, Instruction *&Inst, + const SmallVectorImpl &Exts, + unsigned CreatedInstsCost = 0) { + // Iterate over all the extensions to see if one form an ext(load). + for (auto I : Exts) { + // Check if we directly have ext(load). + if ((LI = dyn_cast(I->getOperand(0)))) { + Inst = I; + // No promotion happened here. + return false; + } + // Check whether or not we want to do any promotion. + if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) + continue; + // Get the action to perform the promotion. + TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( + I, InsertedInsts, *TLI, PromotedInsts); + // Check if we can promote. + if (!TPH) + continue; + // Save the current state. + TypePromotionTransaction::ConstRestorationPt LastKnownGood = + TPT.getRestorationPoint(); + SmallVector NewExts; + unsigned NewCreatedInstsCost = 0; + unsigned ExtCost = !TLI->isExtFree(I); + // Promote. + Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost, + &NewExts, nullptr, *TLI); + assert(PromotedVal && + "TypePromotionHelper should have filtered out those cases"); + + // We would be able to merge only one extension in a load. 
+ // Therefore, if we have more than 1 new extension we heuristically + // cut this search path, because it means we degrade the code quality. + // With exactly 2, the transformation is neutral, because we will merge + // one extension but leave one. However, we optimistically keep going, + // because the new extension may be removed too. + long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; + TotalCreatedInstsCost -= ExtCost; + if (!StressExtLdPromotion && + (TotalCreatedInstsCost > 1 || + !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { + // The promotion is not profitable, rollback to the previous state. + TPT.rollback(LastKnownGood); + continue; + } + // The promotion is profitable. + // Check if it exposes an ext(load). + (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); + if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || + // If we have created a new extension, i.e., now we have two + // extensions. We must make sure one of them is merged with + // the load, otherwise we may degrade the code quality. + (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) + // Promotion happened. + return true; + // If this does not help to expose an ext(load) then, rollback. + TPT.rollback(LastKnownGood); + } + // None of the extension can form an ext(load). + LI = nullptr; + Inst = nullptr; + return false; +} + +/// Move a zext or sext fed by a load into the same basic block as the load, +/// unless conditions are unfavorable. This allows SelectionDAG to fold the +/// extend into the load. +/// \p I[in/out] the extension may be modified during the process if some +/// promotions apply. /// -bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { +bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { + // Try to promote a chain of computation if it allows to form + // an extended load. + TypePromotionTransaction TPT; + TypePromotionTransaction::ConstRestorationPt LastKnownGood = + TPT.getRestorationPoint(); + SmallVector Exts; + Exts.push_back(I); // Look for a load being extended. - LoadInst *LI = dyn_cast(I->getOperand(0)); - if (!LI) return false; + LoadInst *LI = nullptr; + Instruction *OldExt = I; + bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); + if (!LI || !I) { + assert(!HasPromoted && !LI && "If we did not match any load instruction " + "the code must remain the same"); + I = OldExt; + return false; + } // If they're already in the same block, there's nothing to do. - if (LI->getParent() == I->getParent()) + // Make the cheap checks first if we did not promote. + // If we promoted, we need to check if it is indeed profitable. + if (!HasPromoted && LI->getParent() == I->getParent()) return false; - EVT VT = TLI->getValueType(I->getType()); - EVT LoadVT = TLI->getValueType(LI->getType()); + EVT VT = TLI->getValueType(*DL, I->getType()); + EVT LoadVT = TLI->getValueType(*DL, LI->getType()); // If the load has other users and the truncate is not free, this probably // isn't worthwhile. if (!LI->hasOneUse() && TLI && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(I->getType(), LI->getType())) + !TLI->isTruncateFree(I->getType(), LI->getType())) { + I = OldExt; + TPT.rollback(LastKnownGood); return false; + } // Check whether the target supports casts folded into loads. 
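Illustrative sketch (not part of the patch): the search above keeps a running budget, the cost of extensions created on the path minus the extension it hopes to merge into the load, and abandons a path once that exceeds one or the promoted value is illegal. A tiny standalone model of the cutoff (the stress-testing flag is elided):

    #include <iostream>

    // Returns true if this promotion path is still worth exploring.
    // CreatedSoFar: cost accumulated on the path so far.
    // NewCreated:   cost added by the latest promotion step.
    // ExtCost:      cost of the extension we hope to merge with the load.
    static bool keepSearching(unsigned CreatedSoFar, unsigned NewCreated,
                              unsigned ExtCost, bool PromotedIsLegal) {
      long long Total = static_cast<long long>(CreatedSoFar) + NewCreated;
      Total -= ExtCost;   // merging the original ext would pay this back
      return Total <= 1 && PromotedIsLegal;
    }

    int main() {
      std::cout << keepSearching(0, 1, 1, true) << ' '    // 1: neutral, keep going
                << keepSearching(1, 2, 1, true) << '\n';  // 0: too many new extensions
      return 0;
    }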
unsigned LType; @@ -2984,18 +5213,22 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { assert(isa(I) && "Unexpected ext type!"); LType = ISD::SEXTLOAD; } - if (TLI && !TLI->isLoadExtLegal(LType, LoadVT)) + if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) { + I = OldExt; + TPT.rollback(LastKnownGood); return false; + } // Move the extend into the same block as the load, so that SelectionDAG // can fold it. + TPT.commit(); I->removeFromParent(); I->insertAfter(LI); ++NumExtsMoved; return true; } -bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { +bool CodeGenPrepare::optimizeExtUses(Instruction *I) { BasicBlock *DefBB = I->getParent(); // If the result of a {s|z}ext and its source are both live out, rewrite all @@ -3053,8 +5286,9 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { if (!InsertedTrunc) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); - InsertedTruncsSet.insert(InsertedTrunc); + assert(InsertPt != UserBB->end()); + InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); + InsertedInsts.insert(InsertedTrunc); } // Replace a use of the {s|z}ext source with a use of the result. @@ -3066,9 +5300,202 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { return MadeChange; } -/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be -/// turned into an explicit branch. -static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { +// Find loads whose uses only use some of the loaded value's bits. Add an "and" +// just after the load if the target can fold this into one extload instruction, +// with the hope of eliminating some of the other later "and" instructions using +// the loaded value. "and"s that are made trivially redundant by the insertion +// of the new "and" are removed by this function, while others (e.g. those whose +// path from the load goes through a phi) are left for isel to potentially +// remove. +// +// For example: +// +// b0: +// x = load i32 +// ... +// b1: +// y = and x, 0xff +// z = use y +// +// becomes: +// +// b0: +// x = load i32 +// x' = and x, 0xff +// ... +// b1: +// z = use x' +// +// whereas: +// +// b0: +// x1 = load i32 +// ... +// b1: +// x2 = load i32 +// ... +// b2: +// x = phi x1, x2 +// y = and x, 0xff +// +// becomes (after a call to optimizeLoadExt for each load): +// +// b0: +// x1 = load i32 +// x1' = and x1, 0xff +// ... +// b1: +// x2 = load i32 +// x2' = and x2, 0xff +// ... +// b2: +// x = phi x1', x2' +// y = and x, 0xff +// + +bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { + + if (!Load->isSimple() || + !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) + return false; + + // Skip loads we've already transformed or have no reason to transform. + if (Load->hasOneUse()) { + User *LoadUser = *Load->user_begin(); + if (cast(LoadUser)->getParent() == Load->getParent() && + !dyn_cast(LoadUser)) + return false; + } + + // Look at all uses of Load, looking through phis, to determine how many bits + // of the loaded value are needed. 
+ SmallVector WorkList; + SmallPtrSet Visited; + SmallVector AndsToMaybeRemove; + for (auto *U : Load->users()) + WorkList.push_back(cast(U)); + + EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); + unsigned BitWidth = LoadResultVT.getSizeInBits(); + APInt DemandBits(BitWidth, 0); + APInt WidestAndBits(BitWidth, 0); + + while (!WorkList.empty()) { + Instruction *I = WorkList.back(); + WorkList.pop_back(); + + // Break use-def graph loops. + if (!Visited.insert(I).second) + continue; + + // For a PHI node, push all of its users. + if (auto *Phi = dyn_cast(I)) { + for (auto *U : Phi->users()) + WorkList.push_back(cast(U)); + continue; + } + + switch (I->getOpcode()) { + case llvm::Instruction::And: { + auto *AndC = dyn_cast(I->getOperand(1)); + if (!AndC) + return false; + APInt AndBits = AndC->getValue(); + DemandBits |= AndBits; + // Keep track of the widest and mask we see. + if (AndBits.ugt(WidestAndBits)) + WidestAndBits = AndBits; + if (AndBits == WidestAndBits && I->getOperand(0) == Load) + AndsToMaybeRemove.push_back(I); + break; + } + + case llvm::Instruction::Shl: { + auto *ShlC = dyn_cast(I->getOperand(1)); + if (!ShlC) + return false; + uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); + auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); + DemandBits |= ShlDemandBits; + break; + } + + case llvm::Instruction::Trunc: { + EVT TruncVT = TLI->getValueType(*DL, I->getType()); + unsigned TruncBitWidth = TruncVT.getSizeInBits(); + auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); + DemandBits |= TruncBits; + break; + } + + default: + return false; + } + } + + uint32_t ActiveBits = DemandBits.getActiveBits(); + // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the + // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, + // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but + // (and (load x) 1) is not matched as a single instruction, rather as a LDR + // followed by an AND. + // TODO: Look into removing this restriction by fixing backends to either + // return false for isLoadExtLegal for i1 or have them select this pattern to + // a single instruction. + // + // Also avoid hoisting if we didn't see any ands with the exact DemandBits + // mask, since these are the only ands that will be removed by isel. + if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || + WidestAndBits != DemandBits) + return false; + + LLVMContext &Ctx = Load->getType()->getContext(); + Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); + EVT TruncVT = TLI->getValueType(*DL, TruncTy); + + // Reject cases that won't be matched as extloads. + if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || + !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) + return false; + + IRBuilder<> Builder(Load->getNextNode()); + auto *NewAnd = dyn_cast( + Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); + + // Replace all uses of load with new and (except for the use of load in the + // new and itself). + Load->replaceAllUsesWith(NewAnd); + NewAnd->setOperand(0, Load); + + // Remove any and instructions that are now redundant. + for (auto *And : AndsToMaybeRemove) + // Check that the and mask is the same as the one we decided to put on the + // new and. 
+ if (cast(And->getOperand(1))->getValue() == DemandBits) { + And->replaceAllUsesWith(NewAnd); + if (&*CurInstIterator == And) + CurInstIterator = std::next(And->getIterator()); + And->eraseFromParent(); + ++NumAndUses; + } + + ++NumAndsAdded; + return true; +} + +/// Check if V (an operand of a select instruction) is an expensive instruction +/// that is only used once. +static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { + auto *I = dyn_cast(V); + // If it's safe to speculatively execute, then it should not have side + // effects; therefore, it's safe to sink and possibly *not* execute. + return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && + TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; +} + +/// Returns true if a SelectInst should be turned into an explicit branch. +static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, + SelectInst *SI) { // FIXME: This should use the same heuristics as IfConversion to determine // whether a select is better represented as a branch. This requires that // branch probability metadata is preserved for the select, which is not the @@ -3076,28 +5503,36 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { CmpInst *Cmp = dyn_cast(SI->getCondition()); - // If the branch is predicted right, an out of order CPU can avoid blocking on - // the compare. Emit cmovs on compares with a memory operand as branches to - // avoid stalls on the load from memory. If the compare has more than one use - // there's probably another cmov or setcc around so it's not worth emitting a - // branch. - if (!Cmp) + // If a branch is predictable, an out-of-order CPU can avoid blocking on its + // comparison condition. If the compare has more than one use, there's + // probably another cmov or setcc around, so it's not worth emitting a branch. + if (!Cmp || !Cmp->hasOneUse()) return false; Value *CmpOp0 = Cmp->getOperand(0); Value *CmpOp1 = Cmp->getOperand(1); - // We check that the memory operand has one use to avoid uses of the loaded - // value directly after the compare, making branches unprofitable. - return Cmp->hasOneUse() && - ((isa(CmpOp0) && CmpOp0->hasOneUse()) || - (isa(CmpOp1) && CmpOp1->hasOneUse())); + // Emit "cmov on compare with a memory operand" as a branch to avoid stalls + // on a load from memory. But if the load is used more than once, do not + // change the select to a branch because the load is probably needed + // regardless of whether the branch is taken or not. + if ((isa(CmpOp0) && CmpOp0->hasOneUse()) || + (isa(CmpOp1) && CmpOp1->hasOneUse())) + return true; + + // If either operand of the select is expensive and only needed on one side + // of the select, we should form a branch. + if (sinkSelectOperand(TTI, SI->getTrueValue()) || + sinkSelectOperand(TTI, SI->getFalseValue())) + return true; + + return false; } /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. -bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { +bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); // Can we convert the 'select' to CF ? @@ -3117,34 +5552,97 @@ bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { // We have efficient codegen support for the select instruction. // Check if it is profitable to keep this 'select'. 
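Illustrative sketch (not part of the patch): optimizeLoadExt above ORs together the bits demanded by every and/shl/trunc user of the load, then inserts a single and only when the result is a low-bit mask that matches the widest existing and (the i1 and load-ext legality checks are elided here). A simplified 32-bit standalone model of the demanded-bits bookkeeping; shift amounts are assumed to be below 32, mirroring getLimitedValue:

    #include <cstdint>
    #include <iostream>

    enum class UseKind { And, Shl, Trunc };
    struct Use { UseKind Kind; uint32_t Arg; };  // mask, shift amount, or trunc width

    static bool computeDemanded(const Use *Uses, unsigned N, uint32_t &Demand,
                                uint32_t &WidestAnd) {
      Demand = 0; WidestAnd = 0;
      for (unsigned I = 0; I != N; ++I) {
        switch (Uses[I].Kind) {
        case UseKind::And:
          Demand |= Uses[I].Arg;
          if (Uses[I].Arg > WidestAnd) WidestAnd = Uses[I].Arg;
          break;
        case UseKind::Shl:    // bits shifted out the top are not demanded
          Demand |= UINT32_MAX >> Uses[I].Arg;
          break;
        case UseKind::Trunc:  // only the low Arg bits survive
          Demand |= (Uses[I].Arg >= 32) ? UINT32_MAX : ((1u << Uses[I].Arg) - 1);
          break;
        }
      }
      // Profitable only for a low-bit mask that equals the widest existing and.
      bool IsLowMask = Demand != 0 && (Demand & (Demand + 1)) == 0;
      return IsLowMask && Demand == WidestAnd;
    }

    int main() {
      Use Uses[] = {{UseKind::And, 0xFF}, {UseKind::Trunc, 8}};
      uint32_t Demand, Widest;
      bool OK = computeDemanded(Uses, 2, Demand, Widest);
      std::cout << OK << " 0x" << std::hex << Demand << '\n';  // 1 0xff
      return 0;
    }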
if (!TLI->isPredictableSelectExpensive() || - !isFormingBranchFromSelectProfitable(SI)) + !isFormingBranchFromSelectProfitable(TTI, SI)) return false; } ModifiedDT = true; + // Transform a sequence like this: + // start: + // %cmp = cmp uge i32 %a, %b + // %sel = select i1 %cmp, i32 %c, i32 %d + // + // Into: + // start: + // %cmp = cmp uge i32 %a, %b + // br i1 %cmp, label %select.true, label %select.false + // select.true: + // br label %select.end + // select.false: + // br label %select.end + // select.end: + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] + // + // In addition, we may sink instructions that produce %c or %d from + // the entry block into the destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + // First, we split the block containing the select into 2 blocks. BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); - BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - - // Create a new block serving as the landing pad for the branch. - BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", - NextBlock->getParent(), NextBlock); + BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - // Move the unconditional branch from the block with the select in it into our - // landing pad block. + // Delete the unconditional branch that was just created by the split. StartBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(NextBlock, SmallBlock); + + // These are the new basic blocks for the conditional branch. + // At least one will become an actual new basic block. + BasicBlock *TrueBlock = nullptr; + BasicBlock *FalseBlock = nullptr; + + // Sink expensive instructions into the conditional blocks to avoid executing + // them speculatively. + if (sinkSelectOperand(TTI, SI->getTrueValue())) { + TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + auto *TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + auto *TrueInst = cast(SI->getTrueValue()); + TrueInst->moveBefore(TrueBranch); + } + if (sinkSelectOperand(TTI, SI->getFalseValue())) { + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + auto *FalseInst = cast(SI->getFalseValue()); + FalseInst->moveBefore(FalseBranch); + } + + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + BranchInst::Create(EndBlock, FalseBlock); + } // Insert the real conditional branch based on the original condition. - BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. 
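Illustrative sketch (not part of the patch): the point of sinking an expensive, single-use select operand into its own block, as described above, is that only the taken side ever executes it. A rough source-level C++ analogue of the before/after shapes; heavy() is a hypothetical expensive computation:

    #include <iostream>

    static int heavy(int X) { return X * X + 41; }   // hypothetical expensive op

    // Select-like form: both operands are computed, then one value is picked.
    static int selectForm(bool C, int A, int B) {
      int T = heavy(A);
      int F = heavy(B);
      return C ? T : F;
    }

    // Branch form after the transform: each side is computed only when taken.
    static int branchForm(bool C, int A, int B) {
      if (C)
        return heavy(A);
      return heavy(B);
    }

    int main() {
      std::cout << selectForm(true, 1, 2) << ' '
                << branchForm(true, 1, 2) << '\n';    // 42 42
      return 0;
    }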
+ if (TrueBlock == nullptr) { + BranchInst::Create(EndBlock, FalseBlock, SI->getCondition(), SI); + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + BranchInst::Create(TrueBlock, EndBlock, SI->getCondition(), SI); + FalseBlock = StartBlock; + } else { + BranchInst::Create(TrueBlock, FalseBlock, SI->getCondition(), SI); + } // The select itself is replaced with a PHI Node. - PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); + PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); PN->takeName(SI); - PN->addIncoming(SI->getTrueValue(), StartBlock); - PN->addIncoming(SI->getFalseValue(), SmallBlock); + PN->addIncoming(SI->getTrueValue(), TrueBlock); + PN->addIncoming(SI->getFalseValue(), FalseBlock); + SI->replaceAllUsesWith(PN); SI->eraseFromParent(); @@ -3170,7 +5668,7 @@ static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases /// it's often worth sinking a shufflevector splat down to its use so that /// codegen can spot all lanes are identical. -bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { +bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { BasicBlock *DefBB = SVI->getParent(); // Only do this xform if variable vector shifts are particularly expensive. @@ -3202,9 +5700,10 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { if (!InsertedShuffle) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0), - SVI->getOperand(1), - SVI->getOperand(2), "", InsertPt); + assert(InsertPt != UserBB->end()); + InsertedShuffle = + new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), + SVI->getOperand(2), "", &*InsertPt); } UI->replaceUsesOfWith(SVI, InsertedShuffle); @@ -3220,6 +5719,49 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { return MadeChange; } +bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { + if (!TLI || !DL) + return false; + + Value *Cond = SI->getCondition(); + Type *OldType = Cond->getType(); + LLVMContext &Context = Cond->getContext(); + MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType)); + unsigned RegWidth = RegType.getSizeInBits(); + + if (RegWidth <= cast(OldType)->getBitWidth()) + return false; + + // If the register width is greater than the type width, expand the condition + // of the switch instruction and each case constant to the width of the + // register. By widening the type of the switch condition, subsequent + // comparisons (for case comparisons) will not need to be extended to the + // preferred register width, so we will potentially eliminate N-1 extends, + // where N is the number of cases in the switch. + auto *NewType = Type::getIntNTy(Context, RegWidth); + + // Zero-extend the switch condition and case constants unless the switch + // condition is a function argument that is already being sign-extended. + // In that case, we can avoid an unnecessary mask/extension by sign-extending + // everything instead. + Instruction::CastOps ExtType = Instruction::ZExt; + if (auto *Arg = dyn_cast(Cond)) + if (Arg->hasSExtAttr()) + ExtType = Instruction::SExt; + + auto *ExtInst = CastInst::Create(ExtType, Cond, NewType); + ExtInst->insertBefore(SI); + SI->setCondition(ExtInst); + for (SwitchInst::CaseIt Case : SI->cases()) { + APInt NarrowConst = Case.getCaseValue()->getValue(); + APInt WideConst = (ExtType == Instruction::ZExt) ? 
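Illustrative sketch (not part of the patch): the switch-widening transform above extends the condition once to the preferred register width so the backend does not have to extend it for every case comparison. A small source-level C++ analogue, assuming a hypothetical target with 32-bit registers:

    #include <cstdint>
    #include <iostream>

    // Before: an i16 condition; each case comparison may need its own extension.
    static int beforeWiden(uint16_t V) {
      switch (V) { case 1: return 10; case 300: return 20; default: return 0; }
    }

    // After: one zero-extension of the condition to the 32-bit register width,
    // with the case constants widened to match, so no per-case extensions remain.
    static int afterWiden(uint16_t V) {
      uint32_t W = V;   // single zext of the switch condition
      switch (W) { case 1u: return 10; case 300u: return 20; default: return 0; }
    }

    int main() {
      std::cout << beforeWiden(300) << ' ' << afterWiden(300) << '\n';  // 20 20
      return 0;
    }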
+        NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+    Case.setValue(ConstantInt::get(Context, WideConst));
+  }
+
+  return true;
+}
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.
@@ -3237,6 +5779,9 @@ namespace {
 /// Assuming both extractelement and store can be combine, we get rid of the
 /// transition.
 class VectorPromoteHelper {
+  /// DataLayout associated with the current module.
+  const DataLayout &DL;
+
   /// Used to perform some checks on the legality of vector operations.
   const TargetLowering &TLI;
 
@@ -3310,7 +5855,8 @@ class VectorPromoteHelper {
     unsigned Align = ST->getAlignment();
     // Check if this store is supported.
     if (!TLI.allowsMisalignedMemoryAccesses(
-            TLI.getValueType(ST->getValueOperand()->getType()), AS, Align)) {
+            TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
+            Align)) {
       // If this is not supported, there is no way we can combine
       // the extract with the store.
       return false;
@@ -3349,7 +5895,7 @@ class VectorPromoteHelper {
   /// \brief Generate a constant vector with \p Val with the same
   /// number of elements as the transition.
   /// \p UseSplat defines whether or not \p Val should be replicated
-  /// accross the whole vector.
+  /// across the whole vector.
   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
   /// otherwise we generate a vector with as many undef as possible:
   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
   /// used if UseSplat == true.
@@ -3405,9 +5951,10 @@ class VectorPromoteHelper {
   }
 
 public:
-  VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI,
-                      Instruction *Transition, unsigned CombineCost)
-      : TLI(TLI), TTI(TTI), Transition(Transition),
+  VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
+                      const TargetTransformInfo &TTI, Instruction *Transition,
+                      unsigned CombineCost)
+      : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
         StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
     assert(Transition && "Do not know how to promote null");
   }
@@ -3443,7 +5990,7 @@ public:
       return false;
     return StressStoreExtract ||
            TLI.isOperationLegalOrCustom(
-               ISDOpcode, TLI.getValueType(getTransitionType(), true));
+               ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
   }
 
   /// \brief Check whether or not \p Use can be combined
@@ -3518,7 +6065,8 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
           isa<UndefValue>(Val) ||
              canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
    } else
-      assert(0 && "Did you modified shouldPromote and forgot to update this?");
+      llvm_unreachable("Did you modified shouldPromote and forgot to update "
+                       "this?");
    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
  }
  Transition->removeFromParent();
@@ -3529,7 +6077,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
 /// Some targets can do store(extractelement) with one instruction.
 /// Try to push the extractelement towards the stores when the target
 /// has this feature and this is profitable.
-bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
  unsigned CombineCost = UINT_MAX;
  if (DisableStoreExtract || !TLI ||
      (!StressStoreExtract &&
@@ -3546,7 +6094,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
  //      we do not do that for now.
   BasicBlock *Parent = Inst->getParent();
   DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
-  VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost);
+  VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
   // If the transition has more than one use, assume this is not going to be
   // beneficial.
   while (Inst->hasOneUse()) {
@@ -3581,13 +6129,17 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
+  // Bail out if we inserted the instruction to prevent optimizations from
+  // stepping on each other's toes.
+  if (InsertedInsts.count(I))
+    return false;
+
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up. If we detect such a
     // trivial PHI, go ahead and zap it here.
-    if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : nullptr,
-                                       TLInfo, DT)) {
+    if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) {
       P->replaceAllUsesWith(V);
       P->eraseFromParent();
       ++NumPHIsElim;
@@ -3606,19 +6158,20 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     if (isa<Constant>(CI->getOperand(0)))
       return false;
 
-    if (TLI && OptimizeNoopCopyExpression(CI, *TLI))
+    if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
       return true;
 
     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
       /// Sink a zext or sext into its user blocks if the target type doesn't
       /// fit in one register
-      if (TLI && TLI->getTypeAction(CI->getContext(),
-                                    TLI->getValueType(CI->getType())) ==
-                     TargetLowering::TypeExpandInteger) {
+      if (TLI &&
+          TLI->getTypeAction(CI->getContext(),
+                             TLI->getValueType(*DL, CI->getType())) ==
+              TargetLowering::TypeExpandInteger) {
         return SinkCast(CI);
       } else {
-        bool MadeChange = MoveExtToFormExtLoad(I);
-        return MadeChange | OptimizeExtUses(I);
+        bool MadeChange = moveExtToFormExtLoad(I);
+        return MadeChange | optimizeExtUses(I);
       }
     }
     return false;
@@ -3629,15 +6182,23 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     return OptimizeCmpExpression(CI);
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    if (TLI)
-      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
+    stripInvariantGroupMetadata(*LI);
+    if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
+      unsigned AS = LI->getPointerAddressSpace();
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
+    }
     return false;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-    if (TLI)
-      return OptimizeMemoryInst(I, SI->getOperand(1),
-                                SI->getOperand(0)->getType());
+    stripInvariantGroupMetadata(*SI);
+    if (TLI) {
+      unsigned AS = SI->getPointerAddressSpace();
+      return optimizeMemoryInst(I, SI->getOperand(1),
                                SI->getOperand(0)->getType(), AS);
+    }
     return false;
   }
 
@@ -3647,7 +6208,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
                 BinOp->getOpcode() == Instruction::LShr)) {
     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
     if (TLI && CI && TLI->hasExtractBitsInsn())
-      return OptimizeExtractBits(BinOp, CI, *TLI);
+      return OptimizeExtractBits(BinOp, CI, *TLI, *DL);
 
     return false;
   }
 
@@ -3660,52 +6221,86 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC);
+      optimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
+    return optimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
-    return OptimizeSelectInst(SI);
+    return optimizeSelectInst(SI);
 
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
-    return OptimizeShuffleVectorInst(SVI);
+    return optimizeShuffleVectorInst(SVI);
+
+  if (auto *Switch = dyn_cast<SwitchInst>(I))
+    return optimizeSwitchInst(Switch);
 
   if (isa<ExtractElementInst>(I))
-    return OptimizeExtractElementInst(I);
+    return optimizeExtractElementInst(I);
 
   return false;
 }
 
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+static bool makeBitReverse(Instruction &I, const DataLayout &DL,
+                           const TargetLowering &TLI) {
+  if (!I.getType()->isIntegerTy() ||
+      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
+                                    TLI.getValueType(DL, I.getType(), true)))
+    return false;
+
+  SmallVector<Instruction *, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
+    return false;
+  Instruction *LastInst = Insts.back();
+  I.replaceAllUsesWith(LastInst);
+  RecursivelyDeleteTriviallyDeadInstructions(&I);
+  return true;
+}
+
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
-  MadeChange |= DupRetToEnableTailCallOpts(&BB);
+  while (CurInstIterator != BB.end()) {
+    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
+    if (ModifiedDT)
+      return true;
+  }
+  bool MadeBitReverse = true;
+  while (TLI && MadeBitReverse) {
+    MadeBitReverse = false;
+    for (auto &I : reverse(BB)) {
+      if (makeBitReverse(I, *DL, *TLI)) {
+        MadeBitReverse = MadeChange = true;
+        break;
      }
    }
  }
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+
   return MadeChange;
 }
 
 // llvm.dbg.value is far away from the value then iSel may not be able
 // handle it properly. iSel will drop llvm.dbg.value if it can not
 // find a node corresponding to the value.
-bool CodeGenPrepare::PlaceDbgValues(Function &F) {
+bool CodeGenPrepare::placeDbgValues(Function &F) {
   bool MadeChange = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+  for (BasicBlock &BB : F) {
     Instruction *PrevNonDbgInst = nullptr;
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      Instruction *Insn = BI; ++BI;
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+      Instruction *Insn = &*BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone. These
      // instrinsics describe the address of a variable (= the alloca)
@@ -3719,10 +6314,14 @@ bool CodeGenPrepare::PlaceDbgValues(Function &F) {
      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
 
      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+        // If VI is a phi in a block with an EHPad terminator, we can't insert
+        // after it.
+        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
+          continue;
         DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
         DVI->removeFromParent();
         if (isa<PHINode>(VI))
-          DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
+          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
         else
           DVI->insertAfter(VI);
         MadeChange = true;
@@ -3746,7 +6345,7 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
     return false;
   bool MadeChange = false;
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
-    BasicBlock *BB = I++;
+    BasicBlock *BB = &*I++;
 
     // Does this BB end with the following?
     //   %andVal = and %val, #single-bit-set
@@ -3810,8 +6409,10 @@ static bool extractBranchMetadata(BranchInst *BI,
   if (!ProfileData || ProfileData->getNumOperands() != 3)
     return false;
 
-  const auto *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1));
-  const auto *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2));
+  const auto *CITrue =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
+  const auto *CIFalse =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
   if (!CITrue || !CIFalse)
     return false;
 
@@ -3852,8 +6453,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
 /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
 ///
 bool CodeGenPrepare::splitBranchCondition(Function &F) {
-  if (!TM || TM->Options.EnableFastISel != true ||
-      !TLI || TLI->isJumpExpensive())
+  if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
     return false;
 
   bool MadeChange = false;
@@ -3868,13 +6468,17 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
       continue;
 
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
+      continue;
+
     unsigned Opc;
-    Instruction *Cond1, *Cond2;
-    if (match(LogicOp, m_And(m_OneUse(m_Instruction(Cond1)),
-                             m_OneUse(m_Instruction(Cond2)))))
+    Value *Cond1, *Cond2;
+    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
+                             m_OneUse(m_Value(Cond2)))))
       Opc = Instruction::And;
-    else if (match(LogicOp, m_Or(m_OneUse(m_Instruction(Cond1)),
-                                 m_OneUse(m_Instruction(Cond2)))))
+    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
+                                 m_OneUse(m_Value(Cond2)))))
       Opc = Instruction::Or;
     else
      continue;
@@ -3894,10 +6498,9 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
 
    // Update original basic block by using the first condition directly by the
    // branch instruction and removing the no longer needed and/or instruction.
-    auto *Br1 = cast<BranchInst>(BB.getTerminator());
    Br1->setCondition(Cond1);
    LogicOp->eraseFromParent();
-    Cond2->removeFromParent();
+
    // Depending on the conditon we have to either replace the true or the false
    // successor of the original branch instruction.
    if (Opc == Instruction::And)
@@ -3907,7 +6510,10 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
 
    // Fill in the new basic block.
    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
-    Cond2->insertBefore(Br2);
+    if (auto *I = dyn_cast<Instruction>(Cond2)) {
+      I->removeFromParent();
+      I->insertBefore(Br2);
+    }
 
    // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one succesor's PHI nodes, because the branch comes now from
@@ -4011,10 +6617,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
      }
    }
 
-    // Request DOM Tree update.
    // Note: No point in getting fancy here, since the DT info is never
-    // available to CodeGenPrepare and the existing update code is broken
-    // anyways.
+    // available to CodeGenPrepare.
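For orientation, the overall shape of the rewrite that splitBranchCondition performs on an or of two compares is roughly the following (an illustrative sketch with hypothetical values and block names):

    ; Before:
    bb1:
      %0 = icmp eq i32 %x, 0
      %1 = icmp eq i32 %y, 0
      %or.cond = or i1 %0, %1
      br i1 %or.cond, label %TrueBB, label %FalseBB

    ; After:
    bb1:
      %0 = icmp eq i32 %x, 0
      br i1 %0, label %TrueBB, label %bb2
    bb2:
      %1 = icmp eq i32 %y, 0
      br i1 %1, label %TrueBB, label %FalseBB

The PHI-node and branch-weight updates handled above are what keep this two-branch form equivalent to the original single branch.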
     ModifiedDT = true;
     MadeChange = true;
 
@@ -4024,3 +6628,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
   }
   return MadeChange;
 }
+
+void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) {
+  if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group))
+    I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID());
+}
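To illustrate the optimizeSwitchInst change added earlier in this patch (an illustrative sketch with made-up values, assuming a target whose preferred register type is i32 and a condition with no signext attribute), an i8 switch is rewritten so the condition and every case constant are widened once up front:

    ; Before:
    define i32 @classify(i8 %c) {
    entry:
      switch i8 %c, label %default [
        i8 9,  label %tab
        i8 32, label %space
      ]
    tab:
      ret i32 1
    space:
      ret i32 2
    default:
      ret i32 0
    }

    ; After (sketch):
    entry:
      %0 = zext i8 %c to i32
      switch i32 %0, label %default [
        i32 9,  label %tab
        i32 32, label %space
      ]

The case comparisons then already operate at the register width, which is where the "eliminate N-1 extends" saving described in that function's comment comes from.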