diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 2eeaabed2eb..c9d22a29865 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -15,11 +15,16 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -29,9 +34,12 @@
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/NoFolder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/IR/ValueHandle.h"
@@ -63,6 +71,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
                           "computations were sunk");
 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsAdded,
+          "Number of and mask instructions added to form ext loads");
+STATISTIC(NumAndUses,    "Number of uses of and mask instructions optimized");
 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
@@ -143,10 +154,15 @@ class TypePromotionTransaction;
     /// DataLayout for the Function being processed.
     const DataLayout *DL;

+    // XXX-comment: We need DominatorTree to figure out which instruction to
+    // taint.
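+    // (Specifically, whether the first store/conditional branch that follows
+    // a relaxed load is dominated by that load; see
+    // AddFakeConditionalBranchAfterMonotonicLoads() below.)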
+ DominatorTree *DT; + public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) { + : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr), + DT(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -157,6 +173,7 @@ class TypePromotionTransaction; AU.addPreserved(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } private: @@ -172,8 +189,10 @@ class TypePromotionTransaction; bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); bool moveExtToFormExtLoad(Instruction *&I); bool optimizeExtUses(Instruction *I); + bool optimizeLoadExt(LoadInst *I); bool optimizeSelectInst(SelectInst *SI); bool optimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool optimizeSwitchInst(SwitchInst *CI); bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB); bool placeDbgValues(Function &F); @@ -189,20 +208,1074 @@ class TypePromotionTransaction; } char CodeGenPrepare::ID = 0; -INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare", +INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { return new CodeGenPrepare(TM); } +namespace { + +bool StoreAddressDependOnValue(StoreInst* SI, Value* DepVal); +Value* GetUntaintedAddress(Value* CurrentAddress); + +// The depth we trace down a variable to look for its dependence set. +const unsigned kDependenceDepth = 4; + +// Recursively looks for variables that 'Val' depends on at the given depth +// 'Depth', and adds them in 'DepSet'. If 'InsertOnlyLeafNodes' is true, only +// inserts the leaf node values; otherwise, all visited nodes are included in +// 'DepSet'. Note that constants will be ignored. +template +void recursivelyFindDependence(SetType* DepSet, Value* Val, + bool InsertOnlyLeafNodes = false, + unsigned Depth = kDependenceDepth) { + if (Val == nullptr) { + return; + } + if (!InsertOnlyLeafNodes && !isa(Val)) { + DepSet->insert(Val); + } + if (Depth == 0) { + // Cannot go deeper. Insert the leaf nodes. + if (InsertOnlyLeafNodes && !isa(Val)) { + DepSet->insert(Val); + } + return; + } + + // Go one step further to explore the dependence of the operands. + Instruction* I = nullptr; + if ((I = dyn_cast(Val))) { + if (isa(I)) { + // A load is considerd the leaf load of the dependence tree. Done. 
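+      // (A new dependence chain starts at the loaded value itself; we do not
+      // trace through memory to whatever instruction produced that value.)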
+ DepSet->insert(Val); + return; + } else if (I->isBinaryOp()) { + BinaryOperator* I = dyn_cast(Val); + Value *Op0 = I->getOperand(0), *Op1 = I->getOperand(1); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, Depth - 1); + } else if (I->isCast()) { + Value* Op0 = I->getOperand(0); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1); + } else if (I->getOpcode() == Instruction::Select) { + Value* Op0 = I->getOperand(0); + Value* Op1 = I->getOperand(1); + Value* Op2 = I->getOperand(2); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, Op2, InsertOnlyLeafNodes, Depth - 1); + } else if (I->getOpcode() == Instruction::GetElementPtr) { + for (unsigned i = 0; i < I->getNumOperands(); i++) { + recursivelyFindDependence(DepSet, I->getOperand(i), InsertOnlyLeafNodes, + Depth - 1); + } + } else if (I->getOpcode() == Instruction::Store) { + auto* SI = dyn_cast(Val); + recursivelyFindDependence(DepSet, SI->getPointerOperand(), + InsertOnlyLeafNodes, Depth - 1); + recursivelyFindDependence(DepSet, SI->getValueOperand(), + InsertOnlyLeafNodes, Depth - 1); + } else { + Value* Op0 = nullptr; + Value* Op1 = nullptr; + switch (I->getOpcode()) { + case Instruction::ICmp: + case Instruction::FCmp: { + Op0 = I->getOperand(0); + Op1 = I->getOperand(1); + recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, + Depth - 1); + recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, + Depth - 1); + break; + } + case Instruction::PHI: { + for (int i = 0; i < I->getNumOperands(); i++) { + auto* op = I->getOperand(i); + if (DepSet->count(op) == 0) { + recursivelyFindDependence(DepSet, I->getOperand(i), + InsertOnlyLeafNodes, Depth - 1); + } + } + break; + } + default: { + // Be conservative. Add it and be done with it. + DepSet->insert(Val); + return; + } + } + } + } else if (isa(Val)) { + // Not interested in constant values. Done. + return; + } else { + // Be conservative. Add it and be done with it. + DepSet->insert(Val); + return; + } +} + +// Helper function to create a Cast instruction. +Value* createCast(IRBuilder& Builder, Value* DepVal, + Type* TargetIntegerType) { + Instruction::CastOps CastOp = Instruction::BitCast; + switch (DepVal->getType()->getTypeID()) { + case Type::IntegerTyID: { + assert(TargetIntegerType->getTypeID() == Type::IntegerTyID); + auto* FromType = dyn_cast(DepVal->getType()); + auto* ToType = dyn_cast(TargetIntegerType); + assert(FromType && ToType); + if (FromType->getBitWidth() <= ToType->getBitWidth()) { + CastOp = Instruction::ZExt; + } else { + CastOp = Instruction::Trunc; + } + break; + } + case Type::FloatTyID: + case Type::DoubleTyID: { + CastOp = Instruction::FPToSI; + break; + } + case Type::PointerTyID: { + CastOp = Instruction::PtrToInt; + break; + } + default: { break; } + } + + return Builder.CreateCast(CastOp, DepVal, TargetIntegerType); +} + +// Given a value, if it's a tainted address, this function returns the +// instruction that ORs the "dependence value" with the "original address". +// Otherwise, returns nullptr. This instruction is the first OR instruction +// where one of its operand is an AND instruction with an operand being 0. +// +// E.g., it returns '%4 = or i32 %3, %2' given 'CurrentAddress' is '%5'. 
+// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +Instruction* getOrAddress(Value* CurrentAddress) { + // Is it a cast from integer to pointer type. + Instruction* OrAddress = nullptr; + Instruction* AndDep = nullptr; + Instruction* CastToInt = nullptr; + Value* ActualAddress = nullptr; + Constant* ZeroConst = nullptr; + + const Instruction* CastToPtr = dyn_cast(CurrentAddress); + if (CastToPtr && CastToPtr->getOpcode() == Instruction::IntToPtr) { + // Is it an OR instruction: %1 = or %and, %actualAddress. + if ((OrAddress = dyn_cast(CastToPtr->getOperand(0))) && + OrAddress->getOpcode() == Instruction::Or) { + // The first operand should be and AND instruction. + AndDep = dyn_cast(OrAddress->getOperand(0)); + if (AndDep && AndDep->getOpcode() == Instruction::And) { + // Also make sure its first operand of the "AND" is 0, or the "AND" is + // marked explicitly by "NoInstCombine". + if ((ZeroConst = dyn_cast(AndDep->getOperand(1))) && + ZeroConst->isNullValue()) { + return OrAddress; + } + } + } + } + // Looks like it's not been tainted. + return nullptr; +} + +// Given a value, if it's a tainted address, this function returns the +// instruction that taints the "dependence value". Otherwise, returns nullptr. +// This instruction is the last AND instruction where one of its operand is 0. +// E.g., it returns '%3' given 'CurrentAddress' is '%5'. +// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +Instruction* getAndDependence(Value* CurrentAddress) { + // If 'CurrentAddress' is tainted, get the OR instruction. + auto* OrAddress = getOrAddress(CurrentAddress); + if (OrAddress == nullptr) { + return nullptr; + } + + // No need to check the operands. + auto* AndDepInst = dyn_cast(OrAddress->getOperand(0)); + assert(AndDepInst); + return AndDepInst; +} + +// Given a value, if it's a tainted address, this function returns +// the "dependence value", which is the first operand in the AND instruction. +// E.g., it returns '%1' given 'CurrentAddress' is '%5'. +// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +Value* getDependence(Value* CurrentAddress) { + auto* AndInst = getAndDependence(CurrentAddress); + if (AndInst == nullptr) { + return nullptr; + } + return AndInst->getOperand(0); +} + +// Given an address that has been tainted, returns the only condition it depends +// on, if any; otherwise, returns nullptr. +Value* getConditionDependence(Value* Address) { + auto* Dep = getDependence(Address); + if (Dep == nullptr) { + // 'Address' has not been dependence-tainted. + return nullptr; + } + + Value* Operand = Dep; + while (true) { + auto* Inst = dyn_cast(Operand); + if (Inst == nullptr) { + // Non-instruction type does not have condition dependence. 
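+      // (E.g., a function argument or a global, which we cannot walk back
+      // any further.)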
+ return nullptr; + } + if (Inst->getOpcode() == Instruction::ICmp) { + return Inst; + } else { + if (Inst->getNumOperands() != 1) { + return nullptr; + } else { + Operand = Inst->getOperand(0); + } + } + } +} + +// Conservatively decides whether the dependence set of 'Val1' includes the +// dependence set of 'Val2'. If 'ExpandSecondValue' is false, we do not expand +// 'Val2' and use that single value as its dependence set. +// If it returns true, it means the dependence set of 'Val1' includes that of +// 'Val2'; otherwise, it only means we cannot conclusively decide it. +bool dependenceSetInclusion(Value* Val1, Value* Val2, + int Val1ExpandLevel = 2 * kDependenceDepth, + int Val2ExpandLevel = kDependenceDepth) { + typedef SmallSet IncludingSet; + typedef SmallSet IncludedSet; + + IncludingSet DepSet1; + IncludedSet DepSet2; + // Look for more depths for the including set. + recursivelyFindDependence(&DepSet1, Val1, false /*Insert all visited nodes*/, + Val1ExpandLevel); + recursivelyFindDependence(&DepSet2, Val2, true /*Only insert leaf nodes*/, + Val2ExpandLevel); + + auto set_inclusion = [](IncludingSet FullSet, IncludedSet Subset) { + for (auto* Dep : Subset) { + if (0 == FullSet.count(Dep)) { + return false; + } + } + return true; + }; + bool inclusion = set_inclusion(DepSet1, DepSet2); + DEBUG(dbgs() << "[dependenceSetInclusion]: " << inclusion << "\n"); + DEBUG(dbgs() << "Including set for: " << *Val1 << "\n"); + DEBUG(for (const auto* Dep : DepSet1) { dbgs() << "\t\t" << *Dep << "\n"; }); + DEBUG(dbgs() << "Included set for: " << *Val2 << "\n"); + DEBUG(for (const auto* Dep : DepSet2) { dbgs() << "\t\t" << *Dep << "\n"; }); + + return inclusion; +} + +// Recursively iterates through the operands spawned from 'DepVal'. If there +// exists a single value that 'DepVal' only depends on, we call that value the +// root dependence of 'DepVal' and return it. Otherwise, return 'DepVal'. +Value* getRootDependence(Value* DepVal) { + SmallSet DepSet; + for (unsigned depth = kDependenceDepth; depth > 0; --depth) { + recursivelyFindDependence(&DepSet, DepVal, true /*Only insert leaf nodes*/, + depth); + if (DepSet.size() == 1) { + return *DepSet.begin(); + } + DepSet.clear(); + } + return DepVal; +} + +// This function actually taints 'DepVal' to the address to 'SI'. If the +// address +// of 'SI' already depends on whatever 'DepVal' depends on, this function +// doesn't do anything and returns false. Otherwise, returns true. +// +// This effect forces the store and any stores that comes later to depend on +// 'DepVal'. For example, we have a condition "cond", and a store instruction +// "s: STORE addr, val". If we want "s" (and any later store) to depend on +// "cond", we do the following: +// %conv = sext i1 %cond to i32 +// %addrVal = ptrtoint i32* %addr to i32 +// %andCond = and i32 conv, 0; +// %orAddr = or i32 %andCond, %addrVal; +// %NewAddr = inttoptr i32 %orAddr to i32*; +// +// This is a more concrete example: +// ------ +// %0 = load i32, i32* @y, align 4, !tbaa !1 +// %cmp = icmp ne i32 %0, 42 // <== this is like the condition +// %1 = sext i1 %cmp to i32 +// %2 = ptrtoint i32* @x to i32 +// %3 = and i32 %1, 0 +// %4 = or i32 %3, %2 +// %5 = inttoptr i32 %4 to i32* +// store i32 1, i32* %5, align 4 +bool taintStoreAddress(StoreInst* SI, Value* DepVal) { + // Set the insertion point right after the 'DepVal'. 
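+  // Note that the "and" inserted below always evaluates to 0, so "or"-ing it
+  // with the original address reproduces exactly the same address bits: the
+  // rewrite only adds an artificial data dependence from 'DepVal' to the
+  // store's address and never changes which location is stored to.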
+ Instruction* Inst = nullptr; + IRBuilder Builder(SI); + BasicBlock* BB = SI->getParent(); + Value* Address = SI->getPointerOperand(); + Type* TargetIntegerType = + IntegerType::get(Address->getContext(), + BB->getModule()->getDataLayout().getPointerSizeInBits()); + + // Does SI's address already depends on whatever 'DepVal' depends on? + if (StoreAddressDependOnValue(SI, DepVal)) { + return false; + } + + // Figure out if there's a root variable 'DepVal' depends on. For example, we + // can extract "getelementptr inbounds %struct, %struct* %0, i64 0, i32 123" + // to be "%struct* %0" since all other operands are constant. + auto* RootVal = getRootDependence(DepVal); + auto* RootInst = dyn_cast(RootVal); + auto* DepValInst = dyn_cast(DepVal); + if (RootInst && DepValInst && + RootInst->getParent() == DepValInst->getParent()) { + DepVal = RootVal; + } + + // Is this already a dependence-tainted store? + Value* OldDep = getDependence(Address); + if (OldDep) { + // The address of 'SI' has already been tainted. Just need to absorb the + // DepVal to the existing dependence in the address of SI. + Instruction* AndDep = getAndDependence(Address); + IRBuilder Builder(AndDep); + Value* NewDep = nullptr; + if (DepVal->getType() == AndDep->getType()) { + NewDep = Builder.CreateAnd(OldDep, DepVal); + } else { + NewDep = Builder.CreateAnd( + OldDep, createCast(Builder, DepVal, TargetIntegerType)); + } + + auto* NewDepInst = dyn_cast(NewDep); + + // Use the new AND instruction as the dependence + AndDep->setOperand(0, NewDep); + return true; + } + + // SI's address has not been tainted. Now taint it with 'DepVal'. + Value* CastDepToInt = createCast(Builder, DepVal, TargetIntegerType); + Value* PtrToIntCast = Builder.CreatePtrToInt(Address, TargetIntegerType); + Value* AndDepVal = + Builder.CreateAnd(CastDepToInt, ConstantInt::get(TargetIntegerType, 0)); + auto AndInst = dyn_cast(AndDepVal); + // XXX-comment: The original IR InstCombiner would change our and instruction + // to a select and then the back end optimize the condition out. We attach a + // flag to instructions and set it here to inform the InstCombiner to not to + // touch this and instruction at all. + Value* OrAddr = Builder.CreateOr(AndDepVal, PtrToIntCast); + Value* NewAddr = Builder.CreateIntToPtr(OrAddr, Address->getType()); + + DEBUG(dbgs() << "[taintStoreAddress]\n" + << "Original store: " << *SI << '\n'); + SI->setOperand(1, NewAddr); + + // Debug output. + DEBUG(dbgs() << "\tTargetIntegerType: " << *TargetIntegerType << '\n' + << "\tCast dependence value to integer: " << *CastDepToInt + << '\n' + << "\tCast address to integer: " << *PtrToIntCast << '\n' + << "\tAnd dependence value: " << *AndDepVal << '\n' + << "\tOr address: " << *OrAddr << '\n' + << "\tCast or instruction to address: " << *NewAddr << "\n\n"); + + return true; +} + +// Looks for the previous store in the if block --- 'BrBB', which makes the +// speculative store 'StoreToHoist' safe. +Value* getSpeculativeStoreInPrevBB(StoreInst* StoreToHoist, BasicBlock* BrBB) { + assert(StoreToHoist && "StoreToHoist must be a real store"); + + Value* StorePtr = StoreToHoist->getPointerOperand(); + + // Look for a store to the same pointer in BrBB. + for (BasicBlock::reverse_iterator RI = BrBB->rbegin(), RE = BrBB->rend(); + RI != RE; ++RI) { + Instruction* CurI = &*RI; + + StoreInst* SI = dyn_cast(CurI); + // Found the previous store make sure it stores to the same location. 
+ // XXX-update: If the previous store's original untainted address are the + // same as 'StorePtr', we are also good to hoist the store. + if (SI && (SI->getPointerOperand() == StorePtr || + GetUntaintedAddress(SI->getPointerOperand()) == StorePtr)) { + // Found the previous store, return its value operand. + return SI; + } + } + + assert(false && + "We should not reach here since this store is safe to speculate"); +} + +// XXX-comment: Returns true if it changes the code, false otherwise (the branch +// condition already depends on 'DepVal'. +bool taintConditionalBranch(BranchInst* BI, Value* DepVal) { + assert(BI->isConditional()); + auto* Cond = BI->getOperand(0); + if (dependenceSetInclusion(Cond, DepVal)) { + // The dependence/ordering is self-evident. + return false; + } + + IRBuilder Builder(BI); + auto* AndDep = + Builder.CreateAnd(DepVal, ConstantInt::get(DepVal->getType(), 0)); + auto* TruncAndDep = + Builder.CreateTrunc(AndDep, IntegerType::get(DepVal->getContext(), 1)); + auto* OrCond = Builder.CreateOr(TruncAndDep, Cond); + BI->setOperand(0, OrCond); + + // Debug output. + DEBUG(dbgs() << "\tTainted branch condition:\n" << *BI->getParent()); + + return true; +} + +bool ConditionalBranchDependsOnValue(BranchInst* BI, Value* DepVal) { + assert(BI->isConditional()); + auto* Cond = BI->getOperand(0); + return dependenceSetInclusion(Cond, DepVal); +} + +// XXX-update: For a relaxed load 'LI', find the first immediate atomic store or +// the first conditional branch. Returns nullptr if there's no such immediately +// following store/branch instructions, which we can only enforce the load with +// 'acquire'. 'ChainedBB' contains all the blocks chained together with +// unconditional branches from 'BB' to the block with the first store/cond +// branch. +template +Instruction* findFirstStoreCondBranchInst(LoadInst* LI, Vector* ChainedBB) { + // In some situations, relaxed loads can be left as is: + // 1. The relaxed load is used to calculate the address of the immediate + // following store; + // 2. The relaxed load is used as a condition in the immediate following + // condition, and there are no stores in between. This is actually quite + // common. E.g., + // int r1 = x.load(relaxed); + // if (r1 != 0) { + // y.store(1, relaxed); + // } + // However, in this function, we don't deal with them directly. Instead, we + // just find the immediate following store/condition branch and return it. + + assert(ChainedBB != nullptr && "Chained BB should not be nullptr"); + auto* BB = LI->getParent(); + ChainedBB->push_back(BB); + auto BE = BB->end(); + auto BBI = BasicBlock::iterator(LI); + BBI++; + while (true) { + for (; BBI != BE; BBI++) { + Instruction* Inst = &*BBI; + IntrinsicInst* II = dyn_cast(&*BBI); + if (II && II->getIntrinsicID() == Intrinsic::aarch64_stlxr) { + return II; + } else if (Inst->getOpcode() == Instruction::Store) { + return Inst; + } else if (Inst->getOpcode() == Instruction::Br) { + auto* BrInst = dyn_cast(Inst); + if (BrInst->isConditional()) { + return Inst; + } else { + // Reinitialize iterators with the destination of the unconditional + // branch. + BB = BrInst->getSuccessor(0); + ChainedBB->push_back(BB); + BBI = BB->begin(); + BE = BB->end(); + break; + } + } + } + if (BBI == BE) { + return nullptr; + } + } +} + +// XXX-update: Find the next node of the last relaxed load from 'FromInst' to +// 'ToInst'. If none, return 'ToInst'. 
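+// E.g., given the block "FromInst; %r = load atomic monotonic; %a = add ...;
+// ToInst", this returns '%a', the node right after the last relaxed load
+// between 'FromInst' and 'ToInst'.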
+Instruction* findLastLoadNext(Instruction* FromInst, Instruction* ToInst) {
+  if (FromInst == ToInst) {
+    return ToInst;
+  }
+  Instruction* LastLoad = ToInst;
+  auto* BB = FromInst->getParent();
+  auto BE = BB->end();
+  auto BBI = BasicBlock::iterator(FromInst);
+  BBI++;
+  for (; BBI != BE && &*BBI != ToInst; BBI++) {
+    auto* LI = dyn_cast<LoadInst>(&*BBI);
+    if (LI == nullptr || !LI->isAtomic() || LI->getOrdering() != Monotonic) {
+      continue;
+    }
+    LastLoad = LI->getNextNode();
+  }
+  return LastLoad;
+}
+
+// Inserts a fake conditional branch right after the instruction 'SplitInst',
+// and the branch condition is 'Condition'. 'SplitInst' will be placed in the
+// newly created block.
+void AddFakeConditionalBranch(Instruction* SplitInst, Value* Condition) {
+  auto* BB = SplitInst->getParent();
+  TerminatorInst* ThenTerm = nullptr;
+  TerminatorInst* ElseTerm = nullptr;
+  SplitBlockAndInsertIfThenElse(Condition, SplitInst, &ThenTerm, &ElseTerm);
+  assert(ThenTerm && ElseTerm &&
+         "Then/Else terminators cannot be empty after basic block splitting");
+  auto* ThenBB = ThenTerm->getParent();
+  auto* ElseBB = ElseTerm->getParent();
+  auto* TailBB = ThenBB->getSingleSuccessor();
+  assert(TailBB && "Tail block cannot be empty after basic block splitting");
+
+  ThenBB->disableCanEliminateBlock();
+  ElseBB->disableCanEliminateBlock();
+  TailBB->disableCanEliminateBlock();
+  ThenBB->setName(BB->getName() + "Then.Fake");
+  ElseBB->setName(BB->getName() + "Else.Fake");
+  DEBUG(dbgs() << "Add fake conditional branch:\n"
+               << "Then Block:\n"
+               << *ThenBB << "Else Block:\n"
+               << *ElseBB << "\n");
+}
+
+// Taints 'UsageInst' by masking it with an "and X, 0" and either merging the
+// result into a previously added fake conditional branch or inserting a new
+// fake conditional branch on it.
+void TaintRelaxedLoads(Instruction* UsageInst, Instruction* InsertPoint) {
+  // For better performance, we can add an "AND X 0" instruction before the
+  // condition.
+  auto* BB = UsageInst->getParent();
+  if (InsertPoint == nullptr) {
+    InsertPoint = UsageInst->getNextNode();
+  }
+  // Insert instructions after PHI nodes.
+  while (isa<PHINode>(InsertPoint)) {
+    InsertPoint = InsertPoint->getNextNode();
+  }
+  // First thing is to cast 'UsageInst' to an integer type if necessary.
+  Value* AndTarget = nullptr;
+  Type* TargetIntegerType =
+      IntegerType::get(UsageInst->getContext(),
+                       BB->getModule()->getDataLayout().getPointerSizeInBits());
+
+  // Check whether InsertPoint is an added fake conditional branch.
+  BranchInst* BI = nullptr;
+  if ((BI = dyn_cast<BranchInst>(InsertPoint)) && BI->isConditional()) {
+    auto* Cond = dyn_cast<Instruction>(BI->getOperand(0));
+    if (Cond && Cond->getOpcode() == Instruction::ICmp) {
+      auto* CmpInst = dyn_cast<ICmpInst>(Cond);
+      auto* Op0 = dyn_cast<Instruction>(Cond->getOperand(0));
+      auto* Op1 = dyn_cast<ConstantInt>(Cond->getOperand(1));
+      // %tmp = And X, 0
+      // %cmp = ICMP_NE %tmp, 0
+      // Br %cmp
+      // =>
+      // %tmp1 = And X, NewTaintedVal
+      // %tmp2 = And %tmp1, 0
+      // %cmp = ICMP_NE %tmp2, 0
+      // Br %cmp
+      if (CmpInst && CmpInst->getPredicate() == CmpInst::ICMP_NE && Op0 &&
+          Op0->getOpcode() == Instruction::And && Op1 && Op1->isZero()) {
+        auto* Op01 = dyn_cast<ConstantInt>(Op0->getOperand(1));
+        if (Op01 && Op01->isZero()) {
+          // Now we have a previously added fake cond branch.
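+          // Fold the new value into the existing and-zero chain that feeds
+          // the branch condition, so a single fake branch carries both
+          // dependencies and we avoid splitting the block a second time.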
+ auto* Op00 = Op0->getOperand(0); + IRBuilder Builder(CmpInst); + if (Op00->getType() == UsageInst->getType()) { + AndTarget = UsageInst; + } else { + AndTarget = createCast(Builder, UsageInst, Op00->getType()); + } + AndTarget = Builder.CreateAnd(Op00, AndTarget); + auto* AndZero = dyn_cast(Builder.CreateAnd( + AndTarget, Constant::getNullValue(AndTarget->getType()))); + CmpInst->setOperand(0, AndZero); + return; + } + } + } + } + + IRBuilder Builder(InsertPoint); + if (IntegerType::classof(UsageInst->getType())) { + AndTarget = UsageInst; + } else { + AndTarget = createCast(Builder, UsageInst, TargetIntegerType); + } + auto* AndZero = dyn_cast( + Builder.CreateAnd(AndTarget, Constant::getNullValue(AndTarget->getType()))); + auto* FakeCondition = dyn_cast(Builder.CreateICmp( + CmpInst::ICMP_NE, AndZero, Constant::getNullValue(AndTarget->getType()))); + AddFakeConditionalBranch(FakeCondition->getNextNode(), FakeCondition); +} + +// XXX-comment: Finds the appropriate Value derived from an atomic load. +// 'ChainedBB' contains all the blocks chained together with unconditional +// branches from LI's parent BB to the block with the first store/cond branch. +// If we don't find any, it means 'LI' is not used at all (which should not +// happen in practice). We can simply set 'LI' to be acquire just to be safe. +template +Instruction* findMostRecentDependenceUsage(LoadInst* LI, Instruction* LaterInst, + Vector* ChainedBB, + DominatorTree* DT) { + typedef SmallSet UsageSet; + typedef DenseMap> UsageMap; + assert(ChainedBB->size() >= 1 && "ChainedBB must have >=1 size"); + // Mapping from basic block in 'ChainedBB' to the set of dependence usage of + // 'LI' in each block. + UsageMap usage_map; + auto* LoadBB = LI->getParent(); + usage_map[LoadBB] = make_unique(); + usage_map[LoadBB]->insert(LI); + + for (auto* BB : *ChainedBB) { + if (usage_map[BB] == nullptr) { + usage_map[BB] = make_unique(); + } + auto& usage_set = usage_map[BB]; + if (usage_set->size() == 0) { + // The value has not been used. + return nullptr; + } + // Calculate the usage in the current BB first. + std::list bb_usage_list; + std::copy(usage_set->begin(), usage_set->end(), + std::back_inserter(bb_usage_list)); + for (auto list_iter = bb_usage_list.begin(); + list_iter != bb_usage_list.end(); list_iter++) { + auto* val = *list_iter; + for (auto* U : val->users()) { + Instruction* Inst = nullptr; + if (!(Inst = dyn_cast(U))) { + continue; + } + assert(Inst && "Usage value must be an instruction"); + auto iter = + std::find(ChainedBB->begin(), ChainedBB->end(), Inst->getParent()); + if (iter == ChainedBB->end()) { + // Only care about usage within ChainedBB. + continue; + } + auto* UsageBB = *iter; + if (UsageBB == BB) { + // Current BB. + if (!usage_set->count(Inst)) { + bb_usage_list.push_back(Inst); + usage_set->insert(Inst); + } + } else { + // A following BB. + if (usage_map[UsageBB] == nullptr) { + usage_map[UsageBB] = make_unique(); + } + usage_map[UsageBB]->insert(Inst); + } + } + } + } + + // Pick one usage that is in LaterInst's block and that dominates 'LaterInst'. + auto* LaterBB = LaterInst->getParent(); + auto& usage_set = usage_map[LaterBB]; + Instruction* usage_inst = nullptr; + for (auto* inst : *usage_set) { + if (DT->dominates(inst, LaterInst)) { + usage_inst = inst; + break; + } + } + + assert(usage_inst && "The usage instruction in the same block but after the " + "later instruction"); + return usage_inst; +} + +// XXX-comment: Returns whether the code has been changed. 
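+// The overall strategy for each relaxed (monotonic) load is roughly:
+//   1. if the first following store/conditional branch already depends on
+//      the load, leave the load as is;
+//   2. otherwise, if that instruction is a store, taint its address with the
+//      load (or with a later value derived from the load that dominates the
+//      store);
+//   3. otherwise, insert a fake conditional branch that depends on the load;
+//   4. if no suitable taint target is found, fall back to upgrading the load
+//      to acquire.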
+bool AddFakeConditionalBranchAfterMonotonicLoads(
+    SmallSet<LoadInst*, 8>& MonotonicLoadInsts, DominatorTree* DT) {
+  bool Changed = false;
+  while (!MonotonicLoadInsts.empty()) {
+    auto* LI = *MonotonicLoadInsts.begin();
+    MonotonicLoadInsts.erase(LI);
+    SmallVector<BasicBlock*, 4> ChainedBB;
+    auto* FirstInst = findFirstStoreCondBranchInst(LI, &ChainedBB);
+    if (FirstInst != nullptr) {
+      if (FirstInst->getOpcode() == Instruction::Store) {
+        if (StoreAddressDependOnValue(dyn_cast<StoreInst>(FirstInst), LI)) {
+          continue;
+        }
+      } else if (FirstInst->getOpcode() == Instruction::Br) {
+        if (ConditionalBranchDependsOnValue(dyn_cast<BranchInst>(FirstInst),
+                                            LI)) {
+          continue;
+        }
+      } else {
+        IntrinsicInst* II = dyn_cast<IntrinsicInst>(FirstInst);
+        if (!II || II->getIntrinsicID() != Intrinsic::aarch64_stlxr) {
+          dbgs() << "FirstInst=" << *FirstInst << "\n";
+          assert(false && "findFirstStoreCondBranchInst() should return a "
+                          "store/condition branch instruction");
+        }
+      }
+    }
+
+    // We really need to process the relaxed load now. Note that if the next
+    // instruction is a RMW, it will be transformed into a control block, so we
+    // can safely only taint upcoming store instructions.
+    StoreInst* SI = nullptr;
+    IntrinsicInst* II = nullptr;
+    if (FirstInst) {
+      SI = dyn_cast<StoreInst>(FirstInst);
+      II = dyn_cast<IntrinsicInst>(FirstInst);
+    }
+    if (FirstInst && SI) {
+      // For immediately coming stores, taint the address of the store.
+      if (FirstInst->getParent() == LI->getParent() ||
+          DT->dominates(LI, FirstInst)) {
+        Changed |= taintStoreAddress(SI, LI);
+      } else {
+        auto* Inst =
+            findMostRecentDependenceUsage(LI, FirstInst, &ChainedBB, DT);
+        if (!Inst) {
+          LI->setOrdering(Acquire);
+          Changed = true;
+        } else {
+          Changed |= taintStoreAddress(SI, Inst);
+        }
+      }
+    } else {
+      // No upcoming store: either no following store/branch was found at all,
+      // or the first such instruction is a (conditional) branch.
+      if (!FirstInst) {
+        TaintRelaxedLoads(LI, nullptr);
+        Changed = true;
+      } else {
+        // For an immediately coming branch, directly add a fake branch.
+        if (FirstInst->getParent() == LI->getParent() ||
+            DT->dominates(LI, FirstInst)) {
+          TaintRelaxedLoads(LI, FirstInst);
+          Changed = true;
+        } else {
+          auto* Inst =
+              findMostRecentDependenceUsage(LI, FirstInst, &ChainedBB, DT);
+          if (Inst) {
+            TaintRelaxedLoads(Inst, FirstInst);
+          } else {
+            LI->setOrdering(Acquire);
+          }
+          Changed = true;
+        }
+      }
+    }
+  }
+  return Changed;
+}
+
+/**** Implementations of public methods for dependence tainting ****/
+Value* GetUntaintedAddress(Value* CurrentAddress) {
+  auto* OrAddress = getOrAddress(CurrentAddress);
+  if (OrAddress == nullptr) {
+    // Is it tainted by a select instruction?
+    auto* Inst = dyn_cast<Instruction>(CurrentAddress);
+    if (nullptr != Inst && Inst->getOpcode() == Instruction::Select) {
+      // A select instruction.
+      if (Inst->getOperand(1) == Inst->getOperand(2)) {
+        return Inst->getOperand(1);
+      }
+    }
+
+    return CurrentAddress;
+  }
+  Value* ActualAddress = nullptr;
+
+  auto* CastToInt = dyn_cast<Instruction>(OrAddress->getOperand(1));
+  if (CastToInt && CastToInt->getOpcode() == Instruction::PtrToInt) {
+    return CastToInt->getOperand(0);
+  } else {
+    // This should be a PtrToInt constant expression.
+    ConstantExpr* PtrToIntExpr =
+        dyn_cast<ConstantExpr>(OrAddress->getOperand(1));
+    if (PtrToIntExpr && PtrToIntExpr->getOpcode() == Instruction::PtrToInt) {
+      return PtrToIntExpr->getOperand(0);
+    }
+  }
+
+  // Looks like it has not been dependence-tainted. Return it as is.
+ return CurrentAddress; +} + +MemoryLocation GetUntaintedMemoryLocation(StoreInst* SI) { + AAMDNodes AATags; + SI->getAAMetadata(AATags); + const auto& DL = SI->getModule()->getDataLayout(); + const auto* OriginalAddr = GetUntaintedAddress(SI->getPointerOperand()); + DEBUG(if (OriginalAddr != SI->getPointerOperand()) { + dbgs() << "[GetUntaintedMemoryLocation]\n" + << "Storing address: " << *SI->getPointerOperand() + << "\nUntainted address: " << *OriginalAddr << "\n"; + }); + return MemoryLocation(OriginalAddr, + DL.getTypeStoreSize(SI->getValueOperand()->getType()), + AATags); +} + +bool TaintDependenceToStore(StoreInst* SI, Value* DepVal) { + if (dependenceSetInclusion(SI, DepVal)) { + return false; + } + + bool tainted = taintStoreAddress(SI, DepVal); + assert(tainted); + return tainted; +} + +bool TaintDependenceToStoreAddress(StoreInst* SI, Value* DepVal) { + if (dependenceSetInclusion(SI->getPointerOperand(), DepVal)) { + return false; + } + + bool tainted = taintStoreAddress(SI, DepVal); + assert(tainted); + return tainted; +} + +bool CompressTaintedStore(BasicBlock* BB) { + // This function looks for windows of adajcent stores in 'BB' that satisfy the + // following condition (and then do optimization): + // *Addr(d1) = v1, d1 is a condition and is the only dependence the store's + // address depends on && Dep(v1) includes Dep(d1); + // *Addr(d2) = v2, d2 is a condition and is the only dependnece the store's + // address depends on && Dep(v2) includes Dep(d2) && + // Dep(d2) includes Dep(d1); + // ... + // *Addr(dN) = vN, dN is a condition and is the only dependence the store's + // address depends on && Dep(dN) includes Dep(d"N-1"). + // + // As a result, Dep(dN) includes [Dep(d1) V ... V Dep(d"N-1")], so we can + // safely transform the above to the following. In between these stores, we + // can omit untainted stores to the same address 'Addr' since they internally + // have dependence on the previous stores on the same address. + // => + // *Addr = v1 + // *Addr = v2 + // *Addr(d3) = v3 + for (auto BI = BB->begin(), BE = BB->end(); BI != BE; BI++) { + // Look for the first store in such a window of adajacent stores. + auto* FirstSI = dyn_cast(&*BI); + if (!FirstSI) { + continue; + } + + // The first store in the window must be tainted. + auto* UntaintedAddress = GetUntaintedAddress(FirstSI->getPointerOperand()); + if (UntaintedAddress == FirstSI->getPointerOperand()) { + continue; + } + + // The first store's address must directly depend on and only depend on a + // condition. + auto* FirstSIDepCond = getConditionDependence(FirstSI->getPointerOperand()); + if (nullptr == FirstSIDepCond) { + continue; + } + + // Dep(first store's storing value) includes Dep(tainted dependence). + if (!dependenceSetInclusion(FirstSI->getValueOperand(), FirstSIDepCond)) { + continue; + } + + // Look for subsequent stores to the same address that satisfy the condition + // of "compressing the dependence". + SmallVector AdajacentStores; + AdajacentStores.push_back(FirstSI); + auto BII = BasicBlock::iterator(FirstSI); + for (BII++; BII != BE; BII++) { + auto* CurrSI = dyn_cast(&*BII); + if (!CurrSI) { + if (BII->mayHaveSideEffects()) { + // Be conservative. Instructions with side effects are similar to + // stores. + break; + } + continue; + } + + auto* OrigAddress = GetUntaintedAddress(CurrSI->getPointerOperand()); + auto* CurrSIDepCond = getConditionDependence(CurrSI->getPointerOperand()); + // All other stores must satisfy either: + // A. 
'CurrSI' is an untainted store to the same address, or + // B. the combination of the following 5 subconditions: + // 1. Tainted; + // 2. Untainted address is the same as the group's address; + // 3. The address is tainted with a sole value which is a condition; + // 4. The storing value depends on the condition in 3. + // 5. The condition in 3 depends on the previous stores dependence + // condition. + + // Condition A. Should ignore this store directly. + if (OrigAddress == CurrSI->getPointerOperand() && + OrigAddress == UntaintedAddress) { + continue; + } + // Check condition B. + Value* Cond = nullptr; + if (OrigAddress == CurrSI->getPointerOperand() || + OrigAddress != UntaintedAddress || CurrSIDepCond == nullptr || + !dependenceSetInclusion(CurrSI->getValueOperand(), CurrSIDepCond)) { + // Check condition 1, 2, 3 & 4. + break; + } + + // Check condition 5. + StoreInst* PrevSI = AdajacentStores[AdajacentStores.size() - 1]; + auto* PrevSIDepCond = getConditionDependence(PrevSI->getPointerOperand()); + assert(PrevSIDepCond && + "Store in the group must already depend on a condtion"); + if (!dependenceSetInclusion(CurrSIDepCond, PrevSIDepCond)) { + break; + } + + AdajacentStores.push_back(CurrSI); + } + + if (AdajacentStores.size() == 1) { + // The outer loop should keep looking from the next store. + continue; + } + + // Now we have such a group of tainted stores to the same address. + DEBUG(dbgs() << "[CompressTaintedStore]\n"); + DEBUG(dbgs() << "Original BB\n"); + DEBUG(dbgs() << *BB << '\n'); + auto* LastSI = AdajacentStores[AdajacentStores.size() - 1]; + for (unsigned i = 0; i < AdajacentStores.size() - 1; ++i) { + auto* SI = AdajacentStores[i]; + + // Use the original address for stores before the last one. + SI->setOperand(1, UntaintedAddress); + + DEBUG(dbgs() << "Store address has been reversed: " << *SI << '\n';); + } + // XXX-comment: Try to make the last store use fewer registers. + // If LastSI's storing value is a select based on the condition with which + // its address is tainted, transform the tainted address to a select + // instruction, as follows: + // r1 = Select Cond ? A : B + // r2 = Cond & 0 + // r3 = Addr | r2 + // *r3 = r1 + // ==> + // r1 = Select Cond ? A : B + // r2 = Select Cond ? Addr : Addr + // *r2 = r1 + // The idea is that both Select instructions depend on the same condition, + // so hopefully the backend can generate two cmov instructions for them (and + // this saves the number of registers needed). + auto* LastSIDep = getConditionDependence(LastSI->getPointerOperand()); + auto* LastSIValue = dyn_cast(LastSI->getValueOperand()); + if (LastSIValue && LastSIValue->getOpcode() == Instruction::Select && + LastSIValue->getOperand(0) == LastSIDep) { + // XXX-comment: Maybe it's better for us to just leave it as an and/or + // dependence pattern. + // /* + IRBuilder Builder(LastSI); + auto* Address = + Builder.CreateSelect(LastSIDep, UntaintedAddress, UntaintedAddress); + LastSI->setOperand(1, Address); + DEBUG(dbgs() << "The last store becomes :" << *LastSI << "\n\n";); + // */ + } + } + + return true; +} + +bool PassDependenceToStore(Value* OldAddress, StoreInst* NewStore) { + Value* OldDep = getDependence(OldAddress); + // Return false when there's no dependence to pass from the OldAddress. + if (!OldDep) { + return false; + } + + // No need to pass the dependence to NewStore's address if it already depends + // on whatever 'OldAddress' depends on. 
+ if (StoreAddressDependOnValue(NewStore, OldDep)) { + return false; + } + return taintStoreAddress(NewStore, OldAddress); +} + +SmallSet FindDependence(Value* Val) { + SmallSet DepSet; + recursivelyFindDependence(&DepSet, Val, true /*Only insert leaf nodes*/); + return DepSet; +} + +bool StoreAddressDependOnValue(StoreInst* SI, Value* DepVal) { + return dependenceSetInclusion(SI->getPointerOperand(), DepVal); +} + +bool StoreDependOnValue(StoreInst* SI, Value* Dep) { + return dependenceSetInclusion(SI, Dep); +} + +} // namespace + + + bool CodeGenPrepare::runOnFunction(Function &F) { + bool EverMadeChange = false; + if (skipOptnoneFunction(F)) return false; DL = &F.getParent()->getDataLayout(); - bool EverMadeChange = false; // Clear per function information. InsertedInsts.clear(); PromotedInsts.clear(); @@ -212,6 +1285,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLI = TM->getSubtargetImpl(F)->getTargetLowering(); TLInfo = &getAnalysis().getTLI(); TTI = &getAnalysis().getTTI(F); + DT = &getAnalysis().getDomTree(); OptSize = F.optForSize(); /// This optimization identifies DIV instructions that can be @@ -219,8 +1293,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (!OptSize && TLI && TLI->isSlowDivBypassed()) { const DenseMap &BypassWidths = TLI->getBypassSlowDivWidths(); - for (Function::iterator I = F.begin(); I != F.end(); I++) - EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); + BasicBlock* BB = &*F.begin(); + while (BB != nullptr) { + // bypassSlowDivision may create new BBs, but we don't want to reapply the + // optimization to those blocks. + BasicBlock* Next = BB->getNextNode(); + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + BB = Next; + } } // Eliminate blocks that contain only PHI nodes and an @@ -305,6 +1385,30 @@ bool CodeGenPrepare::runOnFunction(Function &F) { EverMadeChange |= simplifyOffsetableRelocate(*I); } + // XXX-comment: Delay dealing with relaxed loads in this function to avoid + // further changes done by other passes (e.g., SimplifyCFG). + // Collect all the relaxed loads. + SmallSet MonotonicLoadInsts; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { + if (I->isAtomic()) { + switch (I->getOpcode()) { + case Instruction::Load: { + auto* LI = dyn_cast(&*I); + if (LI->getOrdering() == Monotonic && + !LI->getHasSubsequentAcqlRMW()) { + MonotonicLoadInsts.insert(LI); + } + break; + } + default: { + break; + } + } + } + } + EverMadeChange |= + AddFakeConditionalBranchAfterMonotonicLoads(MonotonicLoadInsts, DT); + return EverMadeChange; } @@ -351,7 +1455,6 @@ bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { BasicBlock *BB = &*I++; - // If this block doesn't end with an uncond branch, ignore it. BranchInst *BI = dyn_cast(BB->getTerminator()); if (!BI || !BI->isUnconditional()) @@ -399,7 +1502,7 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, const Instruction *UI = cast(U); if (UI->getParent() != DestBB || !isa(UI)) return false; - // If User is inside DestBB block and it is a PHINode then check + // IfUser is inside DestBB block and it is a PHINode then check // incoming value. If incoming value is not from BB then this is // a complex condition (e.g. preheaders) we want to avoid here. 
if (UI->getParent() == DestBB) { @@ -520,19 +1623,17 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { // Computes a map of base pointer relocation instructions to corresponding // derived pointer relocation instructions given a vector of all relocate calls static void computeBaseDerivedRelocateMap( - const SmallVectorImpl &AllRelocateCalls, - DenseMap> & - RelocateInstMap) { + const SmallVectorImpl &AllRelocateCalls, + DenseMap> + &RelocateInstMap) { // Collect information in two maps: one primarily for locating the base object // while filling the second map; the second map is the final structure holding // a mapping between Base and corresponding Derived relocate calls - DenseMap, IntrinsicInst *> RelocateIdxMap; - for (auto &U : AllRelocateCalls) { - GCRelocateOperands ThisRelocate(U); - IntrinsicInst *I = cast(U); - auto K = std::make_pair(ThisRelocate.getBasePtrIndex(), - ThisRelocate.getDerivedPtrIndex()); - RelocateIdxMap.insert(std::make_pair(K, I)); + DenseMap, GCRelocateInst *> RelocateIdxMap; + for (auto *ThisRelocate : AllRelocateCalls) { + auto K = std::make_pair(ThisRelocate->getBasePtrIndex(), + ThisRelocate->getDerivedPtrIndex()); + RelocateIdxMap.insert(std::make_pair(K, ThisRelocate)); } for (auto &Item : RelocateIdxMap) { std::pair Key = Item.first; @@ -540,7 +1641,7 @@ static void computeBaseDerivedRelocateMap( // Base relocation: nothing to insert continue; - IntrinsicInst *I = Item.second; + GCRelocateInst *I = Item.second; auto BaseKey = std::make_pair(Key.first, Key.first); // We're iterating over RelocateIdxMap so we cannot modify it. @@ -573,22 +1674,27 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, // Takes a RelocatedBase (base pointer relocation instruction) and Targets to // replace, computes a replacement, and affects it. static bool -simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, - const SmallVectorImpl &Targets) { +simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, + const SmallVectorImpl &Targets) { bool MadeChange = false; - for (auto &ToReplace : Targets) { - GCRelocateOperands MasterRelocate(RelocatedBase); - GCRelocateOperands ThisRelocate(ToReplace); - - assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() && + for (GCRelocateInst *ToReplace : Targets) { + assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() && "Not relocating a derived object of the original base object"); - if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) { + if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) { // A duplicate relocate call. TODO: coalesce duplicates. continue; } - Value *Base = ThisRelocate.getBasePtr(); - auto Derived = dyn_cast(ThisRelocate.getDerivedPtr()); + if (RelocatedBase->getParent() != ToReplace->getParent()) { + // Base and derived relocates are in different basic blocks. + // In this case transform is only valid when base dominates derived + // relocate. However it would be too expensive to check dominance + // for each such relocate, so we skip the whole transformation. + continue; + } + + Value *Base = ToReplace->getBasePtr(); + auto Derived = dyn_cast(ToReplace->getDerivedPtr()); if (!Derived || Derived->getPointerOperand() != Base) continue; @@ -624,21 +1730,20 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, // In this case, we can not find the bitcast any more. So we insert a new bitcast // no matter there is already one or not. 
In this way, we can handle all cases, and // the extra bitcast should be optimized away in later passes. - Instruction *ActualRelocatedBase = RelocatedBase; + Value *ActualRelocatedBase = RelocatedBase; if (RelocatedBase->getType() != Base->getType()) { ActualRelocatedBase = - cast(Builder.CreateBitCast(RelocatedBase, Base->getType())); + Builder.CreateBitCast(RelocatedBase, Base->getType()); } Value *Replacement = Builder.CreateGEP( Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV)); - Instruction *ReplacementInst = cast(Replacement); Replacement->takeName(ToReplace); // If the newly generated derived pointer's type does not match the original derived // pointer's type, cast the new derived pointer to match it. Same reasoning as above. - Instruction *ActualReplacement = ReplacementInst; - if (ReplacementInst->getType() != ToReplace->getType()) { + Value *ActualReplacement = Replacement; + if (Replacement->getType() != ToReplace->getType()) { ActualReplacement = - cast(Builder.CreateBitCast(ReplacementInst, ToReplace->getType())); + Builder.CreateBitCast(Replacement, ToReplace->getType()); } ToReplace->replaceAllUsesWith(ActualReplacement); ToReplace->eraseFromParent(); @@ -667,12 +1772,12 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, // %val = load %ptr' bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { bool MadeChange = false; - SmallVector AllRelocateCalls; + SmallVector AllRelocateCalls; for (auto *U : I.users()) - if (isGCRelocate(dyn_cast(U))) + if (GCRelocateInst *Relocate = dyn_cast(U)) // Collect all the relocate calls associated with a statepoint - AllRelocateCalls.push_back(U); + AllRelocateCalls.push_back(Relocate); // We need atleast one base pointer relocation + one derived pointer // relocation to mangle @@ -681,7 +1786,7 @@ bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { // RelocateInstMap is a mapping from the base relocate instruction to the // corresponding derived relocate instructions - DenseMap> RelocateInstMap; + DenseMap> RelocateInstMap; computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); if (RelocateInstMap.empty()) return false; @@ -716,6 +1821,12 @@ static bool SinkCast(CastInst *CI) { // Preincrement use iterator so we don't invalidate it. ++UI; + // If the block selected to receive the cast is an EH pad that does not + // allow non-PHI instructions before the terminator, we can't sink the + // cast. + if (UserBB->getTerminator()->isEHPad()) + continue; + // If this user is in the same block as the cast, don't change the cast. if (UserBB == DefBB) continue; @@ -810,7 +1921,7 @@ static bool CombineUAddWithOverflow(CmpInst *CI) { assert(*AddI->user_begin() == CI && "expected!"); #endif - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty); auto *InsertPt = AddI->hasOneUse() ? 
CI : AddI; @@ -1088,7 +2199,7 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, // <16 x i1> %mask, <16 x i32> %passthru) // to a chain of basic blocks, with loading element one-by-one if // the appropriate mask bit is set -// +// // %1 = bitcast i8* %addr to i32* // %2 = extractelement <16 x i1> %mask, i32 0 // %3 = icmp eq i1 %2, true @@ -1120,35 +2231,68 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, // static void ScalarizeMaskedLoad(CallInst *CI) { Value *Ptr = CI->getArgOperand(0); - Value *Src0 = CI->getArgOperand(3); + Value *Alignment = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); - VectorType *VecType = dyn_cast(CI->getType()); - Type *EltTy = VecType->getElementType(); + Value *Src0 = CI->getArgOperand(3); + unsigned AlignVal = cast(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast(CI->getType()); assert(VecType && "Unexpected return type of masked load intrinsic"); + Type *EltTy = CI->getType()->getVectorElementType(); + IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); BasicBlock *CondBlock = nullptr; BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); + Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa(Mask) && + cast(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. + AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr fron i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + Value *UndefVal = UndefValue::get(VecType); // The result vector Value *VResult = UndefVal; + if (isa(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + PHINode *Phi = nullptr; Value *PrevPhi = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration @@ -1181,7 +2325,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) { Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateLoad(Gep, false); + LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); // Create "else" block, fill it in the next iteration @@ -1219,12 +2363,12 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // %5 = getelementptr i32* %1, i32 0 // store i32 %4, i32* %5 // br label %else -// +// // else: ; preds = %0, %cond.store // %6 = extractelement <16 x i1> %mask, i32 1 // %7 = icmp eq i1 %6, true // br i1 %7, label %cond.store1, label %else2 -// +// // cond.store1: ; preds = %else // %8 = extractelement <16 x i32> %val, i32 1 // %9 = 
getelementptr i32* %1, i32 1 @@ -1232,34 +2376,61 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // br label %else2 // . . . static void ScalarizeMaskedStore(CallInst *CI) { - Value *Ptr = CI->getArgOperand(1); Value *Src = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); Value *Mask = CI->getArgOperand(3); + unsigned AlignVal = cast(Alignment)->getZExtValue(); VectorType *VecType = dyn_cast(Src->getType()); - Type *EltTy = VecType->getElementType(); - assert(VecType && "Unexpected data type in masked store intrinsic"); + Type *EltTy = VecType->getElementType(); + IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa(Mask) && + cast(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Builder.CreateAlignedStore(Src, Ptr, AlignVal); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. + AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr fron i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); + + if (isa(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.store, label %else + // br i1 %to_store, label %cond.store, label %else // Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, @@ -1278,7 +2449,7 @@ static void ScalarizeMaskedStore(CallInst *CI) { Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateStore(OneElt, Gep); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = @@ -1292,6 +2463,329 @@ static void ScalarizeMaskedStore(CallInst *CI) { CI->eraseFromParent(); } +// Translate a masked gather intrinsic like +// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, +// <16 x i1> %Mask, <16 x i32> %Src) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> %Mask, i32 0 +// % ToLoad0 = icmp eq i1 % Mask0, true +// br i1 % ToLoad0, label %cond.load, label %else +// +// cond.load: +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// % Load0 = load i32, i32* % Ptr0, align 4 +// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 +// br label %else +// +// else: +// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] +// % Mask1 = extractelement <16 x i1> %Mask, i32 1 +// % ToLoad1 = 
icmp eq i1 % Mask1, true +// br i1 % ToLoad1, label %cond.load1, label %else2 +// +// cond.load1: +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// % Load1 = load i32, i32* % Ptr1, align 4 +// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 +// br label %else2 +// . . . +// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void ScalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx), + "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = 
+// Translate a masked scatter intrinsic, like
+//   void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+//                                    <16 x i1> %Mask)
+// to a chain of basic blocks that stores the elements one by one when the
+// appropriate mask bit is set.
+//
+//  %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+//  %Mask0 = extractelement <16 x i1> %Mask, i32 0
+//  %ToStore0 = icmp eq i1 %Mask0, true
+//  br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+//  %Elt0 = extractelement <16 x i32> %Src, i32 0
+//  %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+//  store i32 %Elt0, i32* %Ptr0, align 4
+//  br label %else
+//
+// else:
+//  %Mask1 = extractelement <16 x i1> %Mask, i32 1
+//  %ToStore1 = icmp eq i1 %Mask1, true
+//  br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+//  %Elt1 = extractelement <16 x i32> %Src, i32 1
+//  %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+//  store i32 %Elt1, i32* %Ptr1, align 4
+//  br label %else2
+// . . .
+static void ScalarizeMaskedScatter(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptrs = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  assert(isa<VectorType>(Src->getType()) &&
+         "Unexpected data type in masked scatter intrinsic");
+  assert(isa<VectorType>(Ptrs->getType()) &&
+         isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+         "Vector of pointers is expected in masked scatter intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+  // Take a shortcut if the mask is a vector of constants.
+  bool IsConstMask = isa<ConstantVector>(Mask);
+
+  if (IsConstMask) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+                                                   "Elt" + Twine(Idx));
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                                "Ptr" + Twine(Idx));
+      Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+    //  %ToStore = icmp eq i1 %Mask1, true
+    //  br i1 %ToStore, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask,
+                                                    Builder.getInt32(Idx),
+                                                    "Mask" + Twine(Idx));
+    Value *Cmp =
+        Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                           ConstantInt::get(Predicate->getType(), 1),
+                           "ToStore" + Twine(Idx));
+
+    // Create "cond" block
+    //
+    //  %Elt1 = extractelement <16 x i32> %Src, i32 1
+    //  %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+    //  store i32 %Elt1, i32* %Ptr1
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+                                                 "Elt" + Twine(Idx));
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                              "Ptr" + Twine(Idx));
+    Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
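For completeness, the scalar model of the scatter semantics; illustrative C++
with invented names, not code from the patch:

#include <cstddef>
#include <cstdint>

// Scalar semantics of a masked scatter: each active lane stores through its
// own pointer; nothing is written for inactive lanes.
void maskedScatterScalar(const int32_t *Src, int32_t *const *Ptrs,
                         const bool *Mask, size_t NumElts) {
  for (size_t I = 0; I != NumElts; ++I)
    if (Mask[I])
      *Ptrs[I] = Src[I];
}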
+/// If counting leading or trailing zeros is an expensive operation and a zero
+/// input is defined, add a check for zero to avoid calling the intrinsic.
+///
+/// We want to transform:
+///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+///
+/// into:
+///   entry:
+///     %cmpz = icmp eq i64 %A, 0
+///     br i1 %cmpz, label %cond.end, label %cond.false
+///   cond.false:
+///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
+///     br label %cond.end
+///   cond.end:
+///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
+///
+/// If the transform is performed, return true and set ModifiedDT to true.
+static bool despeculateCountZeros(IntrinsicInst *CountZeros,
+                                  const TargetLowering *TLI,
+                                  const DataLayout *DL,
+                                  bool &ModifiedDT) {
+  if (!TLI || !DL)
+    return false;
+
+  // If a zero input is undefined, it doesn't make sense to despeculate that.
+  if (match(CountZeros->getOperand(1), m_One()))
+    return false;
+
+  // If it's cheap to speculate, there's nothing to do.
+  auto IntrinsicID = CountZeros->getIntrinsicID();
+  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
+      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
+    return false;
+
+  // Only handle legal scalar cases. Anything else requires too much work.
+  Type *Ty = CountZeros->getType();
+  unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
+  if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize())
+    return false;
+
+  // The intrinsic will be sunk behind a compare against zero and branch.
+  BasicBlock *StartBlock = CountZeros->getParent();
+  BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
+
+  // Create another block after the count zero intrinsic. A PHI will be added
+  // in this block to select the result of the intrinsic or the bit-width
+  // constant if the input to the intrinsic is zero.
+  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
+  BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
+
+  // Set up a builder to create a compare, conditional branch, and PHI.
+  IRBuilder<> Builder(CountZeros->getContext());
+  Builder.SetInsertPoint(StartBlock->getTerminator());
+  Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
+
+  // Replace the unconditional branch that was created by the first split with
+  // a compare against zero and a conditional branch.
+  Value *Zero = Constant::getNullValue(Ty);
+  Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
+  Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
+  StartBlock->getTerminator()->eraseFromParent();
+
+  // Create a PHI in the end block to select either the output of the
+  // intrinsic or the bit width of the operand.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
+  CountZeros->replaceAllUsesWith(PN);
+  Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
+  PN->addIncoming(BitWidth, StartBlock);
+  PN->addIncoming(CountZeros, CallBlock);
+
+  // We are explicitly handling the zero case, so we can set the intrinsic's
+  // undefined zero argument to 'true'. This will also prevent reprocessing
+  // the intrinsic; we only despeculate when a zero input is defined.
+  CountZeros->setArgOperand(1, Builder.getTrue());
+  ModifiedDT = true;
+  return true;
+}
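The guarded form the transform produces, restated as ordinary C++; a minimal
sketch assuming the GCC/Clang __builtin_ctzll builtin (which, like the
unguarded intrinsic, is undefined for a zero input):

#include <cassert>
#include <cstdint>

// Branch around the count and substitute the bit width when the input is
// zero, mirroring the PHI in %cond.end above.
uint64_t cttz64Guarded(uint64_t A) {
  if (A == 0)
    return 64;                            // the PHI's constant from %entry
  return (uint64_t)__builtin_ctzll(A);    // defined: A is non-zero here
}

int main() {
  assert(cttz64Guarded(0) == 64);
  assert(cttz64Guarded(8) == 3);
}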
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
@@ -1339,8 +2833,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     // over-aligning global variables that have an explicit section is
     // forbidden.
     GlobalVariable *GV;
-    if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() &&
-        !GV->hasSection() && GV->getAlignment() < PrefAlign &&
+    if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
+        GV->getAlignment() < PrefAlign &&
         DL->getTypeAllocSize(GV->getType()->getElementType()) >=
             MinSize + Offset2)
       GV->setAlignment(PrefAlign);
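For comparison, the predicate the removed lines expressed inline; this is only
a restatement of the old condition, not the implementation of
GlobalValue::canIncreaseAlignment(), which centralizes and extends these
checks:

// Rough shape of the old inline test: a global may be over-aligned only if
// it has a unique definition here and no explicit section pinning its layout.
bool mayOverAlignSketch(bool HasUniqueInitializer, bool HasExplicitSection) {
  return HasUniqueInitializer && !HasExplicitSection;
}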
@@ -1384,7 +2878,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   }
   case Intrinsic::masked_load: {
     // Scalarize unsupported vector masked load
-    if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) {
+    if (!TTI->isLegalMaskedLoad(CI->getType())) {
       ScalarizeMaskedLoad(CI);
       ModifiedDT = true;
       return true;
@@ -1392,13 +2886,29 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     return false;
   }
   case Intrinsic::masked_store: {
-    if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
+    if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
       ScalarizeMaskedStore(CI);
       ModifiedDT = true;
       return true;
     }
     return false;
   }
+  case Intrinsic::masked_gather: {
+    if (!TTI->isLegalMaskedGather(CI->getType())) {
+      ScalarizeMaskedGather(CI);
+      ModifiedDT = true;
+      return true;
+    }
+    return false;
+  }
+  case Intrinsic::masked_scatter: {
+    if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+      ScalarizeMaskedScatter(CI);
+      ModifiedDT = true;
+      return true;
+    }
+    return false;
+  }
   case Intrinsic::aarch64_stlxr:
   case Intrinsic::aarch64_stxr: {
     ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
@@ -1416,6 +2926,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     II->replaceAllUsesWith(II->getArgOperand(0));
     II->eraseFromParent();
     return true;
+
+  case Intrinsic::cttz:
+  case Intrinsic::ctlz:
+    // If counting zeros is expensive, try to avoid it.
+    return despeculateCountZeros(II, TLI, DL, ModifiedDT);
   }

   if (TLI) {
@@ -2300,9 +3815,7 @@ class TypePromotionHelper {
   /// \brief Utility function to determine if \p OpIdx should be promoted when
   /// promoting \p Inst.
   static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
-    if (isa<SelectInst>(Inst) && OpIdx == 0)
-      return false;
-    return true;
+    return !(isa<SelectInst>(Inst) && OpIdx == 0);
   }

   /// \brief Utility function to promote the operand of \p Ext when this
@@ -2439,10 +3952,8 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
       return false;

     // #2 check that the truncate just drops extended bits.
-    if (Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth())
-      return true;
-
-    return false;
+    return Inst->getType()->getIntegerBitWidth() >=
+           OpndType->getIntegerBitWidth();
 }

 TypePromotionHelper::Action TypePromotionHelper::getAction(
@@ -3846,15 +5357,197 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
   return MadeChange;
 }
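The "#2" truncate check in canGetThrough above, isolated as a worked example;
an illustrative helper, not code from the patch:

#include <cassert>

// trunc(ext(x)) keeps every significant bit of x whenever the trunc result
// is at least as wide as the extension's source type.
bool truncJustDropsExtendedBits(unsigned TruncDstBits, unsigned ExtSrcBits) {
  return TruncDstBits >= ExtSrcBits;
}

int main() {
  // trunc (sext i8 to i64) to i16: 16 >= 8, all 8 source bits survive.
  assert(truncJustDropsExtendedBits(16, 8));
  // trunc (zext i32 to i64) to i16: 16 < 32, real bits are dropped.
  assert(!truncJustDropsExtendedBits(16, 32));
}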
+// Find loads whose uses only use some of the loaded value's bits. Add an "and"
+// just after the load if the target can fold this into one extload
+// instruction, with the hope of eliminating some of the other later "and"
+// instructions using the loaded value. "and"s that are made trivially
+// redundant by the insertion of the new "and" are removed by this function,
+// while others (e.g. those whose path from the load goes through a phi) are
+// left for isel to potentially remove.
+//
+// For example:
+//
+// b0:
+//   x = load i32
+//   ...
+// b1:
+//   y = and x, 0xff
+//   z = use y
+//
+// becomes:
+//
+// b0:
+//   x = load i32
+//   x' = and x, 0xff
+//   ...
+// b1:
+//   z = use x'
+//
+// whereas:
+//
+// b0:
+//   x1 = load i32
+//   ...
+// b1:
+//   x2 = load i32
+//   ...
+// b2:
+//   x = phi x1, x2
+//   y = and x, 0xff
+//
+// becomes (after a call to optimizeLoadExt for each load):
+//
+// b0:
+//   x1 = load i32
+//   x1' = and x1, 0xff
+//   ...
+// b1:
+//   x2 = load i32
+//   x2' = and x2, 0xff
+//   ...
+// b2:
+//   x = phi x1', x2'
+//   y = and x, 0xff
+//
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+  if (!Load->isSimple() ||
+      !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+    return false;
+
+  // Skip loads we've already transformed or have no reason to transform.
+  if (Load->hasOneUse()) {
+    User *LoadUser = *Load->user_begin();
+    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
+        !dyn_cast<PHINode>(LoadUser))
+      return false;
+  }
+
+  // Look at all uses of Load, looking through phis, to determine how many
+  // bits of the loaded value are needed.
+  SmallVector<Instruction *, 8> WorkList;
+  SmallPtrSet<Instruction *, 16> Visited;
+  SmallVector<Instruction *, 8> AndsToMaybeRemove;
+  for (auto *U : Load->users())
+    WorkList.push_back(cast<Instruction>(U));
+
+  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+  unsigned BitWidth = LoadResultVT.getSizeInBits();
+  APInt DemandBits(BitWidth, 0);
+  APInt WidestAndBits(BitWidth, 0);
+
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    // Break use-def graph loops.
+    if (!Visited.insert(I).second)
+      continue;
+
+    // For a PHI node, push all of its users.
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      for (auto *U : Phi->users())
+        WorkList.push_back(cast<Instruction>(U));
+      continue;
+    }
+
+    switch (I->getOpcode()) {
+    case llvm::Instruction::And: {
+      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!AndC)
+        return false;
+      APInt AndBits = AndC->getValue();
+      DemandBits |= AndBits;
+      // Keep track of the widest and mask we see.
+      if (AndBits.ugt(WidestAndBits))
+        WidestAndBits = AndBits;
+      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
+        AndsToMaybeRemove.push_back(I);
+      break;
+    }
+
+    case llvm::Instruction::Shl: {
+      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!ShlC)
+        return false;
+      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
+      auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
+      DemandBits |= ShlDemandBits;
+      break;
+    }
+
+    case llvm::Instruction::Trunc: {
+      EVT TruncVT = TLI->getValueType(*DL, I->getType());
+      unsigned TruncBitWidth = TruncVT.getSizeInBits();
+      auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
+      DemandBits |= TruncBits;
+      break;
+    }
+
+    default:
+      return false;
+    }
+  }
+
+  uint32_t ActiveBits = DemandBits.getActiveBits();
+  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
+  // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
+  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
+  // (and (load x) 1) is not matched as a single instruction, rather as a LDR
+  // followed by an AND.
+  // TODO: Look into removing this restriction by fixing backends to either
+  // return false for isLoadExtLegal for i1 or have them select this pattern to
+  // a single instruction.
+  //
+  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
+  // mask, since these are the only ands that will be removed by isel.
+  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+      WidestAndBits != DemandBits)
+    return false;
+
+  LLVMContext &Ctx = Load->getType()->getContext();
+  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
+  EVT TruncVT = TLI->getValueType(*DL, TruncTy);
+
+  // Reject cases that won't be matched as extloads.
+  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
+      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
+    return false;
+
+  IRBuilder<> Builder(Load->getNextNode());
+  auto *NewAnd = dyn_cast<Instruction>(
+      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+
+  // Replace all uses of load with new and (except for the use of load in the
+  // new and itself).
+  Load->replaceAllUsesWith(NewAnd);
+  NewAnd->setOperand(0, Load);
+
+  // Remove any and instructions that are now redundant.
+  for (auto *And : AndsToMaybeRemove)
+    // Check that the and mask is the same as the one we decided to put on the
+    // new and.
+    if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
+      And->replaceAllUsesWith(NewAnd);
+      if (&*CurInstIterator == And)
+        CurInstIterator = std::next(And->getIterator());
+      And->eraseFromParent();
+      ++NumAndUses;
+    }
+
+  ++NumAndsAdded;
+  return true;
+}
+
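A worked example of the demanded-bits bookkeeping above, for a 32-bit load
used by (and x, 0xff) and (shl x, 24); plain uint32_t stands in for APInt in
this illustrative sketch:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t DemandBits = 0;
  DemandBits |= 0xffu;            // and x, 0xff demands the low 8 bits
  DemandBits |= UINT32_MAX >> 24; // shl x, 24 also demands only the low 8
  // The union is still a low mask with 8 active bits, so the load can be
  // rewritten as an i8 zero-extending load plus a single and.
  assert(DemandBits == 0xffu);
}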
 /// Check if V (an operand of a select instruction) is an expensive instruction
 /// that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
   auto *I = dyn_cast<Instruction>(V);
-  if (I && I->hasOneUse() &&
-      TTI->getUserCost(I) == TargetTransformInfo::TCC_Expensive)
-    return true;
-
-  return false;
+  // If it's safe to speculatively execute, then it should not have side
+  // effects; therefore, it's safe to sink and possibly *not* execute.
+  return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
+         TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
 }

 /// Returns true if a SelectInst should be turned into an explicit branch.
@@ -4083,6 +5776,49 @@ bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
   return MadeChange;
 }

+bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
+  if (!TLI || !DL)
+    return false;
+
+  Value *Cond = SI->getCondition();
+  Type *OldType = Cond->getType();
+  LLVMContext &Context = Cond->getContext();
+  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
+  unsigned RegWidth = RegType.getSizeInBits();
+
+  if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
+    return false;
+
+  // If the register width is greater than the type width, expand the condition
+  // of the switch instruction and each case constant to the width of the
+  // register. By widening the type of the switch condition, subsequent
+  // comparisons (for case comparisons) will not need to be extended to the
+  // preferred register width, so we will potentially eliminate N-1 extends,
+  // where N is the number of cases in the switch.
+  auto *NewType = Type::getIntNTy(Context, RegWidth);
+
+  // Zero-extend the switch condition and case constants unless the switch
+  // condition is a function argument that is already being sign-extended.
+  // In that case, we can avoid an unnecessary mask/extension by sign-extending
+  // everything instead.
+  Instruction::CastOps ExtType = Instruction::ZExt;
+  if (auto *Arg = dyn_cast<Argument>(Cond))
+    if (Arg->hasSExtAttr())
+      ExtType = Instruction::SExt;
+
+  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+  ExtInst->insertBefore(SI);
+  SI->setCondition(ExtInst);
+  for (SwitchInst::CaseIt Case : SI->cases()) {
+    APInt NarrowConst = Case.getCaseValue()->getValue();
+    APInt WideConst = (ExtType == Instruction::ZExt) ?
+                      NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+    Case.setValue(ConstantInt::get(Context, WideConst));
+  }
+
+  return true;
+}
+
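Why the condition and the case constants must be widened the same way, shown
on an i8 condition widened to 32 bits; an illustrative sketch, not code from
the patch:

#include <cassert>
#include <cstdint>

int main() {
  int8_t Cond = -1;
  // Sign-extend both sides: case -1 widens to -1 and still matches.
  assert((int32_t)Cond == -1);
  // Zero-extend both sides: the same case must widen to 0xff to match.
  assert((uint32_t)(uint8_t)Cond == 0xffu);
}

Mixing the two extension kinds would silently break case matching, which is
why the pass picks one cast opcode and applies it to both the condition and
every case value.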
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.
@@ -4505,8 +6241,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
-      return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
@@ -4555,12 +6293,33 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return optimizeShuffleVectorInst(SVI);

+  if (auto *Switch = dyn_cast<SwitchInst>(I))
+    return optimizeSwitchInst(Switch);
+
   if (isa<ExtractElementInst>(I))
     return optimizeExtractElementInst(I);

   return false;
 }

+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+static bool makeBitReverse(Instruction &I, const DataLayout &DL,
+                           const TargetLowering &TLI) {
+  if (!I.getType()->isIntegerTy() ||
+      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
+                                    TLI.getValueType(DL, I.getType(), true)))
+    return false;
+
+  SmallVector<Instruction *, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
+    return false;
+  Instruction *LastInst = Insts.back();
+  I.replaceAllUsesWith(LastInst);
+  RecursivelyDeleteTriviallyDeadInstructions(&I);
+  return true;
+}
+
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
@@ -4574,8 +6333,19 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
       if (ModifiedDT) return true;
     }

-  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+  bool MadeBitReverse = true;
+  while (TLI && MadeBitReverse) {
+    MadeBitReverse = false;
+    for (auto &I : reverse(BB)) {
+      if (makeBitReverse(I, *DL, *TLI)) {
+        MadeBitReverse = MadeChange = true;
+        break;
+      }
+    }
+  }
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+
   return MadeChange;
 }

@@ -4601,6 +6371,10 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
         Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());

         if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+          // If VI is a phi in a block with an EHPad terminator, we can't
+          // insert after it.
+          if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
+            continue;
           DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
           DVI->removeFromParent();
           if (isa<PHINode>(VI))
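A scalar model of the idiom that makeBitReverse's matcher looks for, shown for
i8; an illustrative sketch (the pass replaces such shift-and-or chains with a
single @llvm.bitreverse call rather than a loop):

#include <cassert>
#include <cstdint>

uint8_t reverseBits8(uint8_t X) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (uint8_t)((R << 1) | ((X >> I) & 1)); // bit I moves to bit 7-I
  return R;
}

int main() {
  assert(reverseBits8(0x01) == 0x80);
  assert(reverseBits8(0xF0) == 0x0F);
}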