diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 87669d772d7..d619044e86c 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -64,6 +64,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
                       "computations were sunk");
 STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
 STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsAdded,
+          "Number of and mask instructions added to form ext loads");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
 STATISTIC(NumRetsDup, "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
@@ -173,8 +176,10 @@ class TypePromotionTransaction;
     bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
     bool moveExtToFormExtLoad(Instruction *&I);
     bool optimizeExtUses(Instruction *I);
+    bool optimizeLoadExt(LoadInst *I);
     bool optimizeSelectInst(SelectInst *SI);
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
+    bool optimizeSwitchInst(SwitchInst *CI);
     bool optimizeExtractElementInst(Instruction *Inst);
     bool dupRetToEnableTailCallOpts(BasicBlock *BB);
     bool placeDbgValues(Function &F);
@@ -588,6 +593,14 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
       continue;
     }
 
+    if (RelocatedBase->getParent() != ToReplace->getParent()) {
+      // Base and derived relocates are in different basic blocks. In this case
+      // the transform is only valid when the base dominates the derived
+      // relocate. However, it would be too expensive to check dominance for
+      // each such relocate, so we skip the whole transformation.
+      continue;
+    }
+
     Value *Base = ThisRelocate.getBasePtr();
     auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr());
     if (!Derived || Derived->getPointerOperand() != Base)
@@ -717,6 +730,12 @@ static bool SinkCast(CastInst *CI) {
     // Preincrement use iterator so we don't invalidate it.
     ++UI;
 
+    // If the block selected to receive the cast is an EH pad that does not
+    // allow non-PHI instructions before the terminator, we can't sink the
+    // cast.
+    if (UserBB->getTerminator()->isEHPad())
+      continue;
+
     // If this user is in the same block as the cast, don't change the cast.
     if (UserBB == DefBB) continue;
 
@@ -1597,6 +1616,85 @@ static void ScalarizeMaskedScatter(CallInst *CI) {
   CI->eraseFromParent();
 }
 
+/// If counting leading or trailing zeros is an expensive operation and a zero
+/// input is defined, add a check for zero to avoid calling the intrinsic.
+///
+/// We want to transform:
+///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+///
+/// into:
+///   entry:
+///     %cmpz = icmp eq i64 %A, 0
+///     br i1 %cmpz, label %cond.end, label %cond.false
+///   cond.false:
+///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
+///     br label %cond.end
+///   cond.end:
+///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
+///
+/// If the transform is performed, return true and set ModifiedDT to true.
+static bool despeculateCountZeros(IntrinsicInst *CountZeros,
+                                  const TargetLowering *TLI,
+                                  const DataLayout *DL,
+                                  bool &ModifiedDT) {
+  if (!TLI || !DL)
+    return false;
+
+  // If a zero input is undefined, it doesn't make sense to despeculate that.
+  if (match(CountZeros->getOperand(1), m_One()))
+    return false;
+
+  // If it's cheap to speculate, there's nothing to do.
+  auto IntrinsicID = CountZeros->getIntrinsicID();
+  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
+      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
+    return false;
+
+  // Only handle legal scalar cases. Anything else requires too much work.
+  Type *Ty = CountZeros->getType();
+  unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
+  if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize())
+    return false;
+
+  // The intrinsic will be sunk behind a compare against zero and branch.
+  BasicBlock *StartBlock = CountZeros->getParent();
+  BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
+
+  // Create another block after the count zero intrinsic. A PHI will be added
+  // in this block to select the result of the intrinsic or the bit-width
+  // constant if the input to the intrinsic is zero.
+  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
+  BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
+
+  // Set up a builder to create a compare, conditional branch, and PHI.
+  IRBuilder<> Builder(CountZeros->getContext());
+  Builder.SetInsertPoint(StartBlock->getTerminator());
+  Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
+
+  // Replace the unconditional branch that was created by the first split with
+  // a compare against zero and a conditional branch.
+  Value *Zero = Constant::getNullValue(Ty);
+  Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
+  Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
+  StartBlock->getTerminator()->eraseFromParent();
+
+  // Create a PHI in the end block to select either the output of the intrinsic
+  // or the bit width of the operand.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
+  CountZeros->replaceAllUsesWith(PN);
+  Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
+  PN->addIncoming(BitWidth, StartBlock);
+  PN->addIncoming(CountZeros, CallBlock);
+
+  // We are explicitly handling the zero case, so we can set the intrinsic's
+  // undefined zero argument to 'true'. This will also prevent reprocessing the
+  // intrinsic; we only despeculate when a zero input is defined.
+  CountZeros->setArgOperand(1, Builder.getTrue());
+  ModifiedDT = true;
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -1737,6 +1835,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
       II->replaceAllUsesWith(II->getArgOperand(0));
       II->eraseFromParent();
       return true;
+
+    case Intrinsic::cttz:
+    case Intrinsic::ctlz:
+      // If counting zeros is expensive, try to avoid it.
+      return despeculateCountZeros(II, TLI, DL, ModifiedDT);
     }
 
     if (TLI) {
@@ -4163,6 +4266,189 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
   return MadeChange;
 }
 
+// Find loads whose uses only use some of the loaded value's bits. Add an "and"
+// just after the load if the target can fold this into one extload instruction,
+// with the hope of eliminating some of the other later "and" instructions using
+// the loaded value. "and"s that are made trivially redundant by the insertion
+// of the new "and" are removed by this function, while others (e.g. those whose
+// path from the load goes through a phi) are left for isel to potentially
+// remove.
+//
+// For example:
+//
+// b0:
+//   x = load i32
+//   ...
+//
+// b1:
+//   y = and x, 0xff
+//   z = use y
+//
+// becomes:
+//
+// b0:
+//   x = load i32
+//   x' = and x, 0xff
+//   ...
+//
+// b1:
+//   z = use x'
+//
+// whereas:
+//
+// b0:
+//   x1 = load i32
+//   ...
+//
+// b1:
+//   x2 = load i32
+//   ...
+//
+// b2:
+//   x = phi x1, x2
+//   y = and x, 0xff
+//
+// becomes (after a call to optimizeLoadExt for each load):
+//
+// b0:
+//   x1 = load i32
+//   x1' = and x1, 0xff
+//   ...
+//
+// b1:
+//   x2 = load i32
+//   x2' = and x2, 0xff
+//   ...
+//
+// b2:
+//   x = phi x1', x2'
+//   y = and x, 0xff
+//
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+
+  if (!Load->isSimple() ||
+      !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+    return false;
+
+  // Skip loads we've already transformed or have no reason to transform.
+  if (Load->hasOneUse()) {
+    User *LoadUser = *Load->user_begin();
+    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
+        !dyn_cast<PHINode>(LoadUser))
+      return false;
+  }
+
+  // Look at all uses of Load, looking through phis, to determine how many bits
+  // of the loaded value are needed.
+  SmallVector<Instruction *, 8> WorkList;
+  SmallPtrSet<Instruction *, 16> Visited;
+  SmallVector<Instruction *, 8> AndsToMaybeRemove;
+  for (auto *U : Load->users())
+    WorkList.push_back(cast<Instruction>(U));
+
+  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+  unsigned BitWidth = LoadResultVT.getSizeInBits();
+  APInt DemandBits(BitWidth, 0);
+  APInt WidestAndBits(BitWidth, 0);
+
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    // Break use-def graph loops.
+    if (!Visited.insert(I).second)
+      continue;
+
+    // For a PHI node, push all of its users.
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      for (auto *U : Phi->users())
+        WorkList.push_back(cast<Instruction>(U));
+      continue;
+    }
+
+    switch (I->getOpcode()) {
+    case llvm::Instruction::And: {
+      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!AndC)
+        return false;
+      APInt AndBits = AndC->getValue();
+      DemandBits |= AndBits;
+      // Keep track of the widest and mask we see.
+      if (AndBits.ugt(WidestAndBits))
+        WidestAndBits = AndBits;
+      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
+        AndsToMaybeRemove.push_back(I);
+      break;
+    }
+
+    case llvm::Instruction::Shl: {
+      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!ShlC)
+        return false;
+      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
+      auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
+      DemandBits |= ShlDemandBits;
+      break;
+    }
+
+    case llvm::Instruction::Trunc: {
+      EVT TruncVT = TLI->getValueType(*DL, I->getType());
+      unsigned TruncBitWidth = TruncVT.getSizeInBits();
+      auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
+      DemandBits |= TruncBits;
+      break;
+    }
+
+    default:
+      return false;
+    }
+  }
+
+  uint32_t ActiveBits = DemandBits.getActiveBits();
+  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
+  // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
+  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
+  // (and (load x) 1) is not matched as a single instruction, rather as a LDR
+  // followed by an AND.
+  // TODO: Look into removing this restriction by fixing backends to either
+  // return false for isLoadExtLegal for i1 or have them select this pattern to
+  // a single instruction.
+  //
+  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
+  // mask, since these are the only ands that will be removed by isel.
+  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+      WidestAndBits != DemandBits)
+    return false;
+
+  LLVMContext &Ctx = Load->getType()->getContext();
+  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
+  EVT TruncVT = TLI->getValueType(*DL, TruncTy);
+
+  // Reject cases that won't be matched as extloads.
+  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
+      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
+    return false;
+
+  IRBuilder<> Builder(Load->getNextNode());
+  auto *NewAnd = dyn_cast<Instruction>(
+      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+
+  // Replace all uses of the load with the new and (except for the use of the
+  // load in the new and itself).
+  Load->replaceAllUsesWith(NewAnd);
+  NewAnd->setOperand(0, Load);
+
+  // Remove any and instructions that are now redundant.
+  for (auto *And : AndsToMaybeRemove)
+    // Check that the and mask is the same as the one we decided to put on the
+    // new and.
+    if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
+      And->replaceAllUsesWith(NewAnd);
+      if (&*CurInstIterator == And)
+        CurInstIterator = std::next(And->getIterator());
+      And->eraseFromParent();
+      ++NumAndUses;
+    }
+
+  ++NumAndsAdded;
+  return true;
+}
+
 /// Check if V (an operand of a select instruction) is an expensive instruction
 /// that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -4399,6 +4685,49 @@ bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
   return MadeChange;
 }
 
+bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
+  if (!TLI || !DL)
+    return false;
+
+  Value *Cond = SI->getCondition();
+  Type *OldType = Cond->getType();
+  LLVMContext &Context = Cond->getContext();
+  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
+  unsigned RegWidth = RegType.getSizeInBits();
+
+  if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
+    return false;
+
+  // If the register width is greater than the type width, expand the condition
+  // of the switch instruction and each case constant to the width of the
+  // register. By widening the type of the switch condition, subsequent
+  // comparisons (for case comparisons) will not need to be extended to the
+  // preferred register width, so we will potentially eliminate N-1 extends,
+  // where N is the number of cases in the switch.
+  auto *NewType = Type::getIntNTy(Context, RegWidth);
+
+  // Zero-extend the switch condition and case constants unless the switch
+  // condition is a function argument that is already being sign-extended.
+  // In that case, we can avoid an unnecessary mask/extension by sign-extending
+  // everything instead.
+  Instruction::CastOps ExtType = Instruction::ZExt;
+  if (auto *Arg = dyn_cast<Argument>(Cond))
+    if (Arg->hasSExtAttr())
+      ExtType = Instruction::SExt;
+
+  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+  ExtInst->insertBefore(SI);
+  SI->setCondition(ExtInst);
+  for (SwitchInst::CaseIt Case : SI->cases()) {
+    APInt NarrowConst = Case.getCaseValue()->getValue();
+    APInt WideConst = (ExtType == Instruction::ZExt) ?
+                      NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+    Case.setValue(ConstantInt::get(Context, WideConst));
+  }
+
+  return true;
+}
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.
@@ -4821,8 +5150,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
-      return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
@@ -4871,6 +5202,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return optimizeShuffleVectorInst(SVI);
 
+  if (auto *Switch = dyn_cast<SwitchInst>(I))
+    return optimizeSwitchInst(Switch);
+
   if (isa<ExtractElementInst>(I))
     return optimizeExtractElementInst(I);
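
Illustrative sketch (not part of the diff above): for a target like AArch64 whose narrowest legal integer register is 32 bits, the switch widening added by optimizeSwitchInst amounts to the IR rewrite below. The function @f, the block labels, and the %cond name are invented for the example.

Before CodeGenPrepare:

  define void @f(i8 %c) {
  entry:
    switch i8 %c, label %exit [
      i8 1, label %bb1
      i8 2, label %bb2
    ]
  bb1:
    br label %exit
  bb2:
    br label %exit
  exit:
    ret void
  }

After CodeGenPrepare, the condition and every case constant are zero-extended to the 32-bit register width, so the per-case comparisons no longer each need their own extension:

  define void @f(i8 %c) {
  entry:
    %cond = zext i8 %c to i32
    switch i32 %cond, label %exit [
      i32 1, label %bb1
      i32 2, label %bb2
    ]
  bb1:
    br label %exit
  bb2:
    br label %exit
  exit:
    ret void
  }

If %c were a function argument carrying the signext attribute, the pass would use sext instead of zext so the argument's existing sign extension can be reused.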