+namespace {
+/// \brief Helper class to promote a scalar operation to a vector one.
+/// This class is used to move downward extractelement transition.
+/// E.g.,
+/// a = vector_op <2 x i32>
+/// b = extractelement <2 x i32> a, i32 0
+/// c = scalar_op b
+/// store c
+///
+/// =>
+/// a = vector_op <2 x i32>
+/// c = vector_op a (equivalent to scalar_op on the related lane)
+/// * d = extractelement <2 x i32> c, i32 0
+/// * store d
+/// Assuming both extractelement and store can be combine, we get rid of the
+/// transition.
+class VectorPromoteHelper {
+ /// Used to perform some checks on the legality of vector operations.
+ const TargetLowering &TLI;
+
+ /// Used to estimated the cost of the promoted chain.
+ const TargetTransformInfo &TTI;
+
+ /// The transition being moved downwards.
+ Instruction *Transition;
+ /// The sequence of instructions to be promoted.
+ SmallVector<Instruction *, 4> InstsToBePromoted;
+ /// Cost of combining a store and an extract.
+ unsigned StoreExtractCombineCost;
+ /// Instruction that will be combined with the transition.
+ Instruction *CombineInst;
+
+ /// \brief The instruction that represents the current end of the transition.
+ /// Since we are faking the promotion until we reach the end of the chain
+ /// of computation, we need a way to get the current end of the transition.
+ Instruction *getEndOfTransition() const {
+ if (InstsToBePromoted.empty())
+ return Transition;
+ return InstsToBePromoted.back();
+ }
+
+ /// \brief Return the index of the original value in the transition.
+ /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+ /// c, is at index 0.
+ unsigned getTransitionOriginalValueIdx() const {
+ assert(isa<ExtractElementInst>(Transition) &&
+ "Other kind of transitions are not supported yet");
+ return 0;
+ }
+
+ /// \brief Return the index of the index in the transition.
+ /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+ /// is at index 1.
+ unsigned getTransitionIdx() const {
+ assert(isa<ExtractElementInst>(Transition) &&
+ "Other kind of transitions are not supported yet");
+ return 1;
+ }
+
+ /// \brief Get the type of the transition.
+ /// This is the type of the original value.
+ /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+ /// transition is <2 x i32>.
+ Type *getTransitionType() const {
+ return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+ }
+
+ /// \brief Promote \p ToBePromoted by moving \p Def downward through.
+ /// I.e., we have the following sequence:
+ /// Def = Transition <ty1> a to <ty2>
+ /// b = ToBePromoted <ty2> Def, ...
+ /// =>
+ /// b = ToBePromoted <ty1> a, ...
+ /// Def = Transition <ty1> ToBePromoted to <ty2>
+ void promoteImpl(Instruction *ToBePromoted);
+
+ /// \brief Check whether or not it is profitable to promote all the
+ /// instructions enqueued to be promoted.
+ bool isProfitableToPromote() {
+ Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
+ unsigned Index = isa<ConstantInt>(ValIdx)
+ ? cast<ConstantInt>(ValIdx)->getZExtValue()
+ : -1;
+ Type *PromotedType = getTransitionType();
+
+ StoreInst *ST = cast<StoreInst>(CombineInst);
+ unsigned AS = ST->getPointerAddressSpace();
+ unsigned Align = ST->getAlignment();
+ // Check if this store is supported.
+ if (!TLI.allowsMisalignedMemoryAccesses(
+ TLI.getValueType(ST->getValueOperand()->getType()), AS, Align)) {
+ // If this is not supported, there is no way we can combine
+ // the extract with the store.
+ return false;
+ }
+
+ // The scalar chain of computation has to pay for the transition
+ // scalar to vector.
+ // The vector chain has to account for the combining cost.
+ uint64_t ScalarCost =
+ TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+ uint64_t VectorCost = StoreExtractCombineCost;
+ for (const auto &Inst : InstsToBePromoted) {
+ // Compute the cost.
+ // By construction, all instructions being promoted are arithmetic ones.
+ // Moreover, one argument is a constant that can be viewed as a splat
+ // constant.
+ Value *Arg0 = Inst->getOperand(0);
+ bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+ isa<ConstantFP>(Arg0);
+ TargetTransformInfo::OperandValueKind Arg0OVK =
+ IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+ : TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Arg1OVK =
+ !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+ : TargetTransformInfo::OK_AnyValue;
+ ScalarCost += TTI.getArithmeticInstrCost(
+ Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+ VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+ Arg0OVK, Arg1OVK);
+ }
+ DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+ << ScalarCost << "\nVector: " << VectorCost << '\n');
+ return ScalarCost > VectorCost;
+ }
+
+ /// \brief Generate a constant vector with \p Val with the same
+ /// number of elements as the transition.
+ /// \p UseSplat defines whether or not \p Val should be replicated
+ /// accross the whole vector.
+ /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
+ /// otherwise we generate a vector with as many undef as possible:
+ /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
+ /// used at the index of the extract.
+ Value *getConstantVector(Constant *Val, bool UseSplat) const {
+ unsigned ExtractIdx = UINT_MAX;
+ if (!UseSplat) {
+ // If we cannot determine where the constant must be, we have to
+ // use a splat constant.
+ Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
+ if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
+ ExtractIdx = CstVal->getSExtValue();
+ else
+ UseSplat = true;
+ }
+
+ unsigned End = getTransitionType()->getVectorNumElements();
+ if (UseSplat)
+ return ConstantVector::getSplat(End, Val);
+
+ SmallVector<Constant *, 4> ConstVec;
+ UndefValue *UndefVal = UndefValue::get(Val->getType());
+ for (unsigned Idx = 0; Idx != End; ++Idx) {
+ if (Idx == ExtractIdx)
+ ConstVec.push_back(Val);
+ else
+ ConstVec.push_back(UndefVal);
+ }
+ return ConstantVector::get(ConstVec);
+ }
+
+ /// \brief Check if promoting to a vector type an operand at \p OperandIdx
+ /// in \p Use can trigger undefined behavior.
+ static bool canCauseUndefinedBehavior(const Instruction *Use,
+ unsigned OperandIdx) {
+ // This is not safe to introduce undef when the operand is on
+ // the right hand side of a division-like instruction.
+ if (OperandIdx != 1)
+ return false;
+ switch (Use->getOpcode()) {
+ default:
+ return false;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return true;
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ return !Use->hasNoNaNs();
+ }
+ llvm_unreachable(nullptr);
+ }
+
+public:
+ VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI,
+ Instruction *Transition, unsigned CombineCost)
+ : TLI(TLI), TTI(TTI), Transition(Transition),
+ StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
+ assert(Transition && "Do not know how to promote null");
+ }
+
+ /// \brief Check if we can promote \p ToBePromoted to \p Type.
+ bool canPromote(const Instruction *ToBePromoted) const {
+ // We could support CastInst too.
+ return isa<BinaryOperator>(ToBePromoted);
+ }
+
+ /// \brief Check if it is profitable to promote \p ToBePromoted
+ /// by moving downward the transition through.
+ bool shouldPromote(const Instruction *ToBePromoted) const {
+ // Promote only if all the operands can be statically expanded.
+ // Indeed, we do not want to introduce any new kind of transitions.
+ for (const Use &U : ToBePromoted->operands()) {
+ const Value *Val = U.get();
+ if (Val == getEndOfTransition()) {
+ // If the use is a division and the transition is on the rhs,
+ // we cannot promote the operation, otherwise we may create a
+ // division by zero.
+ if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
+ return false;
+ continue;
+ }
+ if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
+ !isa<ConstantFP>(Val))
+ return false;
+ }
+ // Check that the resulting operation is legal.
+ int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
+ if (!ISDOpcode)
+ return false;
+ return StressStoreExtract ||
+ TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(getTransitionType(), true));
+ }
+
+ /// \brief Check whether or not \p Use can be combined
+ /// with the transition.
+ /// I.e., is it possible to do Use(Transition) => AnotherUse?
+ bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
+
+ /// \brief Record \p ToBePromoted as part of the chain to be promoted.
+ void enqueueForPromotion(Instruction *ToBePromoted) {
+ InstsToBePromoted.push_back(ToBePromoted);
+ }
+
+ /// \brief Set the instruction that will be combined with the transition.
+ void recordCombineInstruction(Instruction *ToBeCombined) {
+ assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
+ CombineInst = ToBeCombined;
+ }
+
+ /// \brief Promote all the instructions enqueued for promotion if it is
+ /// is profitable.
+ /// \return True if the promotion happened, false otherwise.
+ bool promote() {
+ // Check if there is something to promote.
+ // Right now, if we do not have anything to combine with,
+ // we assume the promotion is not profitable.
+ if (InstsToBePromoted.empty() || !CombineInst)
+ return false;
+
+ // Check cost.
+ if (!StressStoreExtract && !isProfitableToPromote())
+ return false;
+
+ // Promote.
+ for (auto &ToBePromoted : InstsToBePromoted)
+ promoteImpl(ToBePromoted);
+ InstsToBePromoted.clear();
+ return true;
+ }
+};
+} // End of anonymous namespace.
+
+void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
+ // At this point, we know that all the operands of ToBePromoted but Def
+ // can be statically promoted.
+ // For Def, we need to use its parameter in ToBePromoted:
+ // b = ToBePromoted ty1 a
+ // Def = Transition ty1 b to ty2
+ // Move the transition down.
+ // 1. Replace all uses of the promoted operation by the transition.
+ // = ... b => = ... Def.
+ assert(ToBePromoted->getType() == Transition->getType() &&
+ "The type of the result of the transition does not match "
+ "the final type");
+ ToBePromoted->replaceAllUsesWith(Transition);
+ // 2. Update the type of the uses.
+ // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
+ Type *TransitionTy = getTransitionType();
+ ToBePromoted->mutateType(TransitionTy);
+ // 3. Update all the operands of the promoted operation with promoted
+ // operands.
+ // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
+ for (Use &U : ToBePromoted->operands()) {
+ Value *Val = U.get();
+ Value *NewVal = nullptr;
+ if (Val == Transition)
+ NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
+ else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
+ isa<ConstantFP>(Val)) {
+ // Use a splat constant if it is not safe to use undef.
+ NewVal = getConstantVector(
+ cast<Constant>(Val),
+ isa<UndefValue>(Val) ||
+ canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
+ } else
+ llvm_unreachable("Did you modified shouldPromote and forgot to update "
+ "this?");
+ ToBePromoted->setOperand(U.getOperandNo(), NewVal);
+ }
+ Transition->removeFromParent();
+ Transition->insertAfter(ToBePromoted);
+ Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
+}
+
+// See if we can speculate calls to intrinsic cttz/ctlz.
+//
+// Example:
+// entry:
+// ...
+// %cmp = icmp eq i64 %val, 0
+// br i1 %cmp, label %end.bb, label %then.bb
+//
+// then.bb:
+// %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
+// br label %EndBB
+//
+// end.bb:
+// %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ]
+//
+// ==>
+//
+// entry:
+// ...
+// %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false)
+//
+static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
+ assert(BrInst->isConditional() && "Expected a conditional branch!");
+ BasicBlock *ThenBB = BrInst->getSuccessor(1);
+ BasicBlock *EndBB = BrInst->getSuccessor(0);
+
+ // See if ThenBB contains only one instruction (excluding the
+ // terminator and DbgInfoIntrinsic calls).
+ IntrinsicInst *II = nullptr;
+ CastInst *CI = nullptr;
+ for (BasicBlock::iterator I = ThenBB->begin(),
+ E = std::prev(ThenBB->end()); I != E; ++I) {
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Check if this is a zero extension or a truncate of a previously
+ // matched call to intrinsic cttz/ctlz.
+ if (II) {
+ // Early exit if we already found a "free" zero extend/truncate.
+ if (CI)
+ return false;
+
+ Type *SrcTy = II->getType();
+ Type *DestTy = I->getType();
+ Value *V;
+
+ if (match(cast<Instruction>(I), m_ZExt(m_Value(V))) && V == II) {
+ // Speculate this zero extend only if it is "free" for the target.
+ if (TLI.isZExtFree(SrcTy, DestTy)) {
+ CI = cast<CastInst>(I);
+ continue;
+ }
+ } else if (match(cast<Instruction>(I), m_Trunc(m_Value(V))) && V == II) {
+ // Speculate this truncate only if it is "free" for the target.
+ if (TLI.isTruncateFree(SrcTy, DestTy)) {
+ CI = cast<CastInst>(I);
+ continue;
+ }
+ } else {
+ // Avoid speculating more than one instruction.
+ return false;
+ }
+ }
+
+ // See if this is a call to intrinsic cttz/ctlz.
+ if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
+ // Avoid speculating expensive intrinsic calls.
+ if (!TLI.isCheapToSpeculateCttz())
+ return false;
+ }
+ else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
+ // Avoid speculating expensive intrinsic calls.
+ if (!TLI.isCheapToSpeculateCtlz())
+ return false;
+ } else
+ return false;
+
+ II = cast<IntrinsicInst>(I);
+ }
+
+ // Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
+ BasicBlock *EntryBB = BrInst->getParent();
+ for (BasicBlock::iterator I = EndBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+ Value *OrigV = PN->getIncomingValueForBlock(EntryBB);
+
+ if (!OrigV)
+ return false;
+
+ if (ThenV != II && (!CI || ThenV != CI))
+ return false;
+
+ if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
+ unsigned BitWidth = II->getType()->getIntegerBitWidth();
+
+ // Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz
+ // intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits
+ // of the value in input to the cttz/ctlz.
+ if (CInt->getValue() != BitWidth)
+ return false;
+
+ // Hoist the call to cttz/ctlz from ThenBB into EntryBB.
+ EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
+ ThenBB->begin(), std::prev(ThenBB->end()));
+
+ // Update PN setting ThenV as the incoming value from both 'EntryBB'
+ // and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
+ // phi node if all the incoming values are the same.
+ PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
+ PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);
+
+ // Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
+ if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
+ Type *Ty = II->getArgOperand(0)->getType();
+ Value *Args[] = { II->getArgOperand(0),
+ ConstantInt::getFalse(II->getContext()) };
+ Module *M = EntryBB->getParent()->getParent();
+ Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
+ IRBuilder<> Builder(II);
+ Instruction *NewI = Builder.CreateCall(IF, Args);
+
+ // Replace the old call to cttz/ctlz.
+ II->replaceAllUsesWith(NewI);
+ II->eraseFromParent();
+ }
+
+ // Update BrInst condition so that the branch to EndBB is always taken.
+ // Later on, method 'ConstantFoldTerminator' will simplify this branch
+ // replacing it with a direct branch to 'EndBB'.
+ // As a side effect, CodeGenPrepare will attempt to simplify the control
+ // flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into
+ // 'EndBB' (calling method 'EliminateFallThrough').
+ BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext()));
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Some targets can do store(extractelement) with one instruction.
+/// Try to push the extractelement towards the stores when the target
+/// has this feature and this is profitable.
+bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+ unsigned CombineCost = UINT_MAX;
+ if (DisableStoreExtract || !TLI ||
+ (!StressStoreExtract &&
+ !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
+ Inst->getOperand(1), CombineCost)))
+ return false;
+
+ // At this point we know that Inst is a vector to scalar transition.
+ // Try to move it down the def-use chain, until:
+ // - We can combine the transition with its single use
+ // => we got rid of the transition.
+ // - We escape the current basic block
+ // => we would need to check that we are moving it at a cheaper place and
+ // we do not do that for now.
+ BasicBlock *Parent = Inst->getParent();
+ DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+ VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost);
+ // If the transition has more than one use, assume this is not going to be
+ // beneficial.
+ while (Inst->hasOneUse()) {
+ Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+ DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+ if (ToBePromoted->getParent() != Parent) {
+ DEBUG(dbgs() << "Instruction to promote is in a different block ("
+ << ToBePromoted->getParent()->getName()
+ << ") than the transition (" << Parent->getName() << ").\n");
+ return false;
+ }
+
+ if (VPH.canCombine(ToBePromoted)) {
+ DEBUG(dbgs() << "Assume " << *Inst << '\n'
+ << "will be combined with: " << *ToBePromoted << '\n');
+ VPH.recordCombineInstruction(ToBePromoted);
+ bool Changed = VPH.promote();
+ NumStoreExtractExposed += Changed;
+ return Changed;
+ }
+
+ DEBUG(dbgs() << "Try promoting.\n");
+ if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+ return false;
+
+ DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+ VPH.enqueueForPromotion(ToBePromoted);
+ Inst = ToBePromoted;
+ }
+ return false;
+}
+
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {