#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
-static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) {
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- unsigned MaxSift = VecVT.getSizeInBits() - 1;
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, MVT::i8));
Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
DAG.getConstant(SSECC, MVT::i8));
}
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1. If non-trivial (for example because it's not constant)
+/// return an empty value.
+static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
+{
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+ if (!BV)
+ return SDValue();
+
+ MVT VT = Op1.getSimpleValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned n = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ULTOp1;
+
+ for (unsigned i = 0; i < n; ++i) {
+ ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
+ return SDValue();
+
+ // Avoid underflow.
+ APInt Val = Elt->getAPIntValue();
+ if (Val == 0)
+ return SDValue();
+
+ ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(),
+ ULTOp1.size());
+}
+
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
// operations may be required for some comparisons.
unsigned Opc;
bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+ bool Subus = false;
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
}
+ bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ if (!MinMax && hasSubus) {
+ // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+ // Op0 u<= Op1:
+ // t = psubus Op0, Op1
+ // pcmpeq t, <0..0>
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // destructed as the destination so it can be hoisted out of a loop.
+ // Only do this pre-AVX since vpcmp* is no longer destructive.
+ if (Subtarget->hasAVX())
+ break;
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG);
+ if (ULEOp1.getNode()) {
+ Op1 = ULEOp1;
+ Subus = true; Invert = false; Swap = false;
+ }
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
+ case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+ }
+
+ if (Subus) {
+ Opc = X86ISD::SUBUS;
+ FlipSigns = false;
+ }
+ }
+
if (Swap)
std::swap(Op0, Op1);
if (MinMax)
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+ if (Subus)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+
return Result;
}
return true;
}
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+ unsigned Bits = Ty->getScalarSizeInBits();
+
+ // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+ // particularly cheaper than those without.
+ if (Bits == 8)
+ return false;
+
+ // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
+ // variable shifts just as cheap as scalar ones.
+ if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+ return false;
+
+ // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+ // fully general vector.
+ return true;
+}
+
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- thisMBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
// Scan forward through BB for a use/def of EFLAGS.
- MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+ MachineBasicBlock::iterator miI(std::next(SelectItr));
for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
const MachineInstr& mi = *miI;
if (mi.readsRegister(X86::EFLAGS))
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
- continueMBB->splice(continueMBB->begin(), BB, llvm::next
- (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->splice(continueMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB: