X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=cba145208da63a4ff91e9bde0550dd7c376bb261;hp=38a0f06bc4b2ce8a5d15594deeabbdeaffe6cab2;hb=3922da8ae8fab29de6416eeeebf21208b1491557;hpb=8f2a85e0995a5bb2943bf9c5021950beaf48265e diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38a0f06bc4b..cba145208da 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -44,11 +44,13 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include +#include #include using namespace llvm; @@ -56,6 +58,17 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); +static cl::opt ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(false), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + +static cl::opt ExperimentalVectorShuffleLowering( + "x86-experimental-vector-shuffle-lowering", cl::init(false), + cl::desc("Enable an experimental vector shuffle lowering code path."), + cl::Hidden); + // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -502,6 +515,25 @@ void X86TargetLowering::resetOperationActions() { } } + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. + if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + } + + // There's never any support for operations beyond MVT::f32. + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f80, MVT::f16, Expand); + if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { @@ -580,34 +612,18 @@ void X86TargetLowering::resetOperationActions() { // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; - setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } - if (!Subtarget->is64Bit()) { - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); - } - if (Subtarget->hasCmpxchg16b()) { - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags - if (!Subtarget->isTargetDarwin() && - !Subtarget->isTargetELF() && - !Subtarget->isTargetCygMing()) { + if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } @@ -873,7 +889,12 @@ void X86TargetLowering::resetOperationActions() { (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, Expand); + + // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types, + // we have to deal with them whether we ask for Expansion or not. Setting + // Expand causes its own optimisation problems though, so leave them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -1439,6 +1460,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + } + // Custom lower several nodes. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1479,6 +1505,11 @@ void X86TargetLowering::resetOperationActions() { } }// has AVX-512 + if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + } + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (int VT = MVT::FIRST_VECTOR_VALUETYPE; @@ -1592,6 +1623,16 @@ void X86TargetLowering::resetOperationActions() { setPrefFunctionAlignment(4); // 2^4 bytes. } +TargetLoweringBase::LegalizeTypeAction +X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && + VT.getVectorElementType().getSimpleVT() != MVT::i1) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; @@ -2276,6 +2317,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -3023,7 +3068,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the -// original REtADDR, but before the saved framepointer or the spilled registers +// original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 @@ -4731,28 +4776,6 @@ bool X86::isZeroNode(SDValue Elt) { return false; } -/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in -/// their permute mask. -static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector MaskVec; - - for (unsigned i = 0; i != NumElems; ++i) { - int Idx = SVOp->getMaskElt(i); - if (Idx >= 0) { - if (Idx < (int)NumElems) - Idx += NumElems; - else - Idx -= NumElems; - } - MaskVec.push_back(Idx); - } - return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), - SVOp->getOperand(0), &MaskVec[0]); -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper @@ -4838,19 +4861,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, return true; } -/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are -/// all the same. -static bool isSplatVector(SDNode *N) { - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - - SDValue SplatValue = N->getOperand(0); - for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) - if (N->getOperand(i) != SplatValue) - return false; - return true; -} - /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved /// to an zero vector. /// FIXME: move to dag combiner / method on ShuffleVectorSDNode @@ -5759,18 +5769,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); case ISD::BUILD_VECTOR: { - // The BUILD_VECTOR node must be a splat. - if (!isSplatVector(Op.getNode())) + auto *BVOp = cast(Op.getNode()); + BitVector UndefElements; + SDValue Splat = BVOp->getSplatValue(&UndefElements); + + // We need a splat of a single value to use broadcast, and it doesn't + // make any sense if the value is only in one element of the vector. + if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); - Ld = Op.getOperand(0); + Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || - Ld.getOpcode() == ISD::ConstantFP); + Ld.getOpcode() == ISD::ConstantFP); - // The suspected load node has several users. Make sure that all - // of its users are from the BUILD_VECTOR node. - // Constants may have multiple users. - if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + // Make sure that all of the users of a non-constant load are from the + // BUILD_VECTOR node. + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } @@ -6072,21 +6086,35 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, + SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { + EVT VT = N->getValueType(0); + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); - assert(N->getValueType(0).isVector() && - N->getValueType(0).getVectorNumElements() >= LastIdx && + assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; + V0 = DAG.getUNDEF(VT); + V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); + + // Skip UNDEFs. + if (Op->getOpcode() == ISD::UNDEF) { + // Update the expected vector extract index. + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + ExpectedVExtractIdx += 2; + continue; + } + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) @@ -6107,12 +6135,15 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); - - if (i == 0) - V0 = Op0.getOperand(0); - else if (i * 2 == NumElts) { - V1 = Op0.getOperand(0); - ExpectedVExtractIdx = BaseIdx; + + if (i * 2 < NumElts) { + if (V0.getOpcode() == ISD::UNDEF) + V0 = Op0.getOperand(0); + } else { + if (V1.getOpcode() == ISD::UNDEF) + V1 = Op0.getOperand(0); + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; @@ -6158,9 +6189,14 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI +/// +/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower +/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to +/// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDLoc DL, SelectionDAG &DAG, - unsigned X86Opcode, bool Mode) { + unsigned X86Opcode, bool Mode, + bool isUndefLO, bool isUndefHI) { EVT VT = V0.getValueType(); assert(VT.is256BitVector() && VT == V1.getValueType() && "Invalid nodes in input!"); @@ -6172,18 +6208,150 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); EVT NewVT = V0_LO.getValueType(); - SDValue LO, HI; + SDValue LO = DAG.getUNDEF(NewVT); + SDValue HI = DAG.getUNDEF(NewVT); + if (Mode) { - LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); - HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { - LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); - HI = DAG.getNode(X86Opcode, DL, NewVT, V1_HI, V1_HI); + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || + V1_LO->getOpcode() != ISD::UNDEF)) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || + V1_HI->getOpcode() != ISD::UNDEF)) + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } +/// \brief Try to fold a build_vector that performs an 'addsub' into the +/// sequence of 'vadd + vsub + blendi'. +static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(BV); + EVT VT = BV->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + SDValue InVec0 = DAG.getUNDEF(VT); + SDValue InVec1 = DAG.getUNDEF(VT); + + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v2f64) && "build_vector with an invalid type found!"); + + // Don't try to emit a VSELECT that cannot be lowered into a blend. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + // Odd-numbered elements in the input build vector are obtained from + // adding two integer/float elements. + // Even-numbered elements in the input build vector are obtained from + // subtracting two integer/float elements. + unsigned ExpectedOpcode = ISD::FSUB; + unsigned NextExpectedOpcode = ISD::FADD; + bool AddFound = false; + bool SubFound = false; + + for (unsigned i = 0, e = NumElts; i != e; i++) { + SDValue Op = BV->getOperand(i); + + // Skip 'undef' values. + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::UNDEF) { + std::swap(ExpectedOpcode, NextExpectedOpcode); + continue; + } + + // Early exit if we found an unexpected opcode. + if (Opcode != ExpectedOpcode) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) + // Early exit if we cannot match that sequence. + if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa(Op0.getOperand(1)) || + !isa(Op1.getOperand(1)) || + Op0.getOperand(1) != Op1.getOperand(1)) + return SDValue(); + + unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); + if (I0 != i) + return SDValue(); + + // We found a valid add/sub node. Update the information accordingly. + if (i & 1) + AddFound = true; + else + SubFound = true; + + // Update InVec0 and InVec1. + if (InVec0.getOpcode() == ISD::UNDEF) + InVec0 = Op0.getOperand(0); + if (InVec1.getOpcode() == ISD::UNDEF) + InVec1 = Op1.getOperand(0); + + // Make sure that operands in input to each add/sub node always + // come from a same pair of vectors. + if (InVec0 != Op0.getOperand(0)) { + if (ExpectedOpcode == ISD::FSUB) + return SDValue(); + + // FADD is commutable. Try to commute the operands + // and then test again. + std::swap(Op0, Op1); + if (InVec0 != Op0.getOperand(0)) + return SDValue(); + } + + if (InVec1 != Op1.getOperand(0)) + return SDValue(); + + // Update the pair of expected opcodes. + std::swap(ExpectedOpcode, NextExpectedOpcode); + } + + // Don't try to fold this build_vector into a VSELECT if it has + // too many UNDEF operands. + if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && + InVec1.getOpcode() != ISD::UNDEF) { + // Emit a sequence of vector add and sub followed by a VSELECT. + // The new VSELECT will be lowered into a BLENDI. + // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI' + // and emit a single ADDSUB instruction. + SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1); + SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1); + + // Construct the VSELECT mask. + EVT MaskVT = VT.changeVectorElementTypeToInteger(); + EVT SVT = MaskVT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + SmallVector Ops; + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + APInt Value = i & 1 ? APInt::getNullValue(SVTBits) : + APInt::getAllOnesValue(SVTBits); + SDValue Constant = DAG.getConstant(Value, SVT); + Ops.push_back(Constant); + } + + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops); + return DAG.getSelect(DL, VT, Mask, Sub, Add); + } + + return SDValue(); +} + static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc DL(N); @@ -6192,20 +6360,46 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, BuildVectorSDNode *BV = cast(N); SDValue InVec0, InVec1; + // Try to match an ADDSUB. + if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { + SDValue Value = matchAddSub(BV, DAG, Subtarget); + if (Value.getNode()) + return Value; + } + // Try to match horizontal ADD/SUB. + unsigned NumUndefsLO = 0; + unsigned NumUndefsHI = 0; + unsigned Half = NumElts/2; + + // Count the number of UNDEF operands in the build_vector in input. + for (unsigned i = 0, e = Half; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsLO++; + + for (unsigned i = Half, e = NumElts; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsHI++; + + // Early exit if this is either a build_vector of all UNDEFs or all the + // operands but one are UNDEF. + if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) + return SDValue(); + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. - if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. - if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); - if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } @@ -6216,16 +6410,20 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Try to match an AVX horizontal add/sub of packed single/double // precision floating point values from 256-bit vectors. SDValue InVec2, InVec3; - if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::FADD, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::FSUB, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. @@ -6233,15 +6431,19 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, unsigned X86Opcode; bool CanFold = true; - if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::ADD, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::SUB, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; @@ -6252,29 +6454,45 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + // Do not try to expand this build_vector into a pair of horizontal + // add/sub if we can emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + // Convert this build_vector into a pair of horizontal binop followed by // a concat vector. - return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false); + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, + isUndefLO, isUndefHI); } } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget->hasAVX()) { unsigned X86Opcode; - if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1)) + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; - else if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1)) + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; - else if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1)) + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); + // Don't try to expand this build_vector into a pair of horizontal add/sub + // if we can simply emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + // Convert this build_vector into two horizontal add/sub followed by // a concat vector. - return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true); + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, + isUndefLO, isUndefHI); } return SDValue(); @@ -6667,125 +6885,1255 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return LowerAVXCONCAT_VECTORS(Op, DAG); } -static bool isBlendMask(ArrayRef MaskVals, MVT VT, bool hasSSE41, - bool hasInt256, unsigned *MaskOut = nullptr) { - MVT EltVT = VT.getVectorElementType(); - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return false; +//===----------------------------------------------------------------------===// +// Vector shuffle lowering +// +// This is an experimental code path for lowering vector shuffles on x86. It is +// designed to handle arbitrary vector shuffles and blends, gracefully +// degrading performance as necessary. It works hard to recognize idiomatic +// shuffles and lower them to optimal instruction patterns without leaving +// a framework that allows reasonably efficient handling of all vector shuffle +// patterns. +//===----------------------------------------------------------------------===// - if (!hasSSE41 || EltVT == MVT::i8) - return false; - if (!hasInt256 && VT == MVT::v16i16) - return false; +/// \brief Tiny helper function to identify a no-op mask. +/// +/// This is a somewhat boring predicate function. It checks whether the mask +/// array input, which is assumed to be a single-input shuffle mask of the kind +/// used by the X86 shuffle instructions (not a fully general +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an +/// in-place shuffle are 'no-op's. +static bool isNoopShuffleMask(ArrayRef Mask) { + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != i) + return false; + return true; +} - unsigned MaskValue = 0; - unsigned NumElems = VT.getVectorNumElements(); - // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems - 1) / 8 + 1; - unsigned NumElemsInLane = NumElems / NumLanes; +/// \brief Helper function to classify a mask as a single-input mask. +/// +/// This isn't a generic single-input test because in the vector shuffle +/// lowering we canonicalize single inputs to be the first input operand. This +/// means we can more quickly test for a single input by only checking whether +/// an input from the second operand exists. We also assume that the size of +/// mask corresponds to the size of the input vectors which isn't true in the +/// fully general case. +static bool isSingleInputShuffleMask(ArrayRef Mask) { + for (int M : Mask) + if (M >= (int)Mask.size()) + return false; + return true; +} - // Blend for v16i16 should be symetric for the both lanes. - for (unsigned i = 0; i < NumElemsInLane; ++i) { +/// \brief Get a 4-lane 8-bit shuffle immediate for a mask. +/// +/// This helper function produces an 8-bit shuffle immediate corresponding to +/// the ubiquitous shuffle encoding scheme used in x86 instructions for +/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for +/// example. +/// +/// NB: We rely heavily on "undef" masks preserving the input lane. +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, + SelectionDAG &DAG) { + assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); + assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); - int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; - int EltIdx = MaskVals[i]; + unsigned Imm = 0; + Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; + Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; + Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; + Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; + return DAG.getConstant(Imm, MVT::i8); +} - if ((EltIdx < 0 || EltIdx == (int)i) && - (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) - continue; +/// \brief Handle lowering of 2-lane 64-bit floating point shuffles. +/// +/// This is the basis function for the 2-lane 64-bit shuffles as we have full +/// support for floating point shuffles but not integer shuffles. These +/// instructions will incur a domain crossing penalty on some chips though so +/// it is better to avoid lowering through this for integer vectors where +/// possible. +static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx < 0 || - (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) - MaskValue |= (1 << i); - else - return false; + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. Simulate this by using the + // single input as both of the "inputs" to this instruction.. + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); } + assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); + assert(Mask[1] >= 2 && "Non-canonicalized blend!"); - if (MaskOut) - *MaskOut = MaskValue; - return true; + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); } -// Try to lower a shuffle node into a simple blend instruction. -// This function assumes isBlendMask returns true for this -// SuffleVectorSDNode -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - unsigned MaskValue, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), - Subtarget->hasInt256() && "Trying to lower a " - "VECTOR_SHUFFLE to a Blend but " - "with the wrong mask")); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - unsigned NumElems = VT.getVectorNumElements(); +/// \brief Handle lowering of 2-lane 64-bit integer shuffles. +/// +/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by +/// the integer unit to minimize domain crossing penalties. However, for blends +/// it falls back to the floating point shuffle operation with appropriate bit +/// casting. +static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + // We have to map the mask as it is actually a v4i32 shuffle instruction. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); + int WidenedMask[4] = { + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + return DAG.getNode( + ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); + } + + // We implement this with SHUFPD which is pretty lame because it will likely + // incur 2 cycles of stall for integer vectors on Nehalem and older chips. + // However, all the alternatives are still more cycles and newer chips don't + // have this problem. It would be really nice if x86 had better shuffles here. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); +} + +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SDValue LowV = V1, HighV = V2; + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) + // Straight shuffle of a single input vector. We pass the input vector to + // both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling + // the low bit. + int V2AdjIndex = V2Index ^ 1; + + if (Mask[V2AdjIndex] == -1) { + // Handles all the cases where we have a single V2 element and an undef. + // This will only ever happen in the high lanes because we commute the + // vector otherwise. + if (V2Index < 2) + std::swap(LowV, HighV); + NewMask[V2Index] -= 4; + } else { + // Handle the case where the V2 element ends up adjacent to a V1 element. + // To make this work, blend them together as the first step. + int V1Index = V2AdjIndex; + int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; + V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + getV4X86ShuffleImm8ForMask(BlendMask, DAG)); + + // Now proceed to reconstruct the final blend as we have the necessary + // high or low half formed. + if (V2Index < 2) { + LowV = V2; + HighV = V1; + } else { + HighV = V2; + } + NewMask[V1Index] = 2; // We put the V1 element in V2[2]. + NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. + } + } else if (NumV2Elements == 2) { + if (Mask[0] < 4 && Mask[1] < 4) { + // Handle the easy case where we have V1 in the low lanes and V2 in the + // high lanes. We never see this reversed because we sort the shuffle. + NewMask[2] -= 4; + NewMask[3] -= 4; + } else { + // We have a mixture of V1 and V2 in both low and high lanes. Rather than + // trying to place elements directly, just blend them and set up the final + // shuffle to place them. - // Convert i32 vectors to floating point if it is not AVX2. - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - MVT BlendVT = VT; - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); - V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); - } + // The first two blend mask elements are for V1, the second two are for + // V2. + int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], + Mask[2] < 4 ? Mask[2] : Mask[3], + (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, + (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; + V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + getV4X86ShuffleImm8ForMask(BlendMask, DAG)); - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Ret); + // Now we do a normal shuffle of V1 by giving V1 as both operands to + // a blend. + LowV = HighV = V1; + NewMask[0] = Mask[0] < 4 ? 0 : 2; + NewMask[1] = Mask[0] < 4 ? 2 : 0; + NewMask[2] = Mask[2] < 4 ? 1 : 3; + NewMask[3] = Mask[2] < 4 ? 3 : 1; + } + } + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + getV4X86ShuffleImm8ForMask(NewMask, DAG)); } -/// In vector type \p VT, return true if the element at index \p InputIdx -/// falls on a different 128-bit lane than \p OutputIdx. -static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, - unsigned OutputIdx) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; -} +/// \brief Lower 4-lane i32 vector shuffles. +/// +/// We try to handle these with integer-domain shuffles where we can, but for +/// blends we use the floating point domain blend instructions. +static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // We implement this with SHUFPS because it can blend from two vectors. + // Because we're going to eventually use SHUFPS, we use SHUFPS even to build + // up the inputs, bypassing domain shift penalties that we would encur if we + // directly used PSHUFD on Nehalem and older. For newer chips, this isn't + // relevant. + return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, + DAG.getVectorShuffle( + MVT::v4f32, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); +} + +/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 +/// shuffle lowering, and the most complex part. +/// +/// The lowering strategy is to try to form pairs of input lanes which are +/// targeted at the same half of the final vector, and then use a dword shuffle +/// to place them onto the right half, and finally unpack the paired lanes into +/// their final position. +/// +/// The exact breakdown of how to form these dword pairs and align them on the +/// correct sides is really tricky. See the comments within the function for +/// more of the details. +static SDValue lowerV8I16SingleInputVectorShuffle( + SDLoc DL, SDValue V, MutableArrayRef Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + MutableArrayRef LoMask = Mask.slice(0, 4); + MutableArrayRef HiMask = Mask.slice(4, 4); + + SmallVector LoInputs; + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + SmallVector HiInputs; + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 0; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + int NumLToL = + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumHToL = LoInputs.size() - NumLToL; + int NumLToH = + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumHToH = HiInputs.size() - NumLToH; + MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); + MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); + MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); + MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); + + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all + // such inputs we can swap two of the dwords across the half mark and end up + // with <=2 inputs to each half in each half. Once there, we can fall through + // to the generic code below. For example: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] + // + // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 + // and 2-2. + auto balanceSides = [&](ArrayRef ThreeInputs, int OneInput, + int ThreeInputHalfSum, int OneInputHalfOffset) { + // Compute the index of dword with only one word among the three inputs in + // a half by taking the sum of the half with three inputs and subtracting + // the sum of the actual three inputs. The difference is the remaining + // slot. + int DWordA = (ThreeInputHalfSum - + std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / + 2; + int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + + int PSHUFDMask[] = {0, 1, 2, 3}; + PSHUFDMask[DWordA] = DWordB; + PSHUFDMask[DWordB] = DWordA; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // Adjust the mask to match the new locations of A and B. + for (int &M : Mask) + if (M != -1 && M/2 == DWordA) + M = 2 * DWordB + M % 2; + else if (M != -1 && M/2 == DWordB) + M = 2 * DWordA + M % 2; + + // Recurse back into this routine to re-compute state now that this isn't + // a 3 and 1 problem. + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + Mask); + }; + if (NumLToL == 3 && NumHToL == 1) + return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); + else if (NumLToL == 1 && NumHToL == 3) + return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 1 && NumHToH == 3) + return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 3 && NumHToH == 1) + return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + + // At this point there are at most two inputs to the low and high halves from + // each half. That means the inputs can always be grouped into dwords and + // those dwords can then be moved to the correct half with a dword shuffle. + // We use at most one low and one high word shuffle to collect these paired + // inputs into dwords, and finally a dword shuffle to place them. + int PSHUFLMask[4] = {-1, -1, -1, -1}; + int PSHUFHMask[4] = {-1, -1, -1, -1}; + int PSHUFDMask[4] = {-1, -1, -1, -1}; + + // First fix the masks for all the inputs that are staying in their + // original halves. This will then dictate the targets of the cross-half + // shuffles. + auto fixInPlaceInputs = [&PSHUFDMask]( + ArrayRef InPlaceInputs, MutableArrayRef SourceHalfMask, + MutableArrayRef HalfMask, int HalfOffset) { + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } -/// Generate a PSHUFB if possible. Selects elements from \p V1 according to -/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to -/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p -/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a -/// zero. -static SDValue getPSHUFB(ArrayRef MaskVals, SDValue V1, SDLoc &dl, - SelectionDAG &DAG) { - MVT VT = V1.getSimpleValueType(); - assert(VT.is128BitVector() || VT.is256BitVector()); + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; + if (!HToLInputs.empty()) + fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); + if (!LToHInputs.empty()) + fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + + // Now gather the cross-half inputs and place them into a free dword of + // their target half. + // FIXME: This operation could almost certainly be simplified dramatically to + // look more like the 3-1 fixing operation. + auto moveInputsToRightHalf = [&PSHUFDMask]( + MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, + MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, + int SourceOffset, int DestOffset) { + auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + }; + auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, + int Word) { + int LowWord = Word & ~1; + int HighWord = Word | 1; + return isWordClobbered(SourceHalfMask, LowWord) || + isWordClobbered(SourceHalfMask, HighWord); + }; - MVT EltVT = VT.getVectorElementType(); - unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; - unsigned NumElts = VT.getVectorNumElements(); + if (IncomingInputs.empty()) + return; - SmallVector PshufbMask; - for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { - int InputIdx = MaskVals[OutputIdx]; - unsigned InputByteIdx; + if (ExistingInputs.empty()) { + // Map any dwords with inputs from them into the right half. + for (int Input : IncomingInputs) { + // If the source half mask maps over the inputs, turn those into + // swaps and use the swapped lane. + if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = + Input - SourceOffset; + // We have to swap the uses in our half mask in one sweep. + for (int &M : HalfMask) + if (M == SourceHalfMask[Input - SourceOffset]) + M = Input; + else if (M == Input) + M = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } else { + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == + Input - SourceOffset && + "Previous placement doesn't match!"); + } + // Note that this correctly re-maps both when we do a swap and when + // we observe the other side of the swap above. We rely on that to + // avoid swapping the members of the input list directly. + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } - if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) - InputByteIdx = 0x80; - else { - // Cross lane is not allowed. - if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) - return SDValue(); - InputByteIdx = InputIdx * EltSizeInBytes; - // Index is an byte offset within the 128-bit lane. - InputByteIdx &= 0xf; - } + // Map the input's dword into the correct half. + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; + else + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == + Input / 2 && + "Previous placement doesn't match!"); + } - for (unsigned j = 0; j < EltSizeInBytes; ++j) { - PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); - if (InputByteIdx != 0x80) - ++InputByteIdx; + // And just directly shift any other-half mask elements to be same-half + // as we will have mirrored the dword containing the element into the + // same position within that half. + for (int &M : HalfMask) + if (M >= SourceOffset && M < SourceOffset + 4) { + M = M - SourceOffset + DestOffset; + assert(M >= 0 && "This should never wrap below zero!"); + } + return; } - } + + // Ensure we have the input in a viable dword of its current half. This + // is particularly tricky because the original position may be clobbered + // by inputs being moved and *staying* in that half. + if (IncomingInputs.size() == 1) { + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int InputFixed = std::find(std::begin(SourceHalfMask), + std::end(SourceHalfMask), -1) - + std::begin(SourceHalfMask) + SourceOffset; + SourceHalfMask[InputFixed - SourceOffset] = + IncomingInputs[0] - SourceOffset; + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], + InputFixed); + IncomingInputs[0] = InputFixed; + } + } else if (IncomingInputs.size() == 2) { + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; + assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && + "Not all dwords can be clobbered!"); + SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; + SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + for (int &M : HalfMask) + if (M == IncomingInputs[0]) + M = SourceDWordBase + SourceOffset; + else if (M == IncomingInputs[1]) + M = SourceDWordBase + 1 + SourceOffset; + IncomingInputs[0] = SourceDWordBase + SourceOffset; + IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + } + } else { + llvm_unreachable("Unhandled input size!"); + } + + // Now hoist the DWord down to the right half. + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; + for (int Input : IncomingInputs) + std::replace(HalfMask.begin(), HalfMask.end(), Input, + FreeDWord * 2 + Input % 2); + }; + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + /*SourceOffset*/ 4, /*DestOffset*/ 0); + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + /*SourceOffset*/ 0, /*DestOffset*/ 4); + + // Now enact all the shuffles we've computed to move the inputs into their + // target half. + if (!isNoopShuffleMask(PSHUFLMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); + if (!isNoopShuffleMask(PSHUFHMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); + if (!isNoopShuffleMask(PSHUFDMask)) + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // At this point, each half should contain all its inputs, and we can then + // just shuffle them into their final position. + assert(std::count_if(LoMask.begin(), LoMask.end(), + [](int M) { return M >= 4; }) == 0 && + "Failed to lift all the high half inputs to the low mask!"); + assert(std::count_if(HiMask.begin(), HiMask.end(), + [](int M) { return M >= 0 && M < 4; }) == 0 && + "Failed to lift all the low half inputs to the high mask!"); + + // Do a half shuffle for the low mask. + if (!isNoopShuffleMask(LoMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(LoMask, DAG)); + + // Do a half shuffle with the high mask after shifting its values down. + for (int &M : HiMask) + if (M >= 0) + M -= 4; + if (!isNoopShuffleMask(HiMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(HiMask, DAG)); + + return V; +} + +/// \brief Detect whether the mask pattern should be lowered through +/// interleaving. +/// +/// This essentially tests whether viewing the mask as an interleaving of two +/// sub-sequences reduces the cross-input traffic of a blend operation. If so, +/// lowering it through interleaving is a significantly better strategy. +static bool shouldLowerAsInterleaving(ArrayRef Mask) { + int NumEvenInputs[2] = {0, 0}; + int NumOddInputs[2] = {0, 0}; + int NumLoInputs[2] = {0, 0}; + int NumHiInputs[2] = {0, 0}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int InputIdx = Mask[i] >= Size; + + if (i < Size / 2) + ++NumLoInputs[InputIdx]; + else + ++NumHiInputs[InputIdx]; + + if ((i % 2) == 0) + ++NumEvenInputs[InputIdx]; + else + ++NumOddInputs[InputIdx]; + } + + // The minimum number of cross-input results for both the interleaved and + // split cases. If interleaving results in fewer cross-input results, return + // true. + int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], + NumEvenInputs[0] + NumOddInputs[1]); + int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], + NumLoInputs[0] + NumHiInputs[1]); + return InterleavedCrosses < SplitCrosses; +} + +/// \brief Blend two v8i16 vectors using a naive unpack strategy. +/// +/// This strategy only works when the inputs from each vector fit into a single +/// half of that vector, and generally there are not so many inputs as to leave +/// the in-place shuffles required highly constrained (and thus expensive). It +/// shifts all the inputs into a single side of both input vectors and then +/// uses an unpack to interleave these inputs in a single vector. At that +/// point, we will fall back on the generic single input shuffle lowering. +static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, + SDValue V2, + MutableArrayRef Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + SmallVector LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; + for (int i = 0; i < 8; ++i) + if (Mask[i] >= 0 && Mask[i] < 4) + LoV1Inputs.push_back(i); + else if (Mask[i] >= 4 && Mask[i] < 8) + HiV1Inputs.push_back(i); + else if (Mask[i] >= 8 && Mask[i] < 12) + LoV2Inputs.push_back(i); + else if (Mask[i] >= 12) + HiV2Inputs.push_back(i); + + int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); + int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); + (void)NumV1Inputs; + (void)NumV2Inputs; + assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); + assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); + assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); + + bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= + HiV1Inputs.size() + HiV2Inputs.size(); + + auto moveInputsToHalf = [&](SDValue V, ArrayRef LoInputs, + ArrayRef HiInputs, bool MoveToLo, + int MaskOffset) { + ArrayRef GoodInputs = MoveToLo ? LoInputs : HiInputs; + ArrayRef BadInputs = MoveToLo ? HiInputs : LoInputs; + if (BadInputs.empty()) + return V; + + int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int MoveOffset = MoveToLo ? 0 : 4; + + if (GoodInputs.empty()) { + for (int BadInput : BadInputs) { + MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; + Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; + } + } else { + if (GoodInputs.size() == 2) { + // If the low inputs are spread across two dwords, pack them into + // a single dword. + MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] = + Mask[GoodInputs[0]] - MaskOffset; + MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] = + Mask[GoodInputs[1]] - MaskOffset; + Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + } else { + // Otherwise pin the low inputs. + for (int GoodInput : GoodInputs) + MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; + } + + int MoveMaskIdx = + std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - + std::begin(MoveMask); + assert(MoveMaskIdx >= MoveOffset && "Established above"); + + if (BadInputs.size() == 2) { + assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); + assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); + MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = + Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = + Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + } else { + assert(BadInputs.size() == 1 && "All sizes handled"); + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + } + } + + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + MoveMask); + }; + V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, + /*MaskOffset*/ 0); + V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, + /*MaskOffset*/ 8); + + // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes + // cross-half traffic in the final shuffle. + + // Munge the mask to be a single-input mask after the unpack merges the + // results. + for (int &M : Mask) + if (M != -1) + M = 2 * (M % 4) + (M / 8); + + return DAG.getVectorShuffle( + MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, + DL, MVT::v8i16, V1, V2), + DAG.getUNDEF(MVT::v8i16), Mask); +} + +/// \brief Generic lowering of 8-lane i16 shuffles. +/// +/// This handles both single-input shuffles and combined shuffle/blends with +/// two inputs. The single input shuffles are immediately delegated to +/// a dedicated lowering routine. +/// +/// The blends are lowered in one of three fundamental ways. If there are few +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle +/// of the input is significantly cheaper when lowered as an interleaving of +/// the two inputs, try to interleave them. Otherwise, blend the low and high +/// halves of the inputs separately (making them have relatively few inputs) +/// and then concatenate them. +static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef OrigMask = SVOp->getMask(); + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; + MutableArrayRef Mask(MaskStorage); + + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + auto isV1 = [](int M) { return M >= 0 && M < 8; }; + auto isV2 = [](int M) { return M >= 8; }; + + int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + + if (NumV2Inputs == 0) + return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); + + assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " + "to be V1-input shuffles."); + + if (NumV1Inputs + NumV2Inputs <= 4) + return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: Handle 1x, 2x, and 4x interleaving. + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[8], OMask[8]; + for (int i = 0; i < 4; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 4] = -1; + OMask[i + 4] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); + } + + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + for (int i = 0; i < 4; ++i) { + LoBlendMask[i] = Mask[i]; + HiBlendMask[i] = Mask[i + 4]; + } + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); + HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); + + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); +} + +/// \brief Generic lowering of v16i8 shuffles. +/// +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to +/// detect any complexity reducing interleaving. If that doesn't help, it uses +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them +/// back together. +static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef OrigMask = SVOp->getMask(); + assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + int MaskStorage[16] = { + OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], + OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], + OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; + MutableArrayRef Mask(MaskStorage); + MutableArrayRef LoMask = Mask.slice(0, 8); + MutableArrayRef HiMask = Mask.slice(8, 8); + + // For single-input shuffles, there are some nicer lowering tricks we can use. + if (isSingleInputShuffleMask(Mask)) { + // Check whether we can widen this to an i16 shuffle by duplicating bytes. + // Notably, this handles splat and partial-splat shuffles more efficiently. + // However, it only makes sense if the pre-duplication shuffle simplifies + // things significantly. Currently, this means we need to be able to + // express the pre-duplication shuffle as an i16 shuffle. + // + // FIXME: We should check for other patterns which can be widened into an + // i16 shuffle as well. + auto canWidenViaDuplication = [](ArrayRef Mask) { + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != Mask[i + 1]) + return false; + } + return true; + }; + auto tryToWidenViaDuplication = [&]() -> SDValue { + if (!canWidenViaDuplication(Mask)) + return SDValue(); + SmallVector LoInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), + LoInputs.end()); + SmallVector HiInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 8; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), + HiInputs.end()); + + bool TargetLo = LoInputs.size() >= HiInputs.size(); + ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; + ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; + + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + SmallDenseMap LaneMap; + for (int I : InPlaceInputs) { + PreDupI16Shuffle[I/2] = I/2; + LaneMap[I] = I; + } + int j = TargetLo ? 0 : 4, je = j + 4; + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { + // Check if j is already a shuffle of this input. This happens when + // there are two adjacent bytes after we move the low one. + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { + // If we haven't yet mapped the input, search for a slot into which + // we can map it. + while (j < je && PreDupI16Shuffle[j] != -1) + ++j; + + if (j == je) + // We can't place the inputs into a single half with a simple i16 shuffle, so bail. + return SDValue(); + + // Map this input with the i16 shuffle. + PreDupI16Shuffle[j] = MovingInputs[i] / 2; + } + + // Update the lane map based on the mapping we ended up with. + LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; + } + V1 = DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); + + // Unpack the bytes to form the i16s that will be shuffled into place. + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, V1, V1); + + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != -1) + PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); + }; + if (SDValue V = tryToWidenViaDuplication()) + return V; + } + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: We need to handle other interleaving widths (i16, i32, ...). + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[16], OMask[16]; + for (int i = 0; i < 8; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 8] = -1; + OMask[i + 8] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + } + + int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + auto buildBlendMasks = [](MutableArrayRef HalfMask, + MutableArrayRef V1HalfBlendMask, + MutableArrayRef V2HalfBlendMask) { + for (int i = 0; i < 8; ++i) + if (HalfMask[i] >= 0 && HalfMask[i] < 16) { + V1HalfBlendMask[i] = HalfMask[i]; + HalfMask[i] = i; + } else if (HalfMask[i] >= 16) { + V2HalfBlendMask[i] = HalfMask[i] - 16; + HalfMask[i] = i + 8; + } + }; + buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); + buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); + + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + + auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef LoBlendMask, + MutableArrayRef HiBlendMask) { + SDValue V1, V2; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(HiBlendMask.begin(), HiBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, + DAG.getConstant(0x00FF, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke V2. + V2 = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into V1. + for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into V1 and the high half into + // V2 so that we can blend them as i16s. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } + + SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + return std::make_pair(BlendedLo, BlendedHi); + }; + SDValue V1Lo, V1Hi, V2Lo, V2Hi; + std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); + std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); + + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); +} + +/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. +/// +/// This routine breaks down the specific type of 128-bit shuffle and +/// dispatches to the lowering routines accordingly. +static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + switch (VT.SimpleTy) { + case MVT::v2i64: + return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v2f64: + return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i32: + return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4f32: + return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i16: + return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i8: + return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Unimplemented!"); + } +} + +/// \brief Tiny helper function to test whether adjacent masks are sequential. +static bool areAdjacentMasksSequential(ArrayRef Mask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) + if (Mask[i] + 1 != Mask[i+1]) + return false; + + return true; +} + +/// \brief Top-level lowering for x86 vector shuffles. +/// +/// This handles decomposition, canonicalization, and lowering of all x86 +/// vector shuffles. Most of the specific lowering strategies are encapsulated +/// above in helper routines. The canonicalization attempts to widen shuffles +/// to involve fewer lanes of wider elements, consolidate symmetric patterns +/// s.t. only one of the two inputs needs to be tested, etc. +static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT VT = Op.getSimpleValueType(); + int NumElements = VT.getVectorNumElements(); + SDLoc dl(Op); + + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + // When we create a shuffle node we put the UNDEF node to second operand, + // but in some cases the first operand may be transformed to UNDEF. + // In this case we should just commute the node. + if (V1IsUndef) + return DAG.getCommutedVectorShuffle(*SVOp); + + // Check for non-undef masks pointing at an undef vector and make the masks + // undef as well. This makes it easier to match the shuffle based solely on + // the mask. + if (V2IsUndef) + for (int M : Mask) + if (M >= NumElements) { + SmallVector NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); + } + + // For integer vector shuffles, try to collapse them into a shuffle of fewer + // lanes but wider integers. We cap this to not form integers larger than i64 + // but it might be interesting to form i128 integers to handle flipping the + // low and high halves of AVX 256-bit vectors. + if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && + areAdjacentMasksSequential(Mask)) { + SmallVector NewMask; + for (int i = 0, Size = Mask.size(); i < Size; i += 2) + NewMask.push_back(Mask[i] / 2); + MVT NewVT = + MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), + VT.getVectorNumElements() / 2); + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + } + + int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; + for (int M : SVOp->getMask()) + if (M < 0) + ++NumUndefElements; + else if (M < NumElements) + ++NumV1Elements; + else + ++NumV2Elements; + + // Commute the shuffle as needed such that more elements come from V1 than + // V2. This allows us to match the shuffle pattern strictly on how many + // elements come from V1 without handling the symmetric cases. + if (NumV2Elements > NumV1Elements) + return DAG.getCommutedVectorShuffle(*SVOp); + + // When the number of V1 and V2 elements are the same, try to minimize the + // number of uses of V2 in the low half of the vector. + if (NumV1Elements == NumV2Elements) { + int LowV1Elements = 0, LowV2Elements = 0; + for (int M : SVOp->getMask().slice(0, NumElements / 2)) + if (M >= NumElements) + ++LowV2Elements; + else if (M >= 0) + ++LowV1Elements; + if (LowV2Elements > LowV1Elements) + return DAG.getCommutedVectorShuffle(*SVOp); + } + + // For each vector width, delegate to a specialized lowering routine. + if (VT.getSizeInBits() == 128) + return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + llvm_unreachable("Unimplemented!"); +} + + +//===----------------------------------------------------------------------===// +// Legacy vector shuffle lowering +// +// This code is the legacy code handling vector shuffles until the above +// replaces its functionality and performance. +//===----------------------------------------------------------------------===// + +static bool isBlendMask(ArrayRef MaskVals, MVT VT, bool hasSSE41, + bool hasInt256, unsigned *MaskOut = nullptr) { + MVT EltVT = VT.getVectorElementType(); + + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return false; + + if (!hasSSE41 || EltVT == MVT::i8) + return false; + if (!hasInt256 && VT == MVT::v16i16) + return false; + + unsigned MaskValue = 0; + unsigned NumElems = VT.getVectorNumElements(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + + int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; + int EltIdx = MaskVals[i]; + + if ((EltIdx < 0 || EltIdx == (int)i) && + (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) + continue; + + if (((unsigned)EltIdx == (i + NumElems)) && + (SndLaneEltIdx < 0 || + (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) + MaskValue |= (1 << i); + else + return false; + } + + if (MaskOut) + *MaskOut = MaskValue; + return true; +} + +// Try to lower a shuffle node into a simple blend instruction. +// This function assumes isBlendMask returns true for this +// SuffleVectorSDNode +static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, + unsigned MaskValue, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = SVOp->getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), + Subtarget->hasInt256() && "Trying to lower a " + "VECTOR_SHUFFLE to a Blend but " + "with the wrong mask")); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + SDLoc dl(SVOp); + unsigned NumElems = VT.getVectorNumElements(); + + // Convert i32 vectors to floating point if it is not AVX2. + // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. + MVT BlendVT = VT; + if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, + DAG.getConstant(MaskValue, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + +/// In vector type \p VT, return true if the element at index \p InputIdx +/// falls on a different 128-bit lane than \p OutputIdx. +static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, + unsigned OutputIdx) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; +} + +/// Generate a PSHUFB if possible. Selects elements from \p V1 according to +/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to +/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p +/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a +/// zero. +static SDValue getPSHUFB(ArrayRef MaskVals, SDValue V1, SDLoc &dl, + SelectionDAG &DAG) { + MVT VT = V1.getSimpleValueType(); + assert(VT.is128BitVector() || VT.is256BitVector()); + + MVT EltVT = VT.getVectorElementType(); + unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; + unsigned NumElts = VT.getVectorNumElements(); + + SmallVector PshufbMask; + for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { + int InputIdx = MaskVals[OutputIdx]; + unsigned InputByteIdx; + + if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) + InputByteIdx = 0x80; + else { + // Cross lane is not allowed. + if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) + return SDValue(); + InputByteIdx = InputIdx * EltSizeInBytes; + // Index is an byte offset within the 128-bit lane. + InputByteIdx &= 0xf; + } + + for (unsigned j = 0; j < EltSizeInBytes; ++j) { + PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); + if (InputByteIdx != 0x80) + ++InputByteIdx; + } + } MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); if (ShufVT != VT) @@ -7726,12 +9074,13 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } + unsigned SrcIndex = Mask[DestIndex] % 4; if (MayFoldLoad(From)) { // Trivial case, when From comes from a load and is only used by the // shuffle. Make it use insertps from the vector that we need from that // load. SDValue NewLoad = - NarrowVectorLoadToElement(cast(From), DestIndex, DAG); + NarrowVectorLoadToElement(cast(From), SrcIndex, DAG); if (!NewLoad.getNode()) return SDValue(); @@ -7752,7 +9101,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, } // Vector-element-to-vector - unsigned SrcIndex = Mask[DestIndex] % 4; SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); } @@ -7919,6 +9267,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool OptForSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + // Check if we should use the experimental vector shuffle lowering. If so, + // delegate completely to that code path. + if (ExperimentalVectorShuffleLowering) + return lowerVectorShuffle(Op, Subtarget, DAG); + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); if (V1IsUndef && V2IsUndef) @@ -7928,7 +9281,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // Vector shuffle lowering takes 3 steps: // @@ -8040,7 +9393,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ShouldXformToMOVHLPS(M, VT) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); if (isShift) { // No better options. Use a vshldq / vsrldq. @@ -8052,8 +9405,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool Commuted = false; // FIXME: This should also accept a bitcast of a splat? Be careful, not // 1,1,1,1 -> v8i16 though. - V1IsSplat = isSplatVector(V1.getNode()); - V2IsSplat = isSplatVector(V2.getNode()); + BitVector UndefElements; + if (auto *BVOp = dyn_cast(V1.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V1IsSplat = true; + if (auto *BVOp = dyn_cast(V2.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V2IsSplat = true; // Canonicalize the splat or undef, if present, to be on the RHS. if (!V2IsUndef && V1IsSplat && !V2IsSplat) { @@ -8107,7 +9465,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Normalize the node to match x86 shuffle ops if needed if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific @@ -8129,6 +9487,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePSHUFLWImmediate(SVOp), DAG); + unsigned MaskValue; + if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), + &MaskValue)) + return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); + if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); @@ -8166,11 +9529,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - unsigned MaskValue; - if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), - &MaskValue)) - return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); @@ -12431,11 +13789,37 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); - } + } + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx2_packusdw: + return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshuf_d: + return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufl_w: + return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + case Intrinsic::x86_sse2_pshufh_w: + return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_ssse3_psign_b_128: @@ -12885,6 +14269,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } +// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that +// read performance monitor counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, SDLoc DL, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the performance counter + // to read. + SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, + N->getOperand(2)); + SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); + + // Reads the content of a 64-bit performance counter and returns it in the + // registers EDX:EAX. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget->is64Bit()) { + // The EAX register is loaded with the low-order 32 bits. The EDX register + // is loaded with the supported high-order bits of the counter. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is // also used to custom lower READCYCLECOUNTER nodes. @@ -12949,7 +14378,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, } enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST }; struct IntrinsicData { @@ -13043,6 +14472,8 @@ static void InitIntinsicsMap() { IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, + IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); Initialized = true; } @@ -13118,6 +14549,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } + // Read Performance Monitoring Counters. + case RDPMC: { + SmallVector Results; + getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -13706,7 +15143,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, &Args, 0) + Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); @@ -13722,10 +15159,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || (VT == MVT::v8i32 && Subtarget->hasInt256())); - // Get the high parts. - const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; - SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); - SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + // PMULxD operations multiply each even value (starting at 0) of LHS with + // the related value of RHS and produce a widen result. + // E.g., PMULUDQ <4 x i32> , <4 x i32> + // => <2 x i64> + // + // In other word, to have all the results, we need to perform two PMULxD: + // 1. one with the even values. + // 2. one with the odd values. + // To achieve #2, with need to place the odd values at an even position. + // + // Place the odd value at an even position (basically, shift all values 1 + // step to the left): + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; + // => + SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + // => + SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. @@ -13733,16 +15183,41 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; + // PMULUDQ <4 x i32> , <4 x i32> + // => <2 x i64> SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + // PMULUDQ <4 x i32> , <4 x i32> + // => <2 x i64> SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); + DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. - const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; - SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; - SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + // The internal representation is big endian. + // In other words, a i64 bitcasted to 2 x i32 has its high part at index 0 + // and its low part at index 1. + // Moreover, we have: Mul1 = ; Mul2 = + // Vector index 0 1 ; 2 3 + // We want + // Vector index 0 2 1 3 + // Since each element is seen as 2 x i32, we get: + // high_mask[i] = 2 x vector_index[i] + // low_mask[i] = 2 x vector_index[i] + 1 + // where vector_index = {0, Size/2, 1, Size/2 + 1, ..., + // Size/2 - 1, Size/2 + Size/2 - 1} + // where Size is the number of element of the final vector. + SDValue Highs, Lows; + if (VT == MVT::v8i32) { + const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } else { + const int HighMask[] = {0, 4, 2, 6}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {1, 5, 3, 7}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. @@ -13758,7 +15233,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); + // The low part of a MUL_LOHI is supposed to be the first value and the + // high part the second value. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -13769,10 +15246,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Amt = Op.getOperand(1); // Optimize shl/srl/sra with constant shift amount. - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast(SclrAmt)) { - uint64_t ShiftAmt = C->getZExtValue(); + if (auto *BVAmt = dyn_cast(Amt)) { + if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { + uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && @@ -14079,15 +15555,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); SDValue V; - if (!Subtarget->hasSSE2()) - return SDValue(); + assert(VT.isVector() && "Custom lowering only for vector shifts!"); + assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); V = LowerScalarImmediateShift(Op, DAG, Subtarget); if (V.getNode()) @@ -14529,7 +16004,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, - Op.getOperand(2), SDValue()); + Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), @@ -14539,9 +16014,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); + SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); - return cpOut; + SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, + MVT::i32, cpOut.getValue(2)); + SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + + DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); + return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, @@ -14697,7 +16181,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair CallResult = TLI.LowerCallTo(CLI); @@ -14721,7 +16205,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -14803,8 +16288,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl &Results, - SelectionDAG &DAG) { + SmallVectorImpl &Results, + SelectionDAG &DAG) { SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); @@ -14813,39 +16298,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, // (The only way to get a 16-byte load is cmpxchg16b) // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. SDValue Zero = DAG.getConstant(0, VT); - SDVTList VTs = DAG.getVTList(VT, MVT::Other); + SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); SDValue Swap = - DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP, dl, VT, VTs, + DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, Node->getOperand(0), Node->getOperand(1), Zero, Zero, cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(1)); -} - -static void -ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, - SelectionDAG &DAG, unsigned NewOp) { - SDLoc dl(Node); - assert (Node->getValueType(0) == MVT::i64 && - "Only know how to expand i64 atomics"); - - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0)); - SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1)); - SDValue Ops[] = { Chain, In1, In2L, In2H }; - SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, - cast(Node)->getMemOperand()); - SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); - Results.push_back(Result.getValue(2)); + Results.push_back(Swap.getValue(2)); } /// ReplaceNodeResults - Replace a node with an illegal result type @@ -14932,13 +16394,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); + case Intrinsic::x86_rdpmc: + return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } - case ISD::ATOMIC_CMP_SWAP: { + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; @@ -14980,61 +16444,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); - Results.push_back(cpOutH.getValue(1)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_SWAP: { - unsigned Opc; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::ATOMIC_LOAD_ADD: - Opc = X86ISD::ATOMADD64_DAG; - break; - case ISD::ATOMIC_LOAD_AND: - Opc = X86ISD::ATOMAND64_DAG; - break; - case ISD::ATOMIC_LOAD_NAND: - Opc = X86ISD::ATOMNAND64_DAG; - break; - case ISD::ATOMIC_LOAD_OR: - Opc = X86ISD::ATOMOR64_DAG; - break; - case ISD::ATOMIC_LOAD_SUB: - Opc = X86ISD::ATOMSUB64_DAG; - break; - case ISD::ATOMIC_LOAD_XOR: - Opc = X86ISD::ATOMXOR64_DAG; - break; - case ISD::ATOMIC_LOAD_MAX: - Opc = X86ISD::ATOMMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_MIN: - Opc = X86ISD::ATOMMIN64_DAG; - break; - case ISD::ATOMIC_LOAD_UMAX: - Opc = X86ISD::ATOMUMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_UMIN: - Opc = X86ISD::ATOMUMIN64_DAG; - break; - case ISD::ATOMIC_SWAP: - Opc = X86ISD::ATOMSWAP64_DAG; - break; - } - ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); - return; - } + case ISD::ATOMIC_LOAD_UMAX: + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by X86AtomicExpand.cpp. + break; case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); return; @@ -15055,6 +16491,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + if (ExperimentalVectorWideningLegalization) { + // If we are legalizing vectors by widening, we already have the desired + // legal vector type, just return it. + Results.push_back(ToVecInt); + return; + } + SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, @@ -15087,6 +16530,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; + case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; @@ -15141,12 +16585,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; - case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; - case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; - case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; - case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; - case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; - case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; @@ -15187,6 +16625,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::PACKSS: return "X86ISD::PACKSS"; + case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -15443,6 +16883,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || + isMOVHLPSMask(M, SVT) || isSHUFPMask(M, SVT) || isPSHUFDMask(M, SVT) || isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || @@ -15455,760 +16896,81 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); } -bool -X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, - EVT VT) const { - if (!VT.isSimple()) - return false; - - MVT SVT = VT.getSimpleVT(); - unsigned NumElts = SVT.getVectorNumElements(); - // FIXME: This collection of masks seems suspect. - if (NumElts == 2) - return true; - if (NumElts == 4 && SVT.is128BitVector()) { - return (isMOVLMask(Mask, SVT) || - isCommutedMOVLMask(Mask, SVT, true) || - isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true)); - } - return false; -} - -//===----------------------------------------------------------------------===// -// X86 Scheduler Hooks -//===----------------------------------------------------------------------===// - -/// Utility function to emit xbegin specifying the start of an RTM region. -static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, - const TargetInstrInfo *TII) { - DebugLoc DL = MI->getDebugLoc(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - // For the v = xbegin(), we generate - // - // thisMBB: - // xbegin sinkMBB - // - // mainMBB: - // eax = -1 - // - // sinkMBB: - // v = eax - - MachineBasicBlock *thisMBB = MBB; - MachineFunction *MF = MBB->getParent(); - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - // xbegin sinkMBB - // # fallthrough to mainMBB - // # abortion to sinkMBB - BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); - thisMBB->addSuccessor(mainMBB); - thisMBB->addSuccessor(sinkMBB); - - // mainMBB: - // EAX = -1 - BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - // EAX is live into the sinkMBB - sinkMBB->addLiveIn(X86::EAX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::EAX); - - MI->eraseFromParent(); - return sinkMBB; -} - -// Get CMPXCHG opcode for the specified data type. -static unsigned getCmpXChgOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::LCMPXCHG8; - case MVT::i16: return X86::LCMPXCHG16; - case MVT::i32: return X86::LCMPXCHG32; - case MVT::i64: return X86::LCMPXCHG64; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get LOAD opcode for the specified data type. -static unsigned getLoadOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::MOV8rm; - case MVT::i16: return X86::MOV16rm; - case MVT::i32: return X86::MOV32rm; - case MVT::i64: return X86::MOV64rm; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction. -static unsigned getNonAtomicOpcode(unsigned Opc) { - switch (Opc) { - case X86::ATOMAND8: return X86::AND8rr; - case X86::ATOMAND16: return X86::AND16rr; - case X86::ATOMAND32: return X86::AND32rr; - case X86::ATOMAND64: return X86::AND64rr; - case X86::ATOMOR8: return X86::OR8rr; - case X86::ATOMOR16: return X86::OR16rr; - case X86::ATOMOR32: return X86::OR32rr; - case X86::ATOMOR64: return X86::OR64rr; - case X86::ATOMXOR8: return X86::XOR8rr; - case X86::ATOMXOR16: return X86::XOR16rr; - case X86::ATOMXOR32: return X86::XOR32rr; - case X86::ATOMXOR64: return X86::XOR64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction with -// extra opcode. -static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; - case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; - case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; - case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; - case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; - case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; - case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; - case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; - case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; - case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; - case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; - case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; - case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; - case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; - case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; - case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; - case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; - case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; - case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; - case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target. -static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { - switch (Opc) { - case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; - case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; - case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; - case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; - case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; - case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; - case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; - case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; - case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; - case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target with extra opcode. -static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, - unsigned &HiOpc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND6432: - ExtraOpc = X86::NOT32r; - HiOpc = X86::AND32rr; - return X86::AND32rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get pseudo CMOV opcode from the specified data type. -static unsigned getPseudoCMOVOpc(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::CMOV_GR8; - case MVT::i16: return X86::CMOV_GR16; - case MVT::i32: return X86::CMOV_GR32; - default: - break; - } - llvm_unreachable("Unknown CMOV opcode!"); -} - -// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. -// They will be translated into a spin-loop or compare-exchange loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1 = LOAD MI.addr -// loop: -// t4 = phi(t1, t3 / loop) -// t2 = OP MI.val, t4 -// EAX = t4 -// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] -// t3 = EAX -// JNE loop -// sink: -// dst = t3 -// ... -MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const { - MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - - MachineRegisterInfo &MRI = MF->getRegInfo(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstReg, SrcReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - MVT::SimpleValueType VT = *RC->vt_begin(); - unsigned t1 = MRI.createVirtualRegister(RC); - unsigned t2 = MRI.createVirtualRegister(RC); - unsigned t3 = MRI.createVirtualRegister(RC); - unsigned t4 = MRI.createVirtualRegister(RC); - unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT); - - unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); - unsigned LOADOpc = getLoadOpcode(VT); - - // For the atomic load-arith operator, we generate - // - // thisMBB: - // t1 = LOAD [MI.addr] - // mainMBB: - // t4 = phi(t1 / thisMBB, t3 / mainMBB) - // t1 = OP MI.val, EAX - // EAX = t4 - // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] - // t3 = EAX - // JNE mainMBB - // sinkMBB: - // dst = t3 - - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - MachineInstrBuilder MIB; - - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - } - - thisMBB->addSuccessor(mainMBB); - - // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add a PHI. - MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op opcode!"); - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - case X86::ATOMXOR8: - case X86::ATOMXOR16: - case X86::ATOMXOR32: - case X86::ATOMXOR64: { - unsigned ARITHOpc = getNonAtomicOpcode(Opc); - BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) - .addReg(t4); - break; - } - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: { - unsigned Tmp = MRI.createVirtualRegister(RC); - unsigned NOTOpc; - unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); - BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) - .addReg(t4); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); - break; - } - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: { - unsigned CMPOpc; - unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); - - BuildMI(mainMBB, DL, TII->get(CMPOpc)) - .addReg(SrcReg) - .addReg(t4); - - if (Subtarget->hasCMov()) { - if (VT != MVT::i8) { - // Native support - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) - .addReg(SrcReg) - .addReg(t4); - } else { - // Promote i8 to i32 to use CMOV32 - const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo(); - const TargetRegisterClass *RC32 = - TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); - unsigned SrcReg32 = MRI.createVirtualRegister(RC32); - unsigned AccReg32 = MRI.createVirtualRegister(RC32); - unsigned Tmp = MRI.createVirtualRegister(RC32); - - unsigned Undef = MRI.createVirtualRegister(RC32); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); - - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) - .addReg(Undef) - .addReg(SrcReg) - .addImm(X86::sub_8bit); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) - .addReg(Undef) - .addReg(t4) - .addImm(X86::sub_8bit); - - BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) - .addReg(SrcReg32) - .addReg(AccReg32); - - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2) - .addReg(Tmp, 0, X86::sub_8bit); - } - } else { - // Use pseudo select and lower them. - assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && - "Invalid atomic-load-op transformation!"); - unsigned SelOpc = getPseudoCMOVOpc(VT); - X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); - assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); - MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) - .addReg(SrcReg).addReg(t4) - .addImm(CC); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - Phi->eraseFromParent(); - } - break; - } - } - - // Copy PhyReg back from virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) - .addReg(t4); +bool +X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, + EVT VT) const { + if (!VT.isSimple()) + return false; - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); + MVT SVT = VT.getSimpleVT(); + unsigned NumElts = SVT.getVectorNumElements(); + // FIXME: This collection of masks seems suspect. + if (NumElts == 2) + return true; + if (NumElts == 4 && SVT.is128BitVector()) { + return (isMOVLMask(Mask, SVT) || + isCommutedMOVLMask(Mask, SVT, true) || + isSHUFPMask(Mask, SVT) || + isSHUFPMask(Mask, SVT, /* Commuted */ true)); } - MIB.addReg(t2); - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy PhyReg back to virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) - .addReg(PhyReg); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstReg) - .addReg(t3); - - MI->eraseFromParent(); - return sinkMBB; + return false; } -// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic -// instructions. They will be translated into a spin-loop or compare-exchange -// loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1L = LOAD [MI.addr + 0] -// t1H = LOAD [MI.addr + 4] -// loop: -// t4L = phi(t1L, t3L / loop) -// t4H = phi(t1H, t3H / loop) -// t2L = OP MI.val.lo, t4L -// t2H = OP MI.val.hi, t4H -// EAX = t4L -// EDX = t4H -// EBX = t2L -// ECX = t2H -// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] -// t3L = EAX -// t3H = EDX -// JNE loop -// sink: -// dstL = t3L -// dstH = t3H -// ... -MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const { - MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// - MachineRegisterInfo &MRI = MF->getRegInfo(); +/// Utility function to emit xbegin specifying the start of an RTM region. +static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op32 to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstLoReg, DstHiReg; - unsigned SrcLoReg, SrcHiReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstLoReg = MI->getOperand(CurOp++).getReg(); - DstHiReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcLoReg = MI->getOperand(CurOp++).getReg(); - SrcHiReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const TargetRegisterClass *RC8 = &X86::GR8RegClass; - - unsigned t1L = MRI.createVirtualRegister(RC); - unsigned t1H = MRI.createVirtualRegister(RC); - unsigned t2L = MRI.createVirtualRegister(RC); - unsigned t2H = MRI.createVirtualRegister(RC); - unsigned t3L = MRI.createVirtualRegister(RC); - unsigned t3H = MRI.createVirtualRegister(RC); - unsigned t4L = MRI.createVirtualRegister(RC); - unsigned t4H = MRI.createVirtualRegister(RC); - - unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; - unsigned LOADOpc = X86::MOV32rm; - - // For the atomic load-arith operator, we generate + // For the v = xbegin(), we generate // - // thisMBB: - // t1L = LOAD [MI.addr + 0] - // t1H = LOAD [MI.addr + 4] - // mainMBB: - // t4L = phi(t1L / thisMBB, t3L / mainMBB) - // t4H = phi(t1H / thisMBB, t3H / mainMBB) - // t2L = OP MI.val.lo, t4L - // t2H = OP MI.val.hi, t4H - // EBX = t2L - // ECX = t2H - // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] - // t3L = EAX - // t3H = EDX - // JNE loop - // sinkMBB: - // dstL = t3L - // dstH = t3H + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax MachineBasicBlock *thisMBB = MBB; + MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); - MachineInstrBuilder MIB; - // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: - // Lo - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - }; - MachineInstr *LowMI = MIB; - - // Hi - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) { - MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) - } else { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - } - MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); - + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add PHIs. - MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) - .addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) - .addReg(SrcHiReg); - break; - } - case X86::ATOMNAND6432: { - unsigned HiOpc, NOTOpc; - unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); - unsigned TmpL = MRI.createVirtualRegister(RC); - unsigned TmpH = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) - .addReg(t4L); - BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) - .addReg(t4H); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); - break; - } - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - unsigned cL = MRI.createVirtualRegister(RC8); - unsigned cH = MRI.createVirtualRegister(RC8); - unsigned cL32 = MRI.createVirtualRegister(RC); - unsigned cH32 = MRI.createVirtualRegister(RC); - unsigned cc = MRI.createVirtualRegister(RC); - // cl := cmp src_lo, lo - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(LoOpc), cL); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); - // ch := cmp src_hi, hi - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcHiReg).addReg(t4H); - BuildMI(mainMBB, DL, TII->get(HiOpc), cH); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); - // cc := if (src_hi == hi) ? cl : ch; - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) - .addReg(cH32).addReg(cL32); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) - .addReg(cH32).addReg(cL32) - .addImm(X86::COND_E); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - } - BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) - .addReg(SrcHiReg).addReg(t4H); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) - .addReg(SrcLoReg).addReg(t4L) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the - // 2nd CMOV lowering. - mainMBB->addLiveIn(X86::EFLAGS); - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) - .addReg(SrcHiReg).addReg(t4H) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - PhiL->eraseFromParent(); - PhiH->eraseFromParent(); - } - break; - } - case X86::ATOMSWAP6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); - break; - } - } - - // Copy EDX:EAX back from HiReg:LoReg - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); - // Copy ECX:EBX from t1H:t1L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy EDX:EAX back to t3H:t3L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); mainMBB->addSuccessor(sinkMBB); // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstLoReg) - .addReg(t3L); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstHiReg) - .addReg(t3H); + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); MI->eraseFromParent(); return sinkMBB; @@ -17423,62 +18185,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::XBEGIN: return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); - // Atomic Lowering. - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - // Fall through - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - // Fall through - case X86::ATOMXOR16: - case X86::ATOMXOR8: - case X86::ATOMXOR32: - case X86::ATOMXOR64: - // Fall through - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: - // Fall through - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - // Fall through - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - // Fall through - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - // Fall through - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: - return EmitAtomicLoadArith(MI, BB); - - // This group does 64-bit operations on a 32-bit host. - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMNAND6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: - case X86::ATOMSWAP6432: - return EmitAtomicLoadArith6432(MI, BB); - case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -17750,6 +18456,333 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Get the PSHUF-style mask from PSHUF node. +/// +/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 +/// PSHUF-style masks that can be reused with such instructions. +static SmallVector getPSHUFShuffleMask(SDValue N) { + SmallVector Mask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + (void)HaveMask; + assert(HaveMask); + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + return Mask; + case X86ISD::PSHUFLW: + Mask.resize(4); + return Mask; + case X86ISD::PSHUFHW: + Mask.erase(Mask.begin(), Mask.begin() + 4); + for (int &M : Mask) + M -= 4; + return Mask; + default: + llvm_unreachable("No valid shuffle instruction found!"); + } +} + +/// \brief Search for a combinable shuffle across a chain ending in pshufd. +/// +/// We walk up the chain and look for a combinable shuffle, skipping over +/// shuffles that we could hoist this shuffle's transformation past without +/// altering anything. +static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N.getOpcode() == X86ISD::PSHUFD && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFD: + // Found another dword shuffle. + break; + + case X86ISD::PSHUFLW: + // Check that the low words (being shuffled) are the identity in the + // dword shuffle, and the high words are self-contained. + if (Mask[0] != 0 || Mask[1] != 1 || + !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) + return false; + + continue; + + case X86ISD::PSHUFHW: + // Check that the high words (being shuffled) are the identity in the + // dword shuffle, and the low words are self-contained. + if (Mask[2] != 2 || Mask[3] != 3 || + !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) + return false; + + continue; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword + // shuffle into a preceding word shuffle. + if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) + return false; + + // Search for a half-shuffle which we can combine with. + unsigned CombineOp = + V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + if (V.getOperand(0) != V.getOperand(1) || + !V->isOnlyUserOf(V.getOperand(0).getNode())) + return false; + V = V.getOperand(0); + do { + switch (V.getOpcode()) { + default: + return false; // Nothing to combine. + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOp) + break; + + // Fallthrough! + case ISD::BITCAST: + V = V.getOperand(0); + continue; + } + break; + } while (V.hasOneUse()); + break; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask. + SmallVector VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // It is possible that one of the combinable shuffles was completely absorbed + // by the other, just replace it and revisit all users in that case. + if (Old.getNode() == V.getNode()) { + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); + return true; + } + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. +/// +/// We walk up the chain, skipping shuffles of the other half and looking +/// through shuffles which switch halves trying to find a shuffle of the same +/// pair of dwords. +static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert( + (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + unsigned CombineOpcode = N.getOpcode(); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOpcode) + break; + + // Other-half shuffles are no-ops. + continue; + + case X86ISD::PSHUFD: { + // We can only handle pshufd if the half we are combining either stays in + // its half, or switches to the other half. Bail if one of these isn't + // true. + SmallVector VMask = getPSHUFShuffleMask(V); + int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; + if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || + (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) + return false; + + // Map the mask through the pshufd and keep walking up the chain. + for (int i = 0; i < 4; ++i) + Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; + + // Switch halves if the pshufd does. + CombineOpcode = + VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + continue; + } + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask (adjusted to account for all + // the pshufd instructions encountered). + SmallVector VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + default: + return SDValue(); + } + + // Nuke no-op shuffles that show up after combining. + if (isNoopShuffleMask(Mask)) + return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Look for simplifications involving one or two shuffle instructions. + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT == MVT::v8i16); + (void)VT; + + if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle, so we're done. + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. + if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && + areAdjacentMasksSequential(Mask)) { + int DMask[] = {-1, -1, -1, -1}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + Mask[0] / 2; + DMask[DOffset + 1] = DOffset + Mask[2] / 2; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + getV4X86ShuffleImm8ForMask(DMask, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + } + + // Look for shuffle patterns which can be implemented as a single unpack. + // FIXME: This doesn't handle the location of the PSHUFD generically, and + // only works when we have a PSHUFD followed by two half-shuffles. + if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && + (V.getOpcode() == X86ISD::PSHUFLW || + V.getOpcode() == X86ISD::PSHUFHW) && + V.getOpcode() != N.getOpcode() && + V.hasOneUse()) { + SDValue D = V.getOperand(0); + while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) + D = D.getOperand(0); + if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + SmallVector VMask = getPSHUFShuffleMask(V); + SmallVector DMask = getPSHUFShuffleMask(D); + int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int WordMask[8]; + for (int i = 0; i < 4; ++i) { + WordMask[i + NOffset] = Mask[i] + NOffset; + WordMask[i + VOffset] = VMask[i] + VOffset; + } + // Map the word mask through the DWord mask. + int MappedMask[8]; + for (int i = 0; i < 8; ++i) + MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; + const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; + const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; + if (std::equal(std::begin(MappedMask), std::end(MappedMask), + std::begin(UnpackLoMask)) || + std::equal(std::begin(MappedMask), std::end(MappedMask), + std::begin(UnpackHiMask))) { + // We can replace all three shuffles with an unpack. + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL + : X86ISD::UNPCKH, + DL, MVT::v8i16, V, V); + } + } + } + + break; + + case X86ISD::PSHUFD: + if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle. + + break; + } + + return SDValue(); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -17831,7 +18864,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + if (LD.getNode()) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + return SDValue(); } /// PerformTruncateCombine - Converts truncate operation to @@ -18485,28 +19529,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); - // If the RHS is a constant we have to reverse the const canonicalization. - // x > C-1 ? x+-C : 0 --> subus x, C - if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { - APInt A = cast(OpRHS.getOperand(0))->getAPIntValue(); - if (CondRHS.getConstantOperandVal(0) == -A-1) - return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, - DAG.getConstant(-A, VT)); - } - - // Another special case: If C was a sign bit, the sub has been - // canonicalized into a xor. - // FIXME: Would it be better to use computeKnownBits to determine whether - // it's safe to decanonicalize the xor? - // x s< 0 ? x^C : 0 --> subus x, C - if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && - ISD::isBuildVectorAllZeros(CondRHS.getNode()) && - isSplatVector(OpRHS.getNode())) { - APInt A = cast(OpRHS.getOperand(0))->getAPIntValue(); - if (A.isSignBit()) - return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); - } + if (auto *OpRHSBV = dyn_cast(OpRHS)) + if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { + if (auto *CondRHSBV = dyn_cast(CondRHS)) + if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) + // If the RHS is a constant we have to reverse the const + // canonicalization. + // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + CondRHSConst->getAPIntValue() == + (-OpRHSConst->getAPIntValue() - 1)) + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use computeKnownBits to determine + // whether it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + OpRHSConst->getAPIntValue().isSignBit()) + // Note that we have to rebuild the RHS constant here to ensure we + // don't rely on particular values of undef lanes. + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); + } } } @@ -19073,6 +20123,8 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, if (C->isAllOnesValue()) return Op1; } + + return SDValue(); } // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. @@ -19212,16 +20264,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V - if (isSplatVector(N1.getNode())) { - assert(N0.getValueType().isVector() && "Invalid vector shift type"); - ConstantSDNode *N1C = dyn_cast(N1->getOperand(0)); - // We shift all of the values by one. In many cases we do not have - // hardware support for this operation. This is better expressed as an ADD - // of two values. - if (N1C && (1 == N1C->getZExtValue())) { - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + if (auto *N1BV = dyn_cast(N1)) + if (auto *N1SplatC = N1BV->getConstantSplatNode()) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1SplatC->getZExtValue() == 1) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } - } return SDValue(); } @@ -19240,10 +20291,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, SDValue Amt = N->getOperand(1); SDLoc DL(N); - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast(SclrAmt)) { - APInt ShiftAmt = C->getAPIntValue(); + if (auto *AmtBV = dyn_cast(Amt)) + if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { + APInt ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s @@ -19253,7 +20303,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (ShiftAmt.trunc(8).uge(MaxAmount)) return getZeroVector(VT, Subtarget, DAG, DL); } - } return SDValue(); } @@ -19447,9 +20496,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; - bool RHSConst = (isSplatVector(N1.getNode()) && - isa(N1->getOperand(0))); - if (!RHSTrunc && !RHSConst) + ConstantSDNode *RHSConstSplat = nullptr; + if (auto *RHSBV = dyn_cast(N1)) + RHSConstSplat = RHSBV->getConstantSplatNode(); + if (!RHSTrunc && !RHSConstSplat) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -19459,9 +20509,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); - if (RHSConst) { + if (RHSConstSplat) { N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), - N1->getOperand(0)); + SDValue(RHSConstSplat, 0)); SmallVector C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { @@ -19607,12 +20657,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); unsigned SraAmt = ~0; if (Mask.getOpcode() == ISD::SRA) { - SDValue Amt = Mask.getOperand(1); - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast(SclrAmt)) - SraAmt = C->getZExtValue(); - } + if (auto *AmtBV = dyn_cast(Mask.getOperand(1))) + if (auto *AmtConst = AmtBV->getConstantSplatNode()) + SraAmt = AmtConst->getZExtValue(); } else if (Mask.getOpcode() == X86ISD::VSRAI) { SDValue SraC = Mask.getOperand(1); SraAmt = cast(SraC)->getZExtValue(); @@ -20749,8 +21796,59 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, + SelectionDAG &DAG) { + // Take advantage of vector comparisons producing 0 or -1 in each lane to + // optimize away operation when it's from a constant. + // + // The general transformation is: + // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> + // AND(VECTOR_CMP(x,y), constant2) + // constant2 = UNARYOP(constant) + + // Early exit if this isn't a vector operation or if the operand of the + // unary operation isn't a bitwise AND. + EVT VT = N->getValueType(0); + if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || + N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC) + return SDValue(); + + // Now check that the other operand of the AND is a constant splat. We could + // make the transformation for non-constant splats as well, but it's unclear + // that would be a benefit as it would not eliminate any operations, just + // perform one more step in scalar code before moving to the vector unit. + if (BuildVectorSDNode *BV = + dyn_cast(N->getOperand(0)->getOperand(1))) { + // Bail out if the vector isn't a constant splat. + if (!BV->getConstantSplatNode()) + return SDValue(); + + // Everything checks out. Build up the new and improved node. + SDLoc DL(N); + EVT IntVT = BV->getValueType(0); + // Create a new constant of the appropriate type for the transformed + // DAG. + SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + // The AND node needs bitcasts to/from an integer vector type around it. + SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, + N->getOperand(0)->getOperand(0), MaskConst); + SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); + return Res; + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { + // First try to optimize away the conversion entirely when it's + // conditionally from a constant. Vectors only. + SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); + if (Res != SDValue()) + return Res; + + // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT InVT = Op0->getValueType(0);