X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=60ecf3f43f6a36c79289b5304ad3dd18edd56580;hb=9a2478ac1a9aafcd5e89808868e170cfdfefcdc1;hp=68cd44116d488785dc157fe16765265e49de1bca;hpb=ed4c8c633c52a40ad1a3e8687f290be4aeb1f0e8;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 68cd44116d4..60ecf3f43f6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -45,6 +45,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/VariadicFunction.h" #include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" @@ -56,6 +57,9 @@ using namespace dwarf; STATISTIC(NumTailCalls, "Number of tail calls"); +static cl::opt UseRegMask("x86-use-regmask", + cl::desc("Use register masks for x86 calls")); + // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -1728,7 +1732,7 @@ static bool IsTailCallConvention(CallingConv::ID CC) { } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) + if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); @@ -1807,6 +1811,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); + bool IsWindows = Subtarget->isTargetWindows(); bool IsWin64 = Subtarget->isTargetWin64(); assert(!(isVarArg && IsTailCallConvention(CallConv)) && @@ -2042,7 +2047,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) + if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + ArgsAreStructReturn(Ins)) FuncInfo->setBytesToPopOnReturn(4); } @@ -2126,9 +2132,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWindows = Subtarget->isTargetWindows(); bool IsStructRet = CallIsStructReturn(Outs); bool IsSibcall = false; + if (MF.getTarget().Options.DisableTailCalls) + isTailCall = false; + if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, @@ -2506,6 +2516,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (Is64Bit && isVarArg && !IsWin64) Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); + // Experimental: Add a register mask operand representing the call-preserved + // registers. 
+ if (UseRegMask) { + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + Ops.push_back(DAG.getRegisterMask(Mask)); + } + if (InFlag.getNode()) Ops.push_back(InFlag); @@ -2528,10 +2546,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) + else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + IsStructRet) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. + // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPush = 4; else NumBytesForCalleeToPush = 0; // Callee pops nothing. @@ -3265,17 +3285,35 @@ static bool isPALIGNRMask(ArrayRef Mask, EVT VT, bool hasSSSE3) { return true; } -/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPSY. -static bool isVSHUFPYMask(ArrayRef Mask, EVT VT, - bool HasAVX, bool Commuted = false) { - int NumElems = VT.getVectorNumElements(); +/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming +/// the two vector operands have swapped position. +static void CommuteVectorShuffleMask(SmallVectorImpl &Mask, + unsigned NumElems) { + for (unsigned i = 0; i != NumElems; ++i) { + int idx = Mask[i]; + if (idx < 0) + continue; + else if (idx < (int)NumElems) + Mask[i] = idx + NumElems; + else + Mask[i] = idx - NumElems; + } +} - if (!HasAVX || VT.getSizeInBits() != 256) +/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 128/256-bit +/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be +/// reverse of what x86 shuffles want. +static bool isSHUFPMask(ArrayRef Mask, EVT VT, bool HasAVX, + bool Commuted = false) { + if (!HasAVX && VT.getSizeInBits() == 256) return false; - if (NumElems != 4 && NumElems != 8) + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElems = NumElems/NumLanes; + + if (NumLaneElems != 2 && NumLaneElems != 4) return false; // VSHUFPSY divides the resulting vector into 4 chunks. @@ -3297,101 +3335,28 @@ static bool isVSHUFPYMask(ArrayRef Mask, EVT VT, // // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // - unsigned QuarterSize = NumElems/4; - unsigned HalfSize = QuarterSize*2; - for (unsigned l = 0; l != 2; ++l) { - unsigned LaneStart = l*HalfSize; - for (unsigned s = 0; s != 2; ++s) { - unsigned QuarterStart = s*QuarterSize; - unsigned Src = (Commuted) ? (1-s) : s; - unsigned SrcStart = Src*NumElems + LaneStart; - for (unsigned i = 0; i != QuarterSize; ++i) { - int Idx = Mask[i+QuarterStart+LaneStart]; - if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize)) - return false; - // For VSHUFPSY, the mask of the second half must be the same as the - // first but with the appropriate offsets. This works in the same way as - // VPERMILPS works with masks. 
- if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0) - continue; - if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+LaneStart)) - return false; - } + unsigned HalfLaneElems = NumLaneElems/2; + for (unsigned l = 0; l != NumElems; l += NumLaneElems) { + for (unsigned i = 0; i != NumLaneElems; ++i) { + int Idx = Mask[i+l]; + unsigned RngStart = l + ((Commuted == (igetValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types"); - assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types"); - - unsigned HalfSize = NumElems/2; - unsigned Mul = (NumElems == 8) ? 2 : 1; - unsigned Mask = 0; - for (unsigned i = 0; i != NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (Elt < 0) - continue; - Elt %= HalfSize; - unsigned Shamt = i; - // For VSHUFPSY, the mask of the first half must be equal to the second one. - if (NumElems == 8) Shamt %= HalfSize; - Mask |= Elt << (Shamt*Mul); - } - - return Mask; -} - -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. -static void CommuteVectorShuffleMask(SmallVectorImpl &Mask, - unsigned NumElems) { - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - -/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128-bit -/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be -/// reverse of what x86 shuffles want. -static bool isSHUFPMask(ArrayRef Mask, EVT VT, bool Commuted = false) { - unsigned NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) - return false; - - if (NumElems != 2 && NumElems != 4) - return false; - - unsigned Half = NumElems / 2; - unsigned SrcStart = Commuted ? NumElems : 0; - for (unsigned i = 0; i != Half; ++i) - if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) - return false; - SrcStart = Commuted ? 0 : NumElems; - for (unsigned i = Half; i != NumElems; ++i) - if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) - return false; - - return true; -} - -bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { - return ::isSHUFPMask(N->getMask(), N->getValueType(0)); +bool X86::isSHUFPMask(ShuffleVectorSDNode *N, bool HasAVX) { + return ::isSHUFPMask(N->getMask(), N->getValueType(0), HasAVX); } /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand @@ -3745,17 +3710,16 @@ static bool isVPERMILPMask(ArrayRef Mask, EVT VT, bool HasAVX) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned LaneSize = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - unsigned LaneStart = l*LaneSize; + for (unsigned l = 0; l != NumElts; l += LaneSize) { for (unsigned i = 0; i != LaneSize; ++i) { - if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize)) + if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; - if (NumElts == 4 || l == 0) + if (NumElts != 8 || l == 0) continue; // VPERMILPS handling if (Mask[i] < 0) continue; - if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneStart)) + if (!isUndefOrEqual(Mask[i+l], Mask[i]+l)) return false; } } @@ -3952,20 +3916,33 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 
-unsigned X86::getShuffleSHUFImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast(N); - unsigned NumOperands = SVOp->getValueType(0).getVectorNumElements(); +/// Handles 128-bit and 256-bit. +unsigned X86::getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); - unsigned Shift = (NumOperands == 4) ? 2 : 1; + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for PSHUF/SHUFP"); + + // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate + // independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + assert((NumLaneElts == 2 || NumLaneElts == 4) && + "Only supports 2 or 4 elements per lane"); + + unsigned Shift = (NumLaneElts == 4) ? 1 : 0; unsigned Mask = 0; - for (unsigned i = 0; i != NumOperands; ++i) { - int Val = SVOp->getMaskElt(NumOperands-i-1); - if (Val < 0) Val = 0; - if (Val >= (int)NumOperands) Val -= NumOperands; - Mask |= Val; - if (i != NumOperands - 1) - Mask <<= Shift; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) continue; + Elt %= NumLaneElts; + unsigned ShAmt = i << Shift; + if (ShAmt >= 8) ShAmt -= 8; + Mask |= Elt << ShAmt; } + return Mask; } @@ -6241,6 +6218,13 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); + // If we are accessing the upper part of a YMM register + // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of + // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point + // because the legalization of N did not happen yet. + if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256) + return false; + // Skip one more bit_convert if necessary if (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); @@ -6635,8 +6619,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // Normalize the node to match x86 shuffle ops if needed - if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) || - isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true))) + if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are @@ -6664,7 +6647,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { X86::getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT)) + if (isSHUFPMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, X86::getShuffleSHUFImmediate(SVOp), DAG); @@ -6692,11 +6675,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPS/DY permutations - if (isVSHUFPYMask(M, VT, HasAVX)) - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, - getShuffleVSHUFPYImmediate(SVOp), DAG); - //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. 
FIXME: this isn't true yet, but @@ -10167,8 +10145,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MUL, dl, VT, Op, R); } if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - assert((Subtarget->hasSSE2() || Subtarget->hasAVX()) && - "Need SSE2 for pslli/pcmpeq."); + assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); // a = a << 5; Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, @@ -11108,7 +11085,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, return (VT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, VT) || - isSHUFPMask(M, VT) || + isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || @@ -11129,8 +11106,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, if (NumElts == 4 && VT.getSizeInBits() == 128) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || - isSHUFPMask(Mask, VT) || - isSHUFPMask(Mask, VT, /* Commuted */ true)); + isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || + isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); } return false; } @@ -12868,6 +12845,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); @@ -13144,6 +13122,26 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // If we know that this node is legal then we know that it is going to be + // matched by one of the SSE/AVX BLEND instructions. These instructions only + // depend on the highest bit in each word. Try to use SimplifyDemandedBits + // to simplify previous instructions. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && + !DCI.isBeforeLegalize() && + TLI.isOperationLegal(ISD::VSELECT, VT)) { + unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); } @@ -13409,6 +13407,11 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = Arg; break; } + // Handle the case where the build_vector is all undef + // FIXME: Should DAG allow this? 
+ if (i == NumElts) + return SDValue(); + for (; i != NumElts; ++i) { SDValue Arg = ShAmtOp.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) continue; @@ -14609,7 +14612,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); + case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
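
Note (not part of the patch): the rewritten X86::getShuffleSHUFImmediate packs a single 8-bit SHUFP/PSHUF immediate that every 128-bit lane of an AVX vector must share — each mask element is reduced modulo the per-lane element count, and elements in the upper lane fold back onto immediate bits 0-7. The standalone C++ sketch below reproduces that loop outside of LLVM for an assumed v8f32 VSHUFPS-style mask; the mask values and the helper name laneLocalShufImmediate are illustrative only, not LLVM API.

// Standalone sketch: mirrors the lane-local immediate computation in the
// new getShuffleSHUFImmediate. Not LLVM code; mask and helper name are
// invented for illustration.
#include <cstdio>

static unsigned laneLocalShufImmediate(const int *Mask, unsigned NumElts,
                                       unsigned NumLaneElts) {
  // Two bits per element when a 128-bit lane holds 4 elements (SHUFPS),
  // one bit when it holds 2 (SHUFPD); both lanes share the same immediate.
  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
  unsigned Imm = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    int Elt = Mask[i];
    if (Elt < 0) continue;        // undef elements contribute nothing
    Elt %= NumLaneElts;           // reduce to an in-lane index
    unsigned ShAmt = i << Shift;
    if (ShAmt >= 8) ShAmt -= 8;   // upper 128-bit lane reuses bits 0-7
    Imm |= Elt << ShAmt;
  }
  return Imm;
}

int main() {
  // v8f32 shuffle <0,1,8,9,4,5,12,13>: per 128-bit lane, take elements
  // {0,1} of the first source and {0,1} of the second source.
  const int Mask[8] = {0, 1, 8, 9, 4, 5, 12, 13};
  printf("imm = 0x%02x\n", laneLocalShufImmediate(Mask, 8, 4)); // 0x44
  return 0;
}

Because the upper lane reuses the same immediate bits, a 256-bit mask is presumably only encodable when both lanes request the same in-lane pattern, which is the per-lane constraint the merged isSHUFPMask has to enforce. Separately, the experimental call-preserved register mask operand is gated behind the new command-line option declared at the top of the file, so it would be exercised with something like llc -x86-use-regmask and stays off by default.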