diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 23910a350e8..2f14d790742 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12,17 +12,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-isel"
 #include "X86ISelLowering.h"
 #include "Utils/X86ShuffleDecode.h"
-#include "X86.h"
 #include "X86CallingConv.h"
 #include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/VariadicFunction.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -31,6 +31,7 @@
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -39,12 +40,10 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -53,6 +52,8 @@
 #include <cctype>
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-isel"
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 
 // Forward declarations.
@@ -85,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
 
   // If the input is a buildvector just emit a smaller one.
   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                                    ElemsPerChunk));
 
   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
@@ -180,7 +182,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
   bool is64Bit = Subtarget->is64Bit();
 
-  if (Subtarget->isTargetEnvMacho()) {
+  if (Subtarget->isTargetMacho()) {
     if (is64Bit)
       return new X86_64MachoTargetObjectFile();
     return new TargetLoweringObjectFileMachO();
@@ -190,7 +192,9 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
     return new X86LinuxTargetObjectFile();
   if (Subtarget->isTargetELF())
     return new TargetLoweringObjectFileELF();
-  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+  if (Subtarget->isTargetKnownWindowsMSVC())
+    return new X86WindowsTargetObjectFile();
+  if (Subtarget->isTargetCOFF())
     return new TargetLoweringObjectFileCOFF();
   llvm_unreachable("unknown subtarget type");
 }
@@ -249,7 +253,7 @@ void X86TargetLowering::resetOperationActions() {
       addBypassSlowDiv(64, 16);
   }
 
-  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
+  if (Subtarget->isTargetKnownWindowsMSVC()) {
     // Setup Windows compiler runtime calls.
     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
@@ -264,17 +268,17 @@ void X86TargetLowering::resetOperationActions() {
 
     // The _ftol2 runtime function has an unusual calling conv, which
     // is modeled by a special pseudo-instruction.
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
-    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
-    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
   }
 
   if (Subtarget->isTargetDarwin()) {
     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
     setUseUnderscoreLongJmp(false);
-  } else if (Subtarget->isTargetMingw()) {
+  } else if (Subtarget->isTargetWindowsGNU()) {
     // MS runtime is weird: it exports _setjmp, but longjmp!
     setUseUnderscoreSetJmp(true);
     setUseUnderscoreLongJmp(false);
@@ -504,7 +508,9 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
-  setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+  if (!Subtarget->hasMOVBE())
+    setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
 
   // These should be promoted to a larger select which is supported.
   setOperationAction(ISD::SELECT , MVT::i1 , Promote);
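An aside on the BSWAP hunk above: x86's BSWAP instruction only operates on 32- and 64-bit registers, which is why i16 byte swaps were expanded before, and why the patch keeps the expansion only when MOVBE is absent. A standalone C++ sketch of the expansion being avoided (illustrative only, not LLVM code):

    #include <cstdint>
    #include <cstdio>

    // Portable i16 byte swap: the pattern the legalizer falls back to when
    // neither BSWAP (too narrow) nor MOVBE (not available) can be used.
    uint16_t bswap16(uint16_t x) {
      return uint16_t((x << 8) | (x >> 8)); // exchange the two bytes
    }

    int main() { std::printf("0x%04x\n", bswap16(0x1234)); } // prints 0x3412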
@@ -632,15 +638,8 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                       MVT::i64 : MVT::i32, Custom);
-  else if (TM.Options.EnableSegmentedStacks)
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                       MVT::i64 : MVT::i32, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                       MVT::i64 : MVT::i32, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                     MVT::i64 : MVT::i32, Custom);
 
   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
     // f32 and f64 use SSE.
@@ -829,7 +828,9 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FRINT, VT, Expand);
     setOperationAction(ISD::FNEARBYINT, VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::SDIVREM, VT, Expand);
     setOperationAction(ISD::UDIVREM, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
@@ -941,6 +942,10 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::ADD, MVT::v2i64, Legal);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
     setOperationAction(ISD::SUB, MVT::v16i8, Legal);
     setOperationAction(ISD::SUB, MVT::v8i16, Legal);
     setOperationAction(ISD::SUB, MVT::v4i32, Legal);
@@ -1108,9 +1113,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SHL, MVT::v4i32, Custom);
 
     setOperationAction(ISD::SRA, MVT::v4i32, Custom);
-
-    setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
-    setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1151,9 +1153,12 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
     setOperationAction(ISD::FABS, MVT::v4f64, Custom);
 
-    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
-
+    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+    // even though v8i16 is a legal type.
+    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
     setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
     setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
@@ -1172,8 +1177,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SRA, MVT::v16i16, Custom);
     setOperationAction(ISD::SRA, MVT::v32i8, Custom);
 
-    setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
-
     setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
     setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
     setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
@@ -1226,9 +1229,12 @@ void X86TargetLowering::resetOperationActions() {
       setOperationAction(ISD::MUL, MVT::v16i16, Legal);
       // Don't lower v32i8 because there is no 128-bit byte mul
 
-      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+      setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+      setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+      setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
+      setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
 
-      setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
+      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
     } else {
       setOperationAction(ISD::ADD, MVT::v4i64, Custom);
       setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1306,9 +1312,15 @@ void X86TargetLowering::resetOperationActions() {
     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
 
+    addRegisterClass(MVT::i1,     &X86::VK1RegClass);
     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
 
+    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+    setOperationAction(ISD::SETCC, MVT::i1, Custom);
+    setOperationAction(ISD::XOR, MVT::i1, Legal);
+    setOperationAction(ISD::OR, MVT::i1, Legal);
+    setOperationAction(ISD::AND, MVT::i1, Legal);
     setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
     setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
     setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
@@ -1331,7 +1343,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
     setOperationAction(ISD::FMA, MVT::v8f64, Legal);
     setOperationAction(ISD::FMA, MVT::v16f32, Legal);
-    setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
 
     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
@@ -1346,17 +1357,20 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
     setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
     setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
 
-    setOperationAction(ISD::TRUNCATE, MVT::i1, Legal);
+    setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
@@ -1370,12 +1384,17 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
 
     setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
     setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
 
     setOperationAction(ISD::MUL, MVT::v8i64, Custom);
 
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
@@ -1458,6 +1477,8 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  if (!Subtarget->is64Bit())
+    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
@@ -1482,9 +1503,9 @@ void X86TargetLowering::resetOperationActions() {
 
   if (!Subtarget->is64Bit()) {
     // These libcalls are not available in 32-bit.
-    setLibcallName(RTLIB::SHL_I128, 0);
-    setLibcallName(RTLIB::SRL_I128, 0);
-    setLibcallName(RTLIB::SRA_I128, 0);
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
   }
 
   // Combine sin / cos into one node or libcall if possible.
@@ -1500,6 +1521,15 @@ void X86TargetLowering::resetOperationActions() {
     }
   }
 
+  if (Subtarget->isTargetWin64()) {
+    setOperationAction(ISD::SDIV, MVT::i128, Custom);
+    setOperationAction(ISD::UDIV, MVT::i128, Custom);
+    setOperationAction(ISD::SREM, MVT::i128, Custom);
+    setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+  }
+
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1548,14 +1578,13 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector())
-    return MVT::i8;
+    return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
 
-  const TargetMachine &TM = getTargetMachine();
-  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
+  if (Subtarget->hasAVX512())
     switch(VT.getVectorNumElements()) {
     case  8: return MVT::v8i1;
     case 16: return MVT::v16i1;
-  }
+    }
 
   return VT.changeVectorElementTypeToInteger();
 }
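The getSetCCResultType change above can be summarized outside of LLVM as a small decision function (a sketch with invented enum names, covering only the cases visible in the hunk):

    #include <cassert>

    enum class CmpTy { I1, I8, V8I1, V16I1, VIntSameWidth };

    // Mirror of the hunk's logic: scalar compares produce i1 on AVX-512
    // (k-registers), i8 otherwise; 8/16-element vectors produce mask types.
    CmpTy setCCResultType(bool hasAVX512, bool isVector, unsigned numElts) {
      if (!isVector)
        return hasAVX512 ? CmpTy::I1 : CmpTy::I8;
      if (hasAVX512 && numElts == 8)  return CmpTy::V8I1;
      if (hasAVX512 && numElts == 16) return CmpTy::V16I1;
      return CmpTy::VIntSameWidth; // changeVectorElementTypeToInteger()
    }

    int main() {
      assert(setCCResultType(true, false, 0) == CmpTy::I1);
      assert(setCCResultType(false, true, 4) == CmpTy::VIntSameWidth);
    }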
@@ -1661,7 +1690,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
 }
 
 bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                 unsigned,
+                                                 bool *Fast) const {
   if (Fast)
     *Fast = Subtarget->isUnalignedMemAccessFast();
   return true;
@@ -1721,7 +1752,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
 // FIXME: Why this routine is here? Move to RegInfo!
 std::pair<const TargetRegisterClass*, uint8_t>
 X86TargetLowering::findRepresentativeClass(MVT VT) const{
-  const TargetRegisterClass *RRC = 0;
+  const TargetRegisterClass *RRC = nullptr;
   uint8_t Cost = 1;
   switch (VT.SimpleTy) {
   default:
@@ -1789,8 +1820,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
-const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
-  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   return ScratchRegs;
 }
 
@@ -1832,6 +1863,9 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     else if (VA.getLocInfo() == CCValAssign::BCvt)
       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
 
+    assert(VA.getLocInfo() != CCValAssign::FPExt &&
+           "Unexpected FP-extend for return value.");
+
     // If this is x86-64, and we disabled SSE, we can't return FP values,
     // or SSE or MMX vectors.
     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
@@ -1886,7 +1920,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   // We saved the argument into a virtual register in the entry block,
   // so now we copy the value out and into %rax/%eax.
   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
-      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
     MachineFunction &MF = DAG.getMachineFunction();
     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
     unsigned Reg = FuncInfo->getSRetReturnReg();
@@ -1910,8 +1944,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(X86ISD::RET_FLAG, dl,
-                     MVT::Other, &RetOps[0], RetOps.size());
+  return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2175,7 +2208,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   bool Is64Bit = Subtarget->is64Bit();
-  bool IsWindows = Subtarget->isTargetWindows();
   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
 
   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
@@ -2222,6 +2254,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
         RC = &X86::VR128RegClass;
       else if (RegVT == MVT::x86mmx)
         RC = &X86::VR64RegClass;
+      else if (RegVT == MVT::i1)
+        RC = &X86::VK1RegClass;
       else if (RegVT == MVT::v8i1)
         RC = &X86::VK8RegClass;
       else if (RegVT == MVT::v16i1)
@@ -2270,7 +2304,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     // Save the argument into a virtual register so that we can access it
     // from the return points.
     if (MF.getFunction()->hasStructRetAttr() &&
-        (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+        (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
       X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
       unsigned Reg = FuncInfo->getSRetReturnReg();
       if (!Reg) {
@@ -2299,17 +2333,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
 
     // FIXME: We should really autogenerate these arrays
-    static const uint16_t GPR64ArgRegsWin64[] = {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
       X86::RCX, X86::RDX, X86::R8, X86::R9
     };
-    static const uint16_t GPR64ArgRegs64Bit[] = {
+    static const MCPhysReg GPR64ArgRegs64Bit[] = {
      X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
     };
-    static const uint16_t XMMArgRegs64Bit[] = {
+    static const MCPhysReg XMMArgRegs64Bit[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
-    const uint16_t *GPR64ArgRegs;
+    const MCPhysReg *GPR64ArgRegs;
     unsigned NumXMMRegs = 0;
 
     if (IsWin64) {
@@ -2403,13 +2437,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
           SaveXMMOps.push_back(Val);
         }
         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
-                                     MVT::Other,
-                                     &SaveXMMOps[0], SaveXMMOps.size()));
+                                     MVT::Other, SaveXMMOps));
       }
 
       if (!MemOps.empty())
-        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                            &MemOps[0], MemOps.size());
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
     }
   }
 
@@ -2420,7 +2452,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   } else {
     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
     // If this is an sret function, the return should pop the hidden pointer.
-    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+    if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+        !Subtarget->getTargetTriple().isOSMSVCRT() &&
         argsAreStructReturn(Ins) == StackStructReturn)
       FuncInfo->setBytesToPopOnReturn(4);
   }
@@ -2475,10 +2508,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
 
 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
 /// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
-                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
-                         unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, SDLoc dl) {
   // Store the return address to the appropriate stack slot.
   if (!FPDiff) return Chain;
   // Calculate the new stack slot for the return address.
@@ -2509,14 +2542,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   MachineFunction &MF = DAG.getMachineFunction();
   bool Is64Bit        = Subtarget->is64Bit();
   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
-  bool IsWindows      = Subtarget->isTargetWindows();
   StructReturnType SR = callIsStructReturn(Outs);
   bool IsSibcall      = false;
 
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
 
-  if (isTailCall) {
+  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+  if (IsMustTail) {
+    // Force this to be a tail call.  The verifier rules are enough to ensure
+    // that we can lower this successfully without moving the return address
+    // around.
+    isTailCall = true;
+  } else if (isTailCall) {
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                     isVarArg, SR != NotStructReturn,
@@ -2557,7 +2595,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   int FPDiff = 0;
-  if (isTailCall && !IsSibcall) {
+  if (isTailCall && !IsSibcall && !IsMustTail) {
     // Lower arguments at fp - stackoffset + fpdiff.
     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2570,9 +2608,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     X86Info->setTCReturnAddrDelta(FPDiff);
   }
 
+  unsigned NumBytesToPush = NumBytes;
+  unsigned NumBytesToPop = NumBytes;
+
+  // If we have an inalloca argument, all stack space has already been allocated
+  // for us and be right at the top of the stack.  We don't support multiple
+  // arguments passed in memory when using inalloca.
+  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+    NumBytesToPush = 0;
+    assert(ArgLocs.back().getLocMemOffset() == 0 &&
+           "an inalloca argument must be the only memory argument");
+  }
+
   if (!IsSibcall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                                 dl);
+    Chain = DAG.getCALLSEQ_START(
+        Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
 
   SDValue RetAddrFrIdx;
   // Load return address for tail calls.
@@ -2589,10 +2639,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // Skip inalloca arguments, they have already been written.
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (Flags.isInAlloca())
+      continue;
+
     CCValAssign &VA = ArgLocs[i];
     EVT RegVT = VA.getLocVT();
     SDValue Arg = OutVals[i];
-    ISD::ArgFlagsTy Flags = Outs[i].Flags;
     bool isByVal = Flags.isByVal();
 
     // Promote the value if needed.
@@ -2646,7 +2700,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
     } else if (!IsSibcall && (!isTailCall || isByVal)) {
       assert(VA.isMemLoc());
-      if (StackPtr.getNode() == 0)
+      if (!StackPtr.getNode())
         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                       getPointerTy());
       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -2655,8 +2709,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   if (Subtarget->isPICStyleGOT()) {
     // ELF / PIC requires GOT in the EBX register before function calls via PLT
@@ -2665,15 +2718,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
                                           DAG.getNode(X86ISD::GlobalBaseReg,
                                                       SDLoc(), getPointerTy())));
    } else {
-      // If we are tail calling a global or external symbol in GOT pic mode, we
-      // cannot use a direct jump, since that would make lazy dynamic linking
-      // impossible (see PR15086).  So pretend this is not a tail call, to
-      // prevent the optimization to a jump.
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX.  The value in ecx is used as target of
+      // the tail jump.  This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures.  Normally we would just put the
+      // address of GOT into ebx and then call target@PLT.  But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-      if ((G && !G->getGlobal()->hasHiddenVisibility() &&
-           !G->getGlobal()->hasProtectedVisibility()) ||
-          isa<ExternalSymbolSDNode>(Callee))
-        isTailCall = false;
+      if (G && !G->getGlobal()->hasHiddenVisibility() &&
+          !G->getGlobal()->hasProtectedVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
     }
   }
 
@@ -2687,7 +2746,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // registers used and is in the range 0 - 8 inclusive.
 
     // Count the number of XMM registers allocated.
-    static const uint16_t XMMArgRegs[] = {
+    static const MCPhysReg XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
@@ -2699,8 +2758,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
-  // For tail calls lower the arguments to the 'real' stack slot.
-  if (isTailCall) {
+  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
     // Force all the incoming stack arguments to be loaded from the stack
     // before any new outgoing arguments are stored to the stack, because the
     // outgoing stack slots may alias the incoming argument stack slots, and
@@ -2712,45 +2773,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
-    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
-      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-        CCValAssign &VA = ArgLocs[i];
-        if (VA.isRegLoc())
-          continue;
-        assert(VA.isMemLoc());
-        SDValue Arg = OutVals[i];
-        ISD::ArgFlagsTy Flags = Outs[i].Flags;
-        // Create frame index.
-        int32_t Offset = VA.getLocMemOffset()+FPDiff;
-        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
-        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
-        FIN = DAG.getFrameIndex(FI, getPointerTy());
-
-        if (Flags.isByVal()) {
-          // Copy relative to framepointer.
-          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
-          if (StackPtr.getNode() == 0)
-            StackPtr = DAG.getCopyFromReg(Chain, dl,
-                                          RegInfo->getStackRegister(),
-                                          getPointerTy());
-          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
-          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
-                                                           ArgChain,
-                                                           Flags, DAG, dl));
-        } else {
-          // Store relative to framepointer.
-          MemOpChains2.push_back(
-            DAG.getStore(ArgChain, dl, Arg, FIN,
-                         MachinePointerInfo::getFixedStack(FI),
-                         false, false, 0));
-        }
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isRegLoc())
+        continue;
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[i];
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+      // Skip inalloca arguments.  They don't require any work.
+      if (Flags.isInAlloca())
+        continue;
+      // Create frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                        RegInfo->getStackRegister(),
+                                        getPointerTy());
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(
+            DAG.getStore(ArgChain, dl, Arg, FIN,
+                         MachinePointerInfo::getFixedStack(FI),
+                         false, false, 0));
       }
     }
 
     if (!MemOpChains2.empty())
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                          &MemOpChains2[0], MemOpChains2.size());
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
 
     // Store the return address to the appropriate stack slot.
     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
@@ -2781,7 +2842,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // We should use extra load for direct calls to dllimported functions in
     // non-JIT mode.
     const GlobalValue *GV = G->getGlobal();
-    if (!GV->hasDLLImportLinkage()) {
+    if (!GV->hasDLLImportStorageClass()) {
       unsigned char OpFlags = 0;
      bool ExtraLoad = false;
       unsigned WrapperKind = ISD::DELETED_NODE;
@@ -2853,8 +2914,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SmallVector<SDValue, 8> Ops;
 
   if (!IsSibcall && isTailCall) {
-    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                               DAG.getIntPtrConstant(0, true), InFlag, dl);
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytesToPop, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, dl);
     InFlag = Chain.getValue(1);
   }
 
@@ -2886,32 +2948,33 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // This isn't right, although it's probably harmless on x86; liveouts
     // should be computed from returns not tail calls.  Consider a void
     // function making a tail call to a function returning int.
-    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   }
 
-  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
-  unsigned NumBytesForCalleeToPush;
+  unsigned NumBytesForCalleeToPop;
   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       getTargetMachine().Options.GuaranteedTailCallOpt))
-    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
-  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+           !Subtarget->getTargetTriple().isOSMSVCRT() &&
           SR == StackStructReturn)
     // If this is a call to a struct-return function, the callee
     // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
-    NumBytesForCalleeToPush = 4;
+    NumBytesForCalleeToPop = 4;
   else
-    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
+    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
 
   // Returns a flag for retval copy to use.
   if (!IsSibcall) {
     Chain = DAG.getCALLSEQ_END(Chain,
-                               DAG.getIntPtrConstant(NumBytes, true),
-                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+                               DAG.getIntPtrConstant(NumBytesToPop, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPop,
                                                      true),
                                InFlag, dl);
     InFlag = Chain.getValue(1);
@@ -3086,9 +3149,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (isCalleeStructRet || isCallerStructRet)
     return false;
 
-  // An stdcall caller is expected to clean up its arguments; the callee
-  // isn't going to do that.
-  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
+  // An stdcall/thiscall caller is expected to clean up its arguments; the
+  // callee isn't going to do that.
+  // FIXME: this is more restrictive than needed. We could produce a tailcall
+  // when the stack adjustment matches. For example, with a thiscall that takes
+  // only one argument.
+  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
+                   CallerCC == CallingConv::X86_ThisCall))
     return false;
 
   // Do not sibcall optimize vararg calls unless all arguments are passed via
@@ -3409,6 +3476,24 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
   }
 }
 
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+  switch (X86CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case X86::COND_E:     return true;
+  case X86::COND_G:     return false;
+  case X86::COND_GE:    return false;
+  case X86::COND_L:     return false;
+  case X86::COND_LE:    return false;
+  case X86::COND_NE:    return true;
+  case X86::COND_B:     return true;
+  case X86::COND_A:     return true;
+  case X86::COND_BE:    return true;
+  case X86::COND_AE:    return true;
+  }
+  llvm_unreachable("covered switch fell through?!");
+}
+
 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
 /// specific condition code, returning the condition code and the LHS/RHS of the
 /// comparison to make.
@@ -3527,6 +3612,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   return false;
 }
 
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0 || BitSize > 64)
+    return false;
+  return true;
+}
+
 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
 /// the specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -3848,6 +3945,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   return true;
 }
 
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+    return false;
+
+  unsigned CorrectPosV1 = 0;
+  unsigned CorrectPosV2 = 0;
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+
+  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+    // We have 3 elements from one vector, and one from another.
+    return true;
+
+  return false;
+}
+
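The isINSERTPSMask predicate added above reduces to counting elements that sit in their natural position in each source; a self-contained sketch of the same test (illustrative, not the LLVM API):

    #include <array>
    #include <cstdio>

    // Mask values 0-3 select from V1, 4-7 from V2 (shufflevector convention).
    bool isInsertPSMask(const std::array<int, 4> &mask) {
      unsigned inPlaceV1 = 0, inPlaceV2 = 0;
      for (int i = 0; i != 4; ++i) {
        if (mask[i] == i) ++inPlaceV1;          // element kept from V1
        else if (mask[i] == i + 4) ++inPlaceV2; // element kept from V2
      }
      return inPlaceV1 == 3 || inPlaceV2 == 3;  // exactly one odd element out
    }

    int main() {
      std::printf("%d\n", isInsertPSMask({0, 5, 2, 3})); // 1: one elt from V2
      std::printf("%d\n", isInsertPSMask({4, 5, 2, 3})); // 0: two elts from V2
    }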
 //
 // Some special combinations that can be optimized.
 //
@@ -4067,6 +4187,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   return true;
 }
 
+// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
+// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+  if (!VT.is512BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts/2;
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+      *Imm = 1;
+      return true;
+    }
+  }
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+      *Imm = 0;
+      return true;
+    }
+  }
+  return false;
+}
+
 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSS,
 /// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -4598,7 +4741,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
 /// isScalarLoadToVector - Returns true if the node is a scalar load that
 /// is promoted to a vector. It also returns the LoadSDNode by reference if
 /// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
     return false;
   N = N->getOperand(0).getNode();
@@ -4724,21 +4867,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
     if (Subtarget->hasInt256()) { // AVX2
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
     } else {
       // 256-bit logic and arithmetic instructions in AVX are all
       // floating-point, no support for integer ops. Emit fp zeroed vectors.
       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
     }
   } else if (VT.is512BitVector()) { // AVX-512
     SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+  } else if (VT.getScalarType() == MVT::i1) {
+    assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   } else
     llvm_unreachable("Unexpected vector type");
 
@@ -4758,8 +4904,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   if (VT.is256BitVector()) {
     if (HasInt256) { // AVX2
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
     } else { // AVX
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
@@ -5221,7 +5366,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     return SDValue();
 
   SDLoc dl(Op);
-  SDValue V(0, 0);
+  SDValue V;
   bool First = true;
   for (unsigned i = 0; i < 16; ++i) {
     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
@@ -5234,7 +5379,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     }
 
     if ((i & 1) != 0) {
-      SDValue ThisElt(0, 0), LastElt(0, 0);
+      SDValue ThisElt, LastElt;
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
       if (LastIsNonZero) {
         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
@@ -5269,7 +5414,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
     return SDValue();
 
   SDLoc dl(Op);
-  SDValue V(0, 0);
+  SDValue V;
   bool First = true;
   for (unsigned i = 0; i < 8; ++i) {
     bool isNonZero = (NonZeros & (1 << i)) != 0;
@@ -5389,11 +5534,12 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
 /// There's even a handy isZeroNode for that purpose.
 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
-                                        SDLoc &DL, SelectionDAG &DAG) {
+                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
   EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
-  LoadSDNode *LDBase = NULL;
+  LoadSDNode *LDBase = nullptr;
   unsigned LastLoadedElt = -1U;
 
   // For each element in the initializer, see if we've found a load or an undef.
@@ -5425,7 +5571,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   // load of the entire vector width starting at the base pointer.  If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+
+    if (isAfterLegalize &&
+        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
     SDValue NewLd = SDValue();
+
     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                           LDBase->getPointerInfo(),
@@ -5452,8 +5604,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
     SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
-                                array_lengthof(Ops), MVT::i64,
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
                                 LDBase->getPointerInfo(),
                                 LDBase->getAlignment(),
                                 false/*isVolatile*/, true/*ReadMem*/,
@@ -5568,7 +5719,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
       unsigned ScalarSize = CVT.getSizeInBits();
 
       if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
-        const Constant *C = 0;
+        const Constant *C = nullptr;
         if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
           C = CI->getConstantIntValue();
         else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
@@ -5613,6 +5764,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   return SDValue();
 }
 
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+                                         SDValue ExtIdx) {
+  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+    return Idx;
+
+  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+  // lowered this:
+  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+  // to:
+  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
+  //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
+  //                            undef)
+  //                        Constant<0>)
+  // In this case the vector is the extract_subvector expression and the index
+  // is 2, as specified by the shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+  SDValue ShuffleVec = SVOp->getOperand(0);
+  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+  assert(ShuffleVecVT.getVectorElementType() ==
+         ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+  int ShuffleIdx = SVOp->getMaskElt(Idx);
+  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+    ExtractedFromVec = ShuffleVec;
+    return ShuffleIdx;
+  }
+  return Idx;
+}
+
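The rewrite guarded by this region relies on the elements being loads of base + i * sizeof(elt); a standalone sketch of that invariant (simplified — the real code also checks chains, extensions, and volatility):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct EltLoad { uintptr_t base; uint64_t offset; }; // one scalar load

    // True when all elements read consecutive memory from one base pointer,
    // so a single wide vector load can replace them.
    bool isConsecutive(const std::vector<EltLoad> &elts, unsigned eltSize) {
      for (size_t i = 0; i != elts.size(); ++i)
        if (elts[i].base != elts[0].base || elts[i].offset != i * eltSize)
          return false;
      return true;
    }

    int main() {
      std::vector<EltLoad> v{{0x1000, 0}, {0x1000, 4}, {0x1000, 8}, {0x1000, 12}};
      std::printf("%d\n", isConsecutive(v, 4)); // 1
    }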
 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
 
@@ -5646,34 +5832,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
 
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
+
     // Quit if extracted from vector of different type.
     if (ExtractedFromVec.getValueType() != VT)
       return SDValue();
 
-    // Quit if non-constant index.
-    if (!isa<ConstantSDNode>(ExtIdx))
-      return SDValue();
-
-    if (VecIn1.getNode() == 0)
+    if (!VecIn1.getNode())
       VecIn1 = ExtractedFromVec;
     else if (VecIn1 != ExtractedFromVec) {
-      if (VecIn2.getNode() == 0)
+      if (!VecIn2.getNode())
         VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
         // Quit if more than 2 vectors to shuffle
         return SDValue();
     }
 
-    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
-
     if (ExtractedFromVec == VecIn1)
       Mask[i] = Idx;
     else if (ExtractedFromVec == VecIn2)
       Mask[i] = Idx + NumElems;
   }
 
-  if (VecIn1.getNode() == 0)
+  if (!VecIn1.getNode())
     return SDValue();
 
   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
@@ -5698,32 +5882,38 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   }
 
   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   }
 
   bool AllContants = true;
   uint64_t Immediate = 0;
+  int NonConstIdx = -1;
+  bool IsSplat = true;
+  unsigned NumNonConsts = 0;
+  unsigned NumConsts = 0;
   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
     SDValue In = Op.getOperand(idx);
     if (In.getOpcode() == ISD::UNDEF)
       continue;
     if (!isa<ConstantSDNode>(In)) {
       AllContants = false;
-      break;
+      NonConstIdx = idx;
+      NumNonConsts++;
     }
-    if (cast<ConstantSDNode>(In)->getZExtValue())
+    else {
+      NumConsts++;
+      if (cast<ConstantSDNode>(In)->getZExtValue())
       Immediate |= (1ULL << idx);
+    }
+    if (In != Op.getOperand(0))
+      IsSplat = false;
   }
 
   if (AllContants) {
@@ -5733,63 +5923,32 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                        DAG.getIntPtrConstant(0));
   }
 
-  // Splat vector (with undefs)
-  SDValue In = Op.getOperand(0);
-  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
-    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
-      llvm_unreachable("Unsupported predicate operation");
-  }
-
-  SDValue EFLAGS, X86CC;
-  if (In.getOpcode() == ISD::SETCC) {
-    SDValue Op0 = In.getOperand(0);
-    SDValue Op1 = In.getOperand(1);
-    ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
-    bool isFP = Op1.getValueType().isFloatingPoint();
-    unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
-
-    assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
-
-    X86CC = DAG.getConstant(X86CCVal, MVT::i8);
-    EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
-    EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
-  } else if (In.getOpcode() == X86ISD::SETCC) {
-    X86CC = In.getOperand(0);
-    EFLAGS = In.getOperand(1);
-  } else {
-    // The algorithm:
-    //   Bit1 = In & 0x1
-    //   if (Bit1 != 0)
-    //     ZF = 0
-    //   else
-    //     ZF = 1
-    //   if (ZF == 0)
-    //     res = allOnes ### CMOVNE -1, %res
-    //   else
-    //     res = allZero
-    MVT InVT = In.getSimpleValueType();
-    SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
-    EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
-    X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-  }
-
-  if (VT == MVT::v16i1) {
-    SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
-    SDValue Cst0 = DAG.getConstant(0, MVT::i16);
-    SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
-                                 Cst0, Cst1, X86CC, EFLAGS);
-    return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
-  }
-
-  if (VT == MVT::v8i1) {
-    SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
-    SDValue Cst0 = DAG.getConstant(0, MVT::i32);
-    SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
-                                 Cst0, Cst1, X86CC, EFLAGS);
-    CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
-    return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
-  }
-  llvm_unreachable("Unsupported predicate operation");
+  if (NumNonConsts == 1 && NonConstIdx != 0) {
+    SDValue DstVec;
+    if (NumConsts) {
+      SDValue VecAsImm = DAG.getConstant(Immediate,
+                                         MVT::getIntegerVT(VT.getSizeInBits()));
+      DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
+    }
+    else
+      DstVec = DAG.getUNDEF(VT);
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+                       Op.getOperand(NonConstIdx),
+                       DAG.getIntPtrConstant(NonConstIdx));
+  }
+  if (!IsSplat && (NonConstIdx != 0))
+    llvm_unreachable("Unsupported BUILD_VECTOR operation");
+  MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
+  SDValue Select;
+  if (IsSplat)
+    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+                         DAG.getConstant(-1, SelectVT),
+                         DAG.getConstant(0, SelectVT));
+  else
+    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+                         DAG.getConstant((Immediate | 1), SelectVT),
+                         DAG.getConstant(Immediate, SelectVT));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Select);
 }
 
 SDValue
@@ -5982,20 +6141,23 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   // For AVX-length vectors, build the individual 128-bit pieces and use
   // shuffles to put them in place.
-  if (VT.is256BitVector()) {
-    SmallVector<SDValue, 32> V;
+  if (VT.is256BitVector() || VT.is512BitVector()) {
+    SmallVector<SDValue, 64> V;
     for (unsigned i = 0; i != NumElems; ++i)
      V.push_back(Op.getOperand(i));
 
     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
 
     // Build both the lower and upper subvector.
-    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
-    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
-                                NumElems/2);
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[0], NumElems/2));
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[NumElems / 2], NumElems/2));
 
     // Recreate the wider vector with the lower and upper part.
-    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+    if (VT.is256BitVector())
+      return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+    return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   }
 
   // Let legalizer expand 2-wide build_vectors.
@@ -6069,7 +6231,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       V[i] = Op.getOperand(i);
 
     // Check for elements which are consecutive loads.
-    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
     if (LD.getNode())
       return LD;
 
@@ -6228,6 +6390,58 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
 }
 
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls on a different 128-bit lane than \p OutputIdx.
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+                                     unsigned OutputIdx) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
+
+/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
+/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+                         SelectionDAG &DAG) {
+  MVT VT = V1.getSimpleValueType();
+  assert(VT.is128BitVector() || VT.is256BitVector());
+
+  MVT EltVT = VT.getVectorElementType();
+  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 32> PshufbMask;
+  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+    int InputIdx = MaskVals[OutputIdx];
+    unsigned InputByteIdx;
+
+    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+      InputByteIdx = 0x80;
+    else {
+      // Cross lane is not allowed.
+      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+        return SDValue();
+      InputByteIdx = InputIdx * EltSizeInBytes;
+      // Index is an byte offset within the 128-bit lane.
+      InputByteIdx &= 0xf;
+    }
+
+    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
+      PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
+      if (InputByteIdx != 0x80)
+        ++InputByteIdx;
+    }
+  }
+
+  MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
+  if (ShufVT != VT)
+    V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
+  return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
+}
+
 // v8i16 shuffles - Prefer shuffles in the following order:
 // 1. [all]   pshuflw, pshufhw, optional move
 // 2. [ssse3] 1 x pshufb
@@ -6245,8 +6459,12 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
   // Determine if more than 1 of the words in each of the low and high quadwords
   // of the result come from the same quadword of one of the two inputs.  Undef
   // mask values count as coming from any quadword, for better codegen.
+  //
+  // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
+  // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
   unsigned LoQuad[] = { 0, 0, 0, 0 };
   unsigned HiQuad[] = { 0, 0, 0, 0 };
+  // Indices of quads used.
   std::bitset<4> InputQuads;
   for (unsigned i = 0; i < 8; ++i) {
     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
@@ -6283,7 +6501,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
 
   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
   // of the two input vectors, shuffle them into one input vector so only a
-  // single pshufb instruction is necessary. If There are more than 2 input
+  // single pshufb instruction is necessary. If there are more than 2 input
   // quads, disable the next transformation since it does not help SSSE3.
   bool V1Used = InputQuads[0] || InputQuads[1];
   bool V2Used = InputQuads[2] || InputQuads[3];
@@ -6375,34 +6593,14 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
   // mask, and elements that come from V1 in the V2 mask, so that the two
   // results can be OR'd together.
   bool TwoInputs = V1Used && V2Used;
-  for (unsigned i = 0; i != 8; ++i) {
-    int EltIdx = MaskVals[i] * 2;
-    int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
-    int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
-    pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
-    pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
-  }
-  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
-  V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
-                   DAG.getNode(ISD::BUILD_VECTOR, dl,
-                               MVT::v16i8, &pshufbMask[0], 16));
+  V1 = getPSHUFB(MaskVals, V1, dl, DAG);
   if (!TwoInputs)
     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
 
   // Calculate the shuffle mask for the second input, shuffle it, and
   // OR it with the first shuffled input.
-  pshufbMask.clear();
-  for (unsigned i = 0; i != 8; ++i) {
-    int EltIdx = MaskVals[i] * 2;
-    int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
-    int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
-    pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
-    pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
-  }
-  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
-  V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
-                   DAG.getNode(ISD::BUILD_VECTOR, dl,
-                               MVT::v16i8, &pshufbMask[0], 16));
+  CommuteVectorShuffleMask(MaskVals, 8);
+  V2 = getPSHUFB(MaskVals, V2, dl, DAG);
   V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
 }
@@ -6484,6 +6682,25 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
   return NewV;
 }
 
+/// \brief v16i16 shuffles
+///
+/// FIXME: We only support generation of a single pshufb currently.  We can
+/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
+/// well (e.g 2 x pshufb + 1 x por).
+static SDValue
+LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+
+  if (V2.getOpcode() != ISD::UNDEF)
+    return SDValue();
+
+  SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+  return getPSHUFB(MaskVals, V1, dl, DAG);
+}
+
 // v16i8 shuffles - Prefer shuffles in the following order:
 // 1. [ssse3] 1 x pshufb
 // 2. [ssse3] 2 x pshufb + 1 x por
@@ -6524,7 +6741,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
     }
     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, &pshufbMask[0], 16));
+                                 MVT::v16i8, pshufbMask));
 
     // As PSHUFB will zero elements with negative indices, it's safe to ignore
     // the 2nd operand if it's undefined or zero.
@@ -6542,7 +6759,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
     }
     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, &pshufbMask[0], 16));
+                                 MVT::v16i8, pshufbMask));
     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   }
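The control bytes getPSHUFB emits can be computed standalone; a sketch of the same construction (the real code builds SDValue constants and additionally rejects cross-lane moves):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Expand an element-level shuffle mask into PSHUFB byte selectors for one
    // 16-byte lane; 0x80 makes PSHUFB write a zero byte.
    std::vector<uint8_t> pshufbBytes(const std::vector<int> &mask,
                                     unsigned eltBytes) {
      std::vector<uint8_t> bytes;
      for (unsigned out = 0; out != mask.size(); ++out) {
        int in = mask[out];
        for (unsigned j = 0; j != eltBytes; ++j)
          bytes.push_back((in < 0 || unsigned(in) >= mask.size())
                              ? 0x80                         // zero the byte
                              : uint8_t(in * eltBytes + j)); // byte of elt `in`
      }
      return bytes;
    }

    int main() {
      // v8i16 mask <0,0,1,1,u,2,3,3> becomes 16 control bytes.
      for (uint8_t b : pshufbBytes({0, 0, 1, 1, -1, 2, 3, 3}, 2))
        std::printf("%02x ", b);
      std::printf("\n");
    }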
@@ -6642,22 +6859,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
     CommuteVectorShuffleMask(MaskVals, 32);
     V1 = V2;
   }
-  SmallVector<SDValue, 32> pshufbMask;
-  for (unsigned i = 0; i != 32; i++) {
-    int EltIdx = MaskVals[i];
-    if (EltIdx < 0 || EltIdx >= 32)
-      EltIdx = 0x80;
-    else {
-      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
-        // Cross lane is not allowed.
-        return SDValue();
-      EltIdx &= 0xf;
-    }
-    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
-  }
-  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v32i8, &pshufbMask[0], 32));
+  return getPSHUFB(MaskVals, V1, dl, DAG);
 }
 
 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
@@ -6709,7 +6911,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT,
                             SDValue SrcOp, SelectionDAG &DAG,
                             const X86Subtarget *Subtarget, SDLoc dl) {
   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    LoadSDNode *LD = NULL;
+    LoadSDNode *LD = nullptr;
     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
     if (!LD) {
@@ -6828,8 +7030,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
       }
 
       // Construct the output using a BUILD_VECTOR.
-      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
-                              SVOps.size());
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
     } else if (InputUsed[0] < 0) {
       // No input vectors were used! The result is undefined.
       Output[l] = DAG.getUNDEF(NVT);
@@ -7111,6 +7312,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transfering an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; });
+
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; }) -
+                Mask.begin();
+  } else {
+    From = V2;
+    To = V1;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i >= 4; }) -
+                Mask.begin();
+  }
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    SDValue Addr = From.getOperand(1);
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
+                                    Addr.getSimpleValueType()));
+
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    SDValue NewLoad =
+        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, + InsertpsMask); + } else { // EVT == MVT::i32 + // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT + // instruction, to match the PINSRD instruction, which loads an i32 to a + // certain vector element. + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, + DAG.getConstant(DestIndex, MVT::i32)); + } + } + + // Vector-element-to-vector + unsigned SrcIndex = Mask[DestIndex] % 4; + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); +} + // Reduce a vector shuffle to zext. static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -7281,7 +7560,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); - assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); + // When we create a shuffle node we put the UNDEF node to second operand, + // but in some cases the first operand may be transformed to UNDEF. + // In this case we should just commute the node. + if (V1IsUndef) + return CommuteVectorShuffle(SVOp, DAG); // Vector shuffle lowering takes 3 steps: // @@ -7450,7 +7733,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { CommuteVectorShuffleMask(M, NumElems); std::swap(V1, V2); std::swap(V1IsSplat, V2IsSplat); - Commuted = false; if (isUNPCKLMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); @@ -7510,6 +7792,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShuffleSHUFImmediate(SVOp), DAG); } + unsigned Idx; + if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) + return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), + Idx*(NumElems/2), DAG, dl); + // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, @@ -7519,6 +7806,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; + if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) + return getINSERTPS(SVOp, dl, DAG); + unsigned Imm8; if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); @@ -7532,14 +7822,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { permclMask.push_back(DAG.getConstant((M[i]>=0) ? 
M[i] : 0, MaskEltVT)); } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, - &permclMask[0], NumElems); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); if (V2IsUndef) // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 return DAG.getNode(X86ISD::VPERMV, dl, VT, DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); - return DAG.getNode(X86ISD::VPERMV3, dl, VT, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2); + return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2); } //===--------------------------------------------------------------------===// @@ -7555,6 +7844,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return NewOp; } + if (VT == MVT::v16i16 && Subtarget->hasInt256()) { + SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG); + if (NewOp.getNode()) + return NewOp; + } + if (VT == MVT::v16i8) { SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); if (NewOp.getNode()) @@ -7641,6 +7936,39 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Extract one bit from mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDLoc dl(Vec); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + MVT EltVT = Op.getSimpleValueType(); + + assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + + // variable index can't be handled in mask registers, + // extend vector to VR512 + if (!isa(Idx)) { + MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtVT.getVectorElementType(), Ext, Idx); + return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + } + + unsigned IdxVal = cast(Idx)->getZExtValue(); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + unsigned MaxSift = rc->getSize()*8 - 1; + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, + DAG.getIntPtrConstant(0)); +} + SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { @@ -7648,6 +7976,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); + + if (Op.getSimpleValueType() == MVT::i1) + return ExtractBitFromMaskVector(Op, DAG); + if (!isa(Idx)) { if (VecVT.is512BitVector() || (VecVT.is256BitVector() && Subtarget->hasInt256() && @@ -7804,10 +8136,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32);
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+                                DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+                                DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt),
+                                Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                       DAG.getConstant(IdxVal, MVT::i8));
+  const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  unsigned MaxSift = rc->getSize()*8 - 1;
+  EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                         DAG.getConstant(MaxSift, MVT::i8));
+  EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+                         DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}

 SDValue
 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
+
+  if (EltVT == MVT::i1)
+    return InsertBitToMaskVector(Op, DAG);

   SDLoc dl(Op);
   SDValue N0 = Op.getOperand(0);
@@ -8152,10 +8521,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,

   if (InFlag) {
     SDValue Ops[] = { Chain, TGA, *InFlag };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   } else {
     SDValue Ops[] = { Chain, TGA };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   }

   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -8183,7 +8552,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
 static SDValue
 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                 const EVT PtrVT) {
-  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                     X86::RAX, X86II::MO_TLSGD);
 }

@@ -8200,7 +8569,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   SDValue Base;
   if (is64Bit) {
-    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   } else {
     SDValue InFlag;
@@ -8339,7 +8708,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue Args[] = { Chain, Offset };
-    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);

     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -8352,7 +8721,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
                        Chain.getValue(1));
   }

-  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
+  if (Subtarget->isTargetKnownWindowsMSVC() ||
+      Subtarget->isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -8367,7 +8737,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      GV = GA->resolveAliasedGlobal(false);
+      GV = GA->getAliasedGlobal();
    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();
@@ -8380,13 +8750,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
                       : Type::getInt32PtrTy(*DAG.getContext(), 257));

-    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
-      (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
-        DAG.getExternalSymbol("_tls_array", getPointerTy()));
+    SDValue TlsArray =
+        Subtarget->is64Bit()
+            ? DAG.getIntPtrConstant(0x58)
+            : (Subtarget->isTargetWindowsGNU()
+                   ? DAG.getIntPtrConstant(0x2C)
+                   : DAG.getExternalSymbol("_tls_array", getPointerTy()));

-    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
-                                        MachinePointerInfo(Ptr),
-                                        false, false, false, 0);
+    SDValue ThreadPointer =
+        DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+                    MachinePointerInfo(Ptr), false, false, false, 0);

    // Load the _tls_index variable
    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
@@ -8422,15 +8795,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
 /// and take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
   unsigned VTBits = VT.getSizeInBits();
   SDLoc dl(Op);
   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes do not. Insert an AND to be safe; it's optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, MVT::i8));
   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                      DAG.getConstant(VTBits - 1, MVT::i8))
                        : DAG.getConstant(0, VT);
@@ -8438,12 +8816,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   SDValue Tmp2, Tmp3;
   if (Op.getOpcode() == ISD::SHL_PARTS) {
     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   } else {
     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
-    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   }

+  // If the shift amount is larger than or equal to the width of a part, we
+  // can't rely on the results of shld/shrd. Insert a test and select the
+  // appropriate values for large shift amounts.
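  // ------------------------------------------------------------------
  // Editorial aside, not part of the patch: a standalone scalar model of
  // the SHL_PARTS lowering above, assuming 32-bit parts. SHLD supplies the
  // small-shift value, the AND with VTBits-1 keeps the plain SHL defined,
  // and the CMOV on bit 5 of the amount (the test below) picks the
  // large-shift result. The helper name shl64_parts is invented.
  //
  //   static void shl64_parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
  //                           uint32_t &OutLo, uint32_t &OutHi) {
  //     unsigned Safe = Amt & 31;                                 // SafeShAmt
  //     uint32_t Tmp2 = Safe ? (Hi << Safe) | (Lo >> (32 - Safe))
  //                          : Hi;                                // SHLD
  //     uint32_t Tmp3 = Lo << Safe;                               // SHL
  //     if (Amt & 32) { OutHi = Tmp3; OutLo = 0;    }             // CMOV, large
  //     else          { OutHi = Tmp2; OutLo = Tmp3; }             // CMOV, small
  //   }
  // ------------------------------------------------------------------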
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, @@ -8455,25 +8836,25 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - EVT SrcVT = Op.getOperand(0).getValueType(); + MVT SrcVT = Op.getOperand(0).getSimpleValueType(); if (SrcVT.isVector()) return SDValue(); - assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && + assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as @@ -8526,8 +8907,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, - Tys, Ops, array_lengthof(Ops), - SrcVT, MMO); + Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); @@ -8550,8 +8930,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, array_lengthof(Ops), - Op.getValueType(), MMO); + Ops, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, false, 0); @@ -8677,15 +9056,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); - EVT SVT = N0.getValueType(); + MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || SVT == MVT::v8i8 || SVT == MVT::v8i16) && "Custom UINT_TO_FP is not supported!"); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - SVT.getVectorNumElements()); + MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } @@ -8704,8 +9082,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); - EVT SrcVT = N0.getValueType(); - EVT DstVT = Op.getValueType(); + MVT SrcVT = N0.getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) @@ -8747,7 +9125,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - array_lengthof(Ops), MVT::i64, MMO); + MVT::i64, 
MMO); APInt FF(32, 0x5F800000ULL); @@ -8840,8 +9218,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, - array_lengthof(Ops), DstTy, MMO); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -8855,8 +9232,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), DstTy, - MMO); + Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, @@ -8868,8 +9244,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) - : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) + : DAG.getMergeValues(Ops, DL); return std::make_pair(pair, SDValue()); } } @@ -8900,7 +9276,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, return SDValue(); if (Subtarget->hasInt256()) - return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In); + return DAG.getNode(X86ISD::VZEXT, dl, VT, In); SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); SDValue Undef = DAG.getUNDEF(InVT); @@ -8919,9 +9295,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op->getValueType(0).getSimpleVT(); + MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); - MVT InVT = In.getValueType().getSimpleVT(); + MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned int NumElts = VT.getVectorNumElements(); if (NumElts != 8 && NumElts != 16) @@ -8985,6 +9361,18 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); + + if (VT == MVT::i1) { + assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && + "Invalid scalar TRUNCATE operation"); + if (InVT == MVT::i32) + return SDValue(); + if (InVT.getSizeInBits() == 64) + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In); + else if (InVT.getSizeInBits() < 32) + In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); + return DAG.getNode(ISD::TRUNCATE, DL, VT, In); + } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); @@ -9000,6 +9388,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; } + SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); const Constant *C = (dyn_cast(Cst))->getConstantIntValue(); SDValue CP = DAG.getConstantPool(C, getPointerTy()); @@ -9023,24 +9412,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0)); } - // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS. 
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2)); - OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); - - // The PSHUFD mask: - static const int ShufMask1[] = {0, 2, 0, 0}; - SDValue Undef = DAG.getUNDEF(VT); - OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1); - OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1); - - // The MOVLHPS mask: - static const int ShufMask2[] = {0, 1, 4, 5}; - return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2); + static const int ShufMask[] = {0, 2, 4, 6}; + return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { @@ -9061,8 +9440,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, - &pshufbMask[0], 32); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); @@ -9107,8 +9485,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - NumElems * 2); + MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask @@ -9123,20 +9500,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); - if (VT.isVector()) { - if (VT == MVT::v8i16) - return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT, - DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op), - MVT::v8i32, Op.getOperand(0))); - return SDValue(); - } + assert(!Op.getSimpleValueType().isVector()); std::pair Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (FIST.getNode() == 0) return Op; + if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. 
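For reference, the v4i64 -> v4i32 path above now bitcasts the two 128-bit halves to v4i32 and emits a single shuffle <0,2,4,6> instead of the old PSHUFD+MOVLHPS pair. A minimal standalone model of why that shuffle is a truncate (editorial sketch; assumes little-endian lane order as on x86, and the helper names are invented):

#include <cstdint>
#include <cstring>

// Reference semantics: truncate each i64 lane to i32.
static void trunc_ref(const uint64_t In[4], uint32_t Out[4]) {
  for (int i = 0; i < 4; ++i)
    Out[i] = static_cast<uint32_t>(In[i]);
}

// What the lowering does: bitcast to eight i32 lanes, keep the even ones.
static void trunc_shuffle(const uint64_t In[4], uint32_t Out[4]) {
  uint32_t Lanes[8];
  std::memcpy(Lanes, In, sizeof Lanes);   // bitcast v4i64 -> v8i32
  static const int ShufMask[4] = {0, 2, 4, 6};
  for (int i = 0; i < 4; ++i)
    Out[i] = Lanes[ShufMask[i]];          // shuffle <0,2,4,6>
}

On a little-endian host the two functions agree for every input, which is the equivalence the new lowering relies on.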
@@ -9178,7 +9548,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -9196,7 +9566,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, APInt(32, ~(1U << 31)))); C = ConstantVector::getSplat(NumElts, C); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), @@ -9212,7 +9583,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } -SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -9230,7 +9601,8 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), @@ -9247,7 +9619,8 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } -SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Context = DAG.getContext(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -9283,7 +9656,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); @@ -9316,7 +9689,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } C = ConstantVector::get(CV); - CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); @@ -9354,6 +9727,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SmallVector Opnds; DenseMap VecInMap; + SmallVector VecIns; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to @@ -9393,6 +9767,7 @@ static SDValue LowerVectorAllZeroTest(SDValue 
Op, const X86Subtarget *Subtarget,
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }
@@ -9401,14 +9776,12 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
-  SmallVector<SDValue, 8> VecIns;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
-    VecIns.push_back(I->first);
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
@@ -9430,11 +9803,33 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                     VecIns.back(), VecIns.back());
}

+/// \brief Return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+       ++UI) {
+    SDNode *User = *UI;
+    unsigned UOpNo = UI.getOperandNo();
+    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look past the truncate.
+      UOpNo = User->use_begin().getOperandNo();
+      User = *User->use_begin();
+    }
+
+    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+      return true;
+  }
+  return false;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                                    SelectionDAG &DAG) const {
-  SDLoc dl(Op);
+  if (Op.getValueType() == MVT::i1)
+    // KORTEST instruction should be selected
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
@@ -9452,15 +9847,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
    NeedOF = true;
    break;
  }
-
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
-  if (Op.getResNo() != 0 || NeedOF || NeedCF)
+  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
+    //if (Op.getValueType() == MVT::i1)
+    //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
+    //                     DAG.getConstant(0, MVT::i1));
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));
-
+  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;
@@ -9529,31 +9926,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
-  case ISD::AND: {
-    // If the primary and result isn't used, don't bother using X86ISD::AND,
-    // because a TEST instruction will be better.
-    bool NonFlagUse = false;
-    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
-           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
-      SDNode *User = *UI;
-      unsigned UOpNo = UI.getOperandNo();
-      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
-        // Look pass truncate.
- UOpNo = User->use_begin().getOperandNo(); - User = *User->use_begin(); - } - - if (User->getOpcode() != ISD::BRCOND && - User->getOpcode() != ISD::SETCC && - !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { - NonFlagUse = true; + case ISD::SHL: + case ISD::SRL: + // If we have a constant logical shift that's only used in a comparison + // against zero turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op->getConstantOperandVal(1); + if (ShAmt >= BitWidth) // Avoid undefined shifts. break; - } + APInt Mask = ArithOp.getOpcode() == ISD::SRL + ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (!Mask.isSignedIntN(32)) // Avoid large immediates. + break; + SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, VT)); + DAG.ReplaceAllUsesWith(Op, New); + Op = New; } + break; - if (!NonFlagUse) + case ISD::AND: + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + if (!hasNonFlagsUse(Op)) break; - } // FALL THROUGH case ISD::SUB: case ISD::OR: @@ -9636,7 +10037,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, for (unsigned i = 0; i != NumOperands; ++i) Ops.push_back(Op.getOperand(i)); - SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } @@ -9644,14 +10045,30 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast(Op1)) + SDLoc dl, SelectionDAG &DAG) const { + if (ConstantSDNode *C = dyn_cast(Op1)) { if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, DAG); + return EmitTest(Op0, X86CC, dl, DAG); - SDLoc dl(Op0); + if (Op0.getValueType() == MVT::i1) + llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); + } + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { + // Do the comparison at i32 if it's smaller, besides the Atom case. + // This avoids subregister aliasing issues. Keep the smaller reference + // if we're optimizing for size, however, as that'll allow better folding + // of memory operations. + if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && + !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize) && + !Subtarget->isAtom()) { + unsigned ExtendOp = + isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); + Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); + } // Use SUB instead of CMP to enable CSE between SUB and CMP. 
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, @@ -9836,38 +10253,74 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } -static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && Op.getValueType().getScalarType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); - SDLoc dl(Op); - + unsigned Opc = 0; bool Unsigned = false; + bool Swap = false; unsigned SSECC; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: SSECC = 4; break; - case ISD::SETEQ: SSECC = 0; break; - case ISD::SETUGT: Unsigned = true; - case ISD::SETGT: SSECC = 6; break; // NLE - case ISD::SETULT: Unsigned = true; - case ISD::SETLT: SSECC = 1; break; - case ISD::SETUGE: Unsigned = true; - case ISD::SETGE: SSECC = 5; break; // NLT - case ISD::SETULE: Unsigned = true; + case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; + case ISD::SETUGT: SSECC = 6; Unsigned = true; break; + case ISD::SETLT: Swap = true; //fall-through + case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; + case ISD::SETULT: SSECC = 1; Unsigned = true; break; + case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT + case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap + case ISD::SETULE: Unsigned = true; //fall-through case ISD::SETLE: SSECC = 2; break; } - unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; + + if (Swap) + std::swap(Op0, Op1); + if (Opc) + return DAG.getNode(Opc, dl, VT, Op0, Op1); + Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); - +} + +/// \brief Try to turn a VSETULT into a VSETULE by modifying its second +/// operand \p Op1. If non-trivial (for example because it's not constant) +/// return an empty value. +static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) +{ + BuildVectorSDNode *BV = dyn_cast(Op1.getNode()); + if (!BV) + return SDValue(); + + MVT VT = Op1.getSimpleValueType(); + MVT EVT = VT.getVectorElementType(); + unsigned n = VT.getVectorNumElements(); + SmallVector ULTOp1; + + for (unsigned i = 0; i < n; ++i) { + ConstantSDNode *Elt = dyn_cast(BV->getOperand(i)); + if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) + return SDValue(); + + // Avoid underflow. + APInt Val = Elt->getAPIntValue(); + if (Val == 0) + return SDValue(); + + ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, @@ -9923,7 +10376,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) - return LowerIntVSETCC_AVX512(Op, DAG); + return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements. 
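The two unsigned-compare rewrites introduced around here are easy to check in scalar form: ChangeVSETULTtoVSETULE above relies on `x <u c` being the same as `x <=u c-1` whenever c is nonzero (the bail-out on zero elements avoids the wrap), and the PSUBUS path added below uses the fact that `x <=u y` holds exactly when the saturating difference is zero. Editorial sketch, not part of the patch; the function names are invented:

#include <cstdint>
#include <cassert>

// x <u c  <=>  x <=u (c - 1), for unsigned x and nonzero c.
static bool ult_as_ule(uint16_t x, uint16_t c) {
  assert(c != 0 && "rewrite is only valid for nonzero constants");
  return x <= static_cast<uint16_t>(c - 1);
}

// x <=u y  <=>  psubus(x, y) == 0.
static bool ule_via_subus(uint16_t x, uint16_t y) {
  uint16_t t = x > y ? static_cast<uint16_t>(x - y) : 0; // psubus: max(x-y, 0)
  return t == 0;                                         // pcmpeq t, 0
}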
@@ -9941,21 +10394,22 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
  // operations may be required for some comparisons.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+  bool Subus = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true;
-  case ISD::SETEQ:  Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
+  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true;
-  case ISD::SETGT:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
+  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true;
-  case ISD::SETLE:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true;
-  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
-  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }
@@ -9975,6 +10429,40 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

+  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+  if (!MinMax && hasSubus) {
+    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+    // Op0 u<= Op1:
+    //   t = psubus Op0, Op1
+    //   pcmpeq t, <0..0>
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETULT: {
+      // If the comparison is against a constant, we can turn this into a
+      // setule. With psubus, setule does not require a swap. This is
+      // beneficial because the constant in the register is no longer
+      // clobbered as the destination, so it can be hoisted out of a loop.
+      // Only do this pre-AVX since vpcmp* is no longer destructive.
+      if (Subtarget->hasAVX())
+        break;
+      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+      if (ULEOp1.getNode()) {
+        Op1 = ULEOp1;
+        Subus = true; Invert = false; Swap = false;
+      }
+      break;
+    }
+    // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; + case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; + } + + if (Subus) { + Opc = X86ISD::SUBUS; + FlipSigns = false; + } + } + if (Swap) std::swap(Op0, Op1); @@ -10065,6 +10553,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (MinMax) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); + if (Subus) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, + getZeroVector(VT, Subtarget, DAG, dl)); + return Result; } @@ -10074,7 +10566,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); + assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) + && "SetCC type must be 8-bit or 1-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); @@ -10106,23 +10599,38 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); bool Invert = (CC == ISD::SETNE) ^ cast(Op1)->isNullValue(); - if (!Invert) return Op0; + if (!Invert) + return Op0; CCode = X86::GetOppositeBranchCondition(CCode); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), + Op0.getOperand(1)); + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + return SetCC; } } + if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && + (cast(Op1)->getZExtValue() == 1) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); + return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC); + } bool isFP = Op1.getSimpleValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), EFLAGS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAGS); + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + return SetCC; } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. @@ -10187,8 +10695,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (SSECC != 8) { - unsigned Opcode = VT == MVT::f32 ? 
X86ISD::FSETCCss : X86ISD::FSETCCsd; - SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1, + if (Subtarget->hasAVX512()) { + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, + DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); + } + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, MVT::i8)); SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); @@ -10246,7 +10758,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNOT(DL, Res, Res.getValueType()); ConstantSDNode *N2C = dyn_cast(Op2); - if (N2C == 0 || !N2C->isNullValue()) + if (!N2C || !N2C->isNullValue()) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -10335,7 +10847,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -10375,7 +10887,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; - return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { @@ -10425,7 +10937,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); if (Subtarget->hasInt256()) - return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In); + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and @@ -10454,8 +10966,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements()/2); - OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); - OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); + OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } @@ -10568,11 +11080,26 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; + // Keep this in sync with LowerXALUO, otherwise we might create redundant + // instructions that can't be removed afterwards (i.e. X86ISD::ADD and + // X86ISD::INC). 
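  // Editorial note, not part of the patch text: the INC/DEC special cases
  // below are sound because, for signed x, x + 1 overflows exactly when
  // x == INT_MAX and x - 1 overflows exactly when x == INT_MIN, so COND_O
  // on X86ISD::INC/DEC observes the same condition that the generic
  // SADDO/SSUBO with a constant operand of 1 would.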
switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; - case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::SADDO: + if (ConstantSDNode *C = dyn_cast(RHS)) + if (C->isOne()) { + X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; + break; + } + X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; - case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::SSUBO: + if (ConstantSDNode *C = dyn_cast(RHS)) + if (C->isOne()) { + X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; + break; + } + X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); @@ -10741,7 +11268,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -10756,13 +11283,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - getTargetMachine().Options.EnableSegmentedStacks) && - "This should be used only on Windows targets or when segmented stacks " - "are being used"); - assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); + MachineFunction &MF = DAG.getMachineFunction(); + bool SplitStack = MF.shouldSplitStack(); + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + SplitStack; SDLoc dl(Op); + if (!Lower) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDNode* Node = Op.getNode(); + + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" + " not tell us which reg is the stack pointer!"); + EVT VT = Node->getValueType(0); + SDValue Tmp1 = SDValue(Node, 0); + SDValue Tmp2 = SDValue(Node, 1); + SDValue Tmp3 = Node->getOperand(2); + SDValue Chain = Tmp1.getOperand(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), + SDLoc(Node)); + + SDValue Size = Tmp2.getOperand(1); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + unsigned Align = cast(Tmp3)->getZExtValue(); + const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + unsigned StackAlign = TFI.getStackAlignment(); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (Align > StackAlign) + Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Align, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + + Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), + DAG.getIntPtrConstant(0, true), SDValue(), + SDLoc(Node)); + + SDValue Ops[2] = { Tmp1, Tmp2 }; + return DAG.getMergeValues(Ops, dl); + } + // Get the inputs. 
SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); @@ -10772,8 +11336,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (getTargetMachine().Options.EnableSegmentedStacks) { - MachineFunction &MF = DAG.getMachineFunction(); + if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -10795,7 +11358,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); @@ -10819,7 +11382,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } } @@ -10880,8 +11443,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOps[0], MemOps.size()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -10935,8 +11497,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { InstOps.push_back(DAG.getConstant(Align, MVT::i32)); SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, - VTs, &InstOps[0], InstOps.size(), - MVT::i64, + VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, /*Volatile=*/false, @@ -10971,14 +11532,15 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, // getTargetVShiftByConstNode - Handle vector element shifts where the shift // amount is a constant. Takes immediate version of shift as input. -static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { + MVT ElementType = VT.getVectorElementType(); // Check for ShiftAmt >= element width - if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) { + if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) - ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1; + ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, VT); } @@ -10986,12 +11548,63 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); + // Fold this packed vector shift into a build vector if SrcOp is a + // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 
+ if (VT == SrcOp.getSimpleValueType() && + ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { + SmallVector Elts; + unsigned NumElts = SrcOp->getNumOperands(); + ConstantSDNode *ND; + + switch(Opc) { + default: llvm_unreachable(nullptr); + case X86ISD::VSHLI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType)); + } + break; + case X86ISD::VSRLI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType)); + } + break; + case X86ISD::VSRAI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType)); + } + break; + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + } + return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); } // getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. -static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); @@ -11015,11 +11628,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); // The return type has to be a 128-bit type with the same element // type as the input type. 
- MVT EltVT = VT.getVectorElementType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); @@ -11138,6 +11751,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulhu_w: + case Intrinsic::x86_avx2_pmulhu_w: + return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulh_w: + case Intrinsic::x86_avx2_pmulh_w: + return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // SSE2/AVX2 sub with unsigned saturation intrinsics case Intrinsic::x86_sse2_psubus_b: case Intrinsic::x86_sse2_psubus_w: @@ -11202,32 +11830,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxu_b: case Intrinsic::x86_avx2_pmaxu_w: case Intrinsic::x86_avx2_pmaxu_d: - case Intrinsic::x86_avx512_pmaxu_d: - case Intrinsic::x86_avx512_pmaxu_q: case Intrinsic::x86_sse2_pminu_b: case Intrinsic::x86_sse41_pminuw: case Intrinsic::x86_sse41_pminud: case Intrinsic::x86_avx2_pminu_b: case Intrinsic::x86_avx2_pminu_w: case Intrinsic::x86_avx2_pminu_d: - case Intrinsic::x86_avx512_pminu_d: - case Intrinsic::x86_avx512_pminu_q: case Intrinsic::x86_sse41_pmaxsb: case Intrinsic::x86_sse2_pmaxs_w: case Intrinsic::x86_sse41_pmaxsd: case Intrinsic::x86_avx2_pmaxs_b: case Intrinsic::x86_avx2_pmaxs_w: case Intrinsic::x86_avx2_pmaxs_d: - case Intrinsic::x86_avx512_pmaxs_d: - case Intrinsic::x86_avx512_pmaxs_q: case Intrinsic::x86_sse41_pminsb: case Intrinsic::x86_sse2_pmins_w: case Intrinsic::x86_sse41_pminsd: case Intrinsic::x86_avx2_pmins_b: case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: - case Intrinsic::x86_avx512_pmins_d: - case Intrinsic::x86_avx512_pmins_q: { + case Intrinsic::x86_avx2_pmins_d: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
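The pmulh/pmulhu mappings added above lower those intrinsics to the generic ISD::MULHS/ISD::MULHU nodes, whose per-lane meaning is simply the upper 16 bits of the widened 16x16 product. Editorial scalar model (invented helper names, not part of the patch; the signed version assumes arithmetic right shift of negative values, as mainstream compilers provide):

#include <cstdint>

static int16_t mulhs_w(int16_t a, int16_t b) {
  return static_cast<int16_t>((static_cast<int32_t>(a) * b) >> 16);
}

static uint16_t mulhu_w(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
}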
@@ -11237,8 +11857,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxu_b: case Intrinsic::x86_avx2_pmaxu_w: case Intrinsic::x86_avx2_pmaxu_d: - case Intrinsic::x86_avx512_pmaxu_d: - case Intrinsic::x86_avx512_pmaxu_q: Opcode = X86ISD::UMAX; break; case Intrinsic::x86_sse2_pminu_b: @@ -11247,8 +11865,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pminu_b: case Intrinsic::x86_avx2_pminu_w: case Intrinsic::x86_avx2_pminu_d: - case Intrinsic::x86_avx512_pminu_d: - case Intrinsic::x86_avx512_pminu_q: Opcode = X86ISD::UMIN; break; case Intrinsic::x86_sse41_pmaxsb: @@ -11257,8 +11873,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxs_b: case Intrinsic::x86_avx2_pmaxs_w: case Intrinsic::x86_avx2_pmaxs_d: - case Intrinsic::x86_avx512_pmaxs_d: - case Intrinsic::x86_avx512_pmaxs_q: Opcode = X86ISD::SMAX; break; case Intrinsic::x86_sse41_pminsb: @@ -11267,8 +11881,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmins_b: case Intrinsic::x86_avx2_pmins_w: case Intrinsic::x86_avx2_pmins_d: - case Intrinsic::x86_avx512_pmins_d: - case Intrinsic::x86_avx512_pmins_q: Opcode = X86ISD::SMIN; break; } @@ -11281,14 +11893,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_avx512_max_ps_512: - case Intrinsic::x86_avx512_max_pd_512: case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: - case Intrinsic::x86_avx512_min_ps_512: - case Intrinsic::x86_avx512_min_pd_512: { + case Intrinsic::x86_avx_min_pd_256: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -11296,16 +11904,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_avx512_max_ps_512: - case Intrinsic::x86_avx512_max_pd_512: Opcode = X86ISD::FMAX; break; case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: case Intrinsic::x86_avx_min_pd_256: - case Intrinsic::x86_avx512_min_ps_512: - case Intrinsic::x86_avx512_min_pd_512: Opcode = X86ISD::FMIN; break; } @@ -11451,14 +12055,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - case Intrinsic::x86_avx512_kortestz: - case Intrinsic::x86_avx512_kortestc: { - unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B; + case Intrinsic::x86_avx512_kortestz_w: + case Intrinsic::x86_avx512_kortestc_w: { + unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? 
X86::COND_E: X86::COND_B; SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -11552,7 +12156,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { Opcode = X86ISD::VSRAI; break; } - return getTargetVShiftNode(Opcode, dl, Op.getValueType(), + return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); } @@ -11613,7 +12217,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } SmallVector<SDValue, 8> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), SDValue(PCMP.getNode(), 1)); @@ -11630,7 +12234,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SmallVector<SDValue, 8> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: @@ -11737,7 +12341,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getValueType().getVectorNumElements()); + Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg = DAG.getConstant(~0, MaskVT); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); @@ -11745,7 +12349,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); + return DAG.getMergeValues(RetOps, dl); } static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -11757,7 +12361,7 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getValueType().getVectorNumElements()); + Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); @@ -11767,7 +12371,7 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); + return
DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -11780,7 +12384,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getValueType().getVectorNumElements()); + Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg = DAG.getConstant(~0, MaskVT); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; @@ -11798,7 +12402,7 @@ static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getValueType().getVectorNumElements()); + Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; @@ -11806,6 +12410,69 @@ static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 1); } +// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that +// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is +// also used to custom lower READCYCLECOUNTER nodes. +static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); + SDValue LO, HI; + + // The processor's time-stamp counter (a 64-bit MSR) is stored into the + // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR + // and the EAX register is loaded with the low-order 32 bits. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + SDValue Chain = HI.getValue(1); + + if (Opcode == X86ISD::RDTSCP_DAG) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, + HI.getValue(2)); + // Explicitly store the content of ECX at the location passed in input + // to the 'rdtscp' intrinsic. + Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), + MachinePointerInfo(), false, false, 0); + } + + if (Subtarget->is64Bit()) { + // The EDX register is loaded with the high-order 32 bits of the MSR, and + // the EAX register is loaded with the low-order 32 bits. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SmallVector<SDValue, 2> Results; + SDLoc DL(Op); + getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, DL); +} + static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -11836,7 +12503,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), - Ops, array_lengthof(Ops)); + Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, @@ -11853,15 +12520,15 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case Intrinsic::x86_avx512_gather_dpi_512: { unsigned Opc; switch (IntNo) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
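// Illustrative sketch, not part of the patch: after RDTSC/RDTSCP fill
// EDX:EAX, getReadTimeStampCounter above merges the halves exactly like
// this hypothetical helper (SHL by 32 plus OR on 64-bit targets; the
// 32-bit path expresses the same pairing as ISD::BUILD_PAIR):
static inline uint64_t MergeTSC(uint32_t LoEAX, uint32_t HiEDX) {
  return ((uint64_t)HiEDX << 32) | LoEAX;
}
// For RDTSCP, the additional ECX copy (IA32_TSC_AUX) is stored through the
// intrinsic's pointer operand before the two halves are merged.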
+ case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; + case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; + case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; + case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; + case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; + case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; + case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; + case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; } SDValue Chain = Op.getOperand(0); SDValue Index = Op.getOperand(2); @@ -11880,23 +12547,23 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case Intrinsic::x86_avx512_gather_dpq_mask_512: { unsigned Opc; switch (IntNo) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx512_gather_qps_mask_512: - Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_mask_512: - Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_mask_512: - Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_mask_512: - Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_mask_512: - Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_mask_512: - Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_mask_512: - Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_mask_512: - Opc = X86::VPGATHERDQZrm; break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_avx512_gather_qps_mask_512: + Opc = X86::VGATHERQPSZrm; break; + case Intrinsic::x86_avx512_gather_qpd_mask_512: + Opc = X86::VGATHERQPDZrm; break; + case Intrinsic::x86_avx512_gather_dpd_mask_512: + Opc = X86::VGATHERDPDZrm; break; + case Intrinsic::x86_avx512_gather_dps_mask_512: + Opc = X86::VGATHERDPSZrm; break; + case Intrinsic::x86_avx512_gather_qpi_mask_512: + Opc = X86::VPGATHERQDZrm; break; + case Intrinsic::x86_avx512_gather_qpq_mask_512: + Opc = X86::VPGATHERQQZrm; break; + case Intrinsic::x86_avx512_gather_dpi_mask_512: + Opc = X86::VPGATHERDDZrm; break; + case Intrinsic::x86_avx512_gather_dpq_mask_512: + Opc = X86::VPGATHERDQZrm; break; } SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); @@ -11918,23 +12585,23 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case Intrinsic::x86_avx512_scatter_dpi_512: { unsigned Opc; switch (IntNo) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx512_scatter_qpd_512: - Opc = X86::VSCATTERQPDZmr; break; - case Intrinsic::x86_avx512_scatter_qps_512: - Opc = X86::VSCATTERQPSZmr; break; - case Intrinsic::x86_avx512_scatter_dpd_512: - Opc = X86::VSCATTERDPDZmr; break; - case Intrinsic::x86_avx512_scatter_dps_512: - Opc = X86::VSCATTERDPSZmr; break; - case Intrinsic::x86_avx512_scatter_qpi_512: - Opc = X86::VPSCATTERQDZmr; break; - case Intrinsic::x86_avx512_scatter_qpq_512: - Opc = X86::VPSCATTERQQZmr; break; - case Intrinsic::x86_avx512_scatter_dpq_512: - Opc = X86::VPSCATTERDQZmr; break; - case Intrinsic::x86_avx512_scatter_dpi_512: - Opc = X86::VPSCATTERDDZmr; break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
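// Illustrative sketch, not part of the patch: every gather opcode selected
// above behaves, per element, like this pseudo-C model (mask lanes widened
// to bool for clarity; names are hypothetical):
template <typename T>
static void GatherModel(T *Dst, const T *PassThru, const bool *Mask,
                        const char *Base, const long *Index, long Scale,
                        unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    Dst[i] = Mask[i] ? *(const T *)(Base + Index[i] * Scale) : PassThru[i];
}
// The unmasked variants pass MaskInReg = ~0, so every lane is loaded; the
// scatter forms handled next invert the direction of the per-lane access.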
+ case Intrinsic::x86_avx512_scatter_qpd_mask_512: + Opc = X86::VSCATTERQPDZmr; break; + case Intrinsic::x86_avx512_scatter_qps_mask_512: + Opc = X86::VSCATTERQPSZmr; break; + case Intrinsic::x86_avx512_scatter_dpd_mask_512: + Opc = X86::VSCATTERDPDZmr; break; + case Intrinsic::x86_avx512_scatter_dps_mask_512: + Opc = X86::VSCATTERDPSZmr; break; + case Intrinsic::x86_avx512_scatter_qpi_mask_512: + Opc = X86::VPSCATTERQDZmr; break; + case Intrinsic::x86_avx512_scatter_qpq_mask_512: + Opc = X86::VPSCATTERQQZmr; break; + case Intrinsic::x86_avx512_scatter_dpq_mask_512: + Opc = X86::VPSCATTERDQZmr; break; + case Intrinsic::x86_avx512_scatter_dpi_mask_512: + Opc = X86::VPSCATTERDDZmr; break; } SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); @@ -11980,6 +12647,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Scale = Op.getOperand(6); return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); } + // Read Time Stamp Counter (RDTSC). + case Intrinsic::x86_rdtsc: + // Read Time Stamp Counter and Processor ID (RDTSCP). + case Intrinsic::x86_rdtscp: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_rdtsc: + Opc = X86ISD::RDTSC_DAG; break; + case Intrinsic::x86_rdtscp: + Opc = X86ISD::RDTSCP_DAG; break; + } + SmallVector<SDValue, 2> Results; + getReadTimeStampCounter(Op.getNode(), dl, Opc, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } // XTEST intrinsics.
case Intrinsic::x86_xtest: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -11999,6 +12682,9 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setReturnAddressIsTaken(true); + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); EVT PtrVT = getPointerTy(); @@ -12041,6 +12727,18 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +unsigned X86TargetLowering::getRegisterByName(const char* RegName) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("esp", X86::ESP) + .Case("rsp", X86::RSP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = @@ -12160,7 +12858,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 22), false, false, 0); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); @@ -12240,7 +12938,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 6), false, false, 1); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } @@ -12269,7 +12967,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, const TargetMachine &TM = MF.getTarget(); const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot @@ -12283,8 +12981,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), MVT::i16, - MMO); + Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, @@ -12314,7 +13011,7 @@ } static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); @@ -12337,7 +13034,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); @@ -12348,7 +13045,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); @@ -12373,7 +13070,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); Op = Op.getOperand(0); @@ -12389,13 +13086,13 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit // ones, and then concatenate the result back. static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); @@ -12413,8 +13110,8 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), @@ -12422,15 +13119,15 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { - assert(Op.getValueType().is256BitVector() && - Op.getValueType().isInteger() && + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { - assert(Op.getValueType().is256BitVector() && - Op.getValueType().isInteger() && + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } @@ -12438,7 +13135,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); // Decompose 256-bit ops into smaller 128-bit ops. 
if (VT.is256BitVector() && !Subtarget->hasInt256()) @@ -12507,64 +13204,109 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); - EVT EltTy = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - SDValue N0 = Op.getOperand(0); + assert(VT.isInteger() && VT.getSizeInBits() == 128 && + "Unexpected return type for lowering"); + + RTLIB::Libcall LC; + bool isSigned; + switch (Op->getOpcode()) { + default: llvm_unreachable("Unexpected request for libcall!"); + case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; + case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; + case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; + case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; + case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; + case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; + } + SDLoc dl(Op); + SDValue InChain = DAG.getEntryNode(); - // Lower sdiv X, pow2-const. - BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); - if (!C) - return SDValue(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && + "Unexpected argument type for lowering"); + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + Entry.Node = StackPtr; + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), + false, false, 16); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Ty = PointerType::get(ArgTy,0); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy()); - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs) || - EltTy.getSizeInBits() < SplatBitSize) - return SDValue(); + TargetLowering::CallLoweringInfo CLI( + InChain, static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), + isSigned, !isSigned, false, true, 0, getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, + dl); + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + + return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); +} + +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + EVT VT = Op0.getValueType(); + SDLoc dl(Op); + + assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || + (VT == MVT::v8i32 && Subtarget->hasInt256())); + + // Get the high parts. + const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; + SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + + // Emit two multiplies, one for the lower 2 ints and one for the higher 2 + // ints. + MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; + unsigned Opcode = + (!IsSigned || !Subtarget->hasSSE41()) ?
X86ISD::PMULUDQ : X86ISD::PMULDQ; + SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); - if ((SplatValue != 0) && - (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { - unsigned Lg2 = SplatValue.countTrailingZeros(); - // Splat the sign bit. - SmallVector<SDValue, 16> Sz(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - 1, - EltTy)); - SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], - NumElts)); - // Add (N0 < 0) ? abs2 - 1 : 0; - SmallVector<SDValue, 16> Amt(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - Lg2, - EltTy)); - SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], - NumElts)); - SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); - SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); - SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], - NumElts)); - - // If we're dividing by a positive value, we're done. Otherwise, we must - // negate the result. - if (SplatValue.isNonNegative()) - return SRA; - - SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); - return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + // Shuffle it back into the right order. + const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; + SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; + SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + + // If we have a signed multiply but no PMULDQ, fix up the high parts of an + // unsigned multiply. + if (IsSigned && !Subtarget->hasSSE41()) { + SDValue ShAmt = + DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); + SDValue T1 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); + SDValue T2 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); + + SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); + Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - return SDValue(); + + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); @@ -12603,7 +13345,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift.
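// Illustrative sketch, not part of the patch: the PMULUDQ fallback in
// LowerMUL_LOHI above recovers the signed high half from an unsigned
// multiply via mulhs(a,b) = mulhu(a,b) - (a < 0 ? b : 0) - (b < 0 ? a : 0).
// Per 32-bit lane (hypothetical helper; assumes arithmetic >> on signed
// values, as x86 compilers provide):
static inline int32_t MulHiS32ViaU(int32_t a, int32_t b) {
  uint32_t Hu = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
  uint32_t T1 = (uint32_t)(a >> 31) & (uint32_t)b; // all-ones mask iff a < 0
  uint32_t T2 = (uint32_t)(b >> 31) & (uint32_t)a; // all-ones mask iff b < 0
  return (int32_t)(Hu - T1 - T2);                  // Highs - Fixup
}
// T1, T2 and the final subtraction map one-to-one onto the SRA/AND/ADD/SUB
// nodes emitted when IsSigned && !Subtarget->hasSSE41().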
@@ -12616,7 +13358,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12629,7 +13371,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -12649,7 +13391,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -12662,7 +13404,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12675,7 +13417,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -12691,13 +13433,13 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); - unsigned Ratio = Amt.getValueType().getVectorNumElements() / + unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); @@ -12708,7 +13450,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); @@ -12736,7 +13478,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget* Subtarget) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); @@ -12790,7 +13532,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (!BaseShAmt.getNode()) BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, DAG.getIntPtrConstant(0)); } @@ -12806,7 +13548,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, default: llvm_unreachable("Unknown shift opcode!"); case ISD::SHL: - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v2i64: case MVT::v4i32: @@ -12819,7 +13561,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); } case ISD::SRA: - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v4i32: case MVT::v8i16: @@ -12830,7 +13572,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); } case ISD::SRL: - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v2i64: case MVT::v4i32: @@ -12853,7 +13595,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); - unsigned Ratio = Amt.getValueType().getVectorNumElements() / + unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); std::vector<SDValue> Vals(Ratio); for (unsigned i = 0; i != Ratio; ++i) @@ -12881,7 +13623,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); @@ -12914,6 +13656,39 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return Op; } + // If possible, lower this packed shift into a vector multiply instead of + // expanding it into a sequence of scalar shifts. + // Do this only if the vector shift count is a constant build_vector. + if (Op.getOpcode() == ISD::SHL && + (VT == MVT::v8i16 || VT == MVT::v4i32 || + (Subtarget->hasInt256() && VT == MVT::v16i16)) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + SmallVector<SDValue, 8> Elts; + EVT SVT = VT.getScalarType(); + unsigned SVTBits = SVT.getSizeInBits(); + const APInt &One = APInt(SVTBits, 1); + unsigned NumElems = VT.getVectorNumElements(); + + for (unsigned i = 0; i != NumElems; ++i) { + SDValue Op = Amt->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + Elts.push_back(Op); + continue; + } + + ConstantSDNode *ND = cast<ConstantSDNode>(Op); + const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); + uint64_t ShAmt = C.getZExtValue(); + if (ShAmt >= SVTBits) { + Elts.push_back(DAG.getUNDEF(SVT)); + continue; + } + Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + return DAG.getNode(ISD::MUL, dl, VT, R, BV); + } + // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); @@ -12923,6 +13698,80 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } + + // If possible, lower this shift as a sequence of two shifts by + // constant plus a MOVSS/MOVSD instead of scalarizing it.
+ // Example: + // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) + // + // Could be rewritten as: + // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) + // + // The advantage is that the two shifts from the example would be + // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing + // the vector shift into four scalar shifts plus four pairs of vector + // insert/extract. + if ((VT == MVT::v8i16 || VT == MVT::v4i32) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + unsigned TargetOpcode = X86ISD::MOVSS; + bool CanBeSimplified; + // The splat value for the first packed shift (the 'X' from the example). + SDValue Amt1 = Amt->getOperand(0); + // The splat value for the second packed shift (the 'Y' from the example). + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : + Amt->getOperand(2); + + // See if it is possible to replace this node with a sequence of + // two shifts followed by a MOVSS/MOVSD + if (VT == MVT::v4i32) { + // Check if it is legal to use a MOVSS. + CanBeSimplified = Amt2 == Amt->getOperand(2) && + Amt2 == Amt->getOperand(3); + if (!CanBeSimplified) { + // Otherwise, check if we can still simplify this node using a MOVSD. + CanBeSimplified = Amt1 == Amt->getOperand(1) && + Amt->getOperand(2) == Amt->getOperand(3); + TargetOpcode = X86ISD::MOVSD; + Amt2 = Amt->getOperand(2); + } + } else { + // Do similar checks for the case where the machine value type + // is MVT::v8i16. + CanBeSimplified = Amt1 == Amt->getOperand(1); + for (unsigned i=3; i != 8 && CanBeSimplified; ++i) + CanBeSimplified = Amt2 == Amt->getOperand(i); + + if (!CanBeSimplified) { + TargetOpcode = X86ISD::MOVSD; + CanBeSimplified = true; + Amt2 = Amt->getOperand(4); + for (unsigned i=0; i != 4 && CanBeSimplified; ++i) + CanBeSimplified = Amt1 == Amt->getOperand(i); + for (unsigned j=4; j != 8 && CanBeSimplified; ++j) + CanBeSimplified = Amt2 == Amt->getOperand(j); + } + } + + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && + isa<ConstantSDNode>(Amt2)) { + // Replace this node with two shifts followed by a MOVSS/MOVSD. + EVT CastVT = MVT::v4i32; + SDValue Splat1 = + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); + SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); + SDValue Splat2 = + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); + SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); + if (TargetOpcode == X86ISD::MOVSD) + CastVT = MVT::v2i64; + SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); + SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); + SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, + BitCast1, DAG); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); @@ -12966,10 +13815,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return R; } + // It's worth extending once and using the v8i32 shifts for 16-bit types, but + // the extra overheads to get from v16i8 to v8i32 make the existing SSE + // solution better. + if (Subtarget->hasInt256() && VT == MVT::v8i16) { + MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16; + unsigned ExtOpc = + Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + R = DAG.getNode(ExtOpc, dl, NewVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); + } + // Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors @@ -12987,10 +13849,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); - Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt1Csts[0], NumElems/2); - Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt2Csts[0], NumElems/2); + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); } else { // Variable shift amount Amt1 = Extract128BitVector(Amt, 0, DAG, dl); @@ -13087,7 +13947,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); if (!Subtarget->hasSSE2() || !VT.isVector()) return SDValue(); @@ -13095,7 +13955,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, unsigned BitsDiff = VT.getScalarType().getSizeInBits() - ExtraVT.getScalarType().getSizeInBits(); - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v8i32: case MVT::v16i16: @@ -13110,7 +13970,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); EVT ExtraEltVT = ExtraVT.getVectorElementType(); @@ -13127,19 +13987,27 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, // fall through case MVT::v4i32: case MVT::v8i16: { SDValue Op0 = Op.getOperand(0); SDValue Op00 = Op0.getOperand(0); SDValue Tmp1; // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. if (Op0.getOpcode() == ISD::BITCAST && - Op00.getOpcode() == ISD::VECTOR_SHUFFLE) + Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { + // (sext (vzext x)) -> (vsext x) Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); - if (Tmp1.getNode()) { - SDValue Tmp1Op0 = Tmp1.getOperand(0); - assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && - "This optimization is invalid without a VZEXT."); - return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + if (Tmp1.getNode()) { + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + // This folding is only valid when the in-reg type is a vector of i8, + // i16, or i32. + if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || + ExtraEltVT == MVT::i32) { + SDValue Tmp1Op0 = Tmp1.getOperand(0); + assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && + "This optimization is invalid without a VZEXT."); + return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + } + Op0 = Tmp1; + } } // If the above didn't work, then just use Shift-Left + Shift-Right.
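// Illustrative sketch, not part of the patch: the Shift-Left + Shift-Right
// fallback named above implements sign_extend_inreg per lane. For an i8
// value held in an i32 lane the scalar equivalent is (hypothetical helper;
// assumes arithmetic >> on signed values):
static inline int32_t SExtInReg8(int32_t x) {
  const int Sh = 32 - 8; // BitsDiff: lane width minus in-reg width
  return (int32_t)((uint32_t)x << Sh) >> Sh;
}
// The vector path emits the same shl/sra pair on every lane at once.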
@@ -13189,11 +14057,11 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT T = Op.getValueType(); + MVT T = Op.getSimpleValueType(); SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; - switch(T.getSimpleVT().SimpleTy) { + switch(T.SimpleTy) { default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; @@ -13213,31 +14081,12 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, array_lengthof(Ops), T, MMO); + Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; } -static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - assert(Subtarget->is64Bit() && "Result not type legalized?"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = Op.getOperand(0); - SDLoc dl(Op); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); - SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, - rax.getValue(2)); - SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, - DAG.getConstant(32, MVT::i8)); - SDValue Ops[] = { - DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), - rdx.getValue(1) - }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); -} - static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); @@ -13269,8 +14118,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { cast<AtomicSDNode>(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, - cast<AtomicSDNode>(Node)->getSrcValue(), - cast<AtomicSDNode>(Node)->getAlignment(), + cast<AtomicSDNode>(Node)->getMemOperand(), cast<AtomicSDNode>(Node)->getOrdering(), cast<AtomicSDNode>(Node)->getSynchScope()); } @@ -13301,7 +14149,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getNode()->getValueType(0); + EVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -13443,6 +14291,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -13460,7 +14310,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); } } @@ -13481,6 +14330,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, Node->getOperand(1), Zero, Zero, cast<AtomicSDNode>(Node)->getMemOperand(), cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getOrdering(), cast<AtomicSDNode>(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); Results.push_back(Swap.getValue(1)); @@ -13502,10 +14352,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, SDValue Ops[] = { Chain, In1, In2L, In2H }; SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, cast<MemSDNode>(Node)->getMemOperand()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); Results.push_back(Result.getValue(2)); } @@ -13526,6 +14376,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + Results.push_back(V); + return; + } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; @@ -13536,10 +14396,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, std::pair<SDValue, SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode() != 0) { + if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot.
- if (StackSlot.getNode() != 0) + if (StackSlot.getNode()) Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0)); @@ -13572,20 +14432,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default : llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); + case Intrinsic::x86_rdtsc: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdtscp: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + Results); + } + } case ISD::READCYCLECOUNTER: { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = N->getOperand(0); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, - rd.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, - eax.getValue(2)); - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { eax, edx }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, - array_lengthof(Ops))); - Results.push_back(edx.getValue(1)); - return; + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); } case ISD::ATOMIC_CMP_SWAP: { EVT T = N->getValueType(0); @@ -13621,8 +14483,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; - SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, - Ops, array_lengthof(Ops), T, MMO); + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -13630,7 +14491,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ?
X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(cpOutH.getValue(1)); return; } @@ -13692,7 +14553,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return NULL; + default: return nullptr; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; @@ -13719,8 +14580,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; - case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; - case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; + case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -13772,7 +14632,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; - case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; @@ -13805,17 +14664,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BLSI: return "X86ISD::BLSI"; - case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; - case X86ISD::BLSR: return "X86ISD::BLSR"; - case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; + case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; - case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -13835,12 +14690,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; + case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -13871,7 +14729,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, Reloc::Model R = 
getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. - if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { @@ -13916,6 +14774,24 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, return true; } +bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { + unsigned Bits = Ty->getScalarSizeInBits(); + + // 8-bit shifts are always expensive, but versions with a scalar amount aren't + // particularly cheaper than those without. + if (Bits == 8) + return false; + + // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make + // variable shifts just as cheap as scalar ones. + if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) + return false; + + // Otherwise, it's significantly cheaper to shift by a scalar amount than by a + // fully general vector. + return true; +} + bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; @@ -14099,7 +14975,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, - llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: @@ -14333,7 +15209,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, - llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: @@ -14619,7 +15495,7 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, - llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: @@ -14979,7 +15855,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( OffsetDestReg = 0; // unused OverflowDestReg = DestReg; - offsetMBB = NULL; + offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { @@ -15015,8 +15891,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( // Transfer the remainder of MBB and its successor edges to endMBB. endMBB->splice(endMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(MI)), - thisMBB->end()); + std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB @@ -15186,8 +16061,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // Transfer the remainder of MBB and its successor edges to EndMBB. EndMBB->splice(EndMBB->begin(), MBB, - llvm::next(MachineBasicBlock::iterator(MI)), - MBB->end()); + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. 
@@ -15210,9 +16084,15 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MBB->addSuccessor(EndMBB); } + // Make sure the last operand is EFLAGS, which gets clobbered by the branch + // that was just emitted, but clearly shouldn't be "saved". + assert((MI->getNumOperands() <= 3 || + !MI->getOperand(MI->getNumOperands() - 1).isReg() || + MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) + && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. - for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { + for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( @@ -15243,7 +16123,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. - MachineBasicBlock::iterator miI(llvm::next(SelectItr)); + MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) @@ -15308,8 +16188,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. @@ -15346,7 +16225,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(getTargetMachine().Options.EnableSegmentedStacks); + assert(MF->shouldSplitStack()); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -15389,8 +16268,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); - continueMBB->splice(continueMBB->begin(), BB, llvm::next - (MachineBasicBlock::iterator(MI)), BB->end()); + continueMBB->splice(continueMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, @@ -15465,7 +16344,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - assert(!Subtarget->isTargetEnvMacho()); + assert(!Subtarget->isTargetMacho()); // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. @@ -15496,7 +16375,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, } } else { const char *StackProbeSymbol = - Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; + Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca"; BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol(StackProbeSymbol) @@ -15631,7 +16510,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. 
@@ -15631,7 +16510,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   // Transfer the remainder of BB and its successor edges to sinkMBB.
   sinkMBB->splice(sinkMBB->begin(), MBB,
-                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

   // thisMBB:
@@ -15772,6 +16651,89 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   return MBB;
 }

+// Replace 213-type (isel default) FMA3 instructions with 231-type for
+// accumulator loops. Writing back to the accumulator allows the coalescer
+// to remove extra copies in the loop.
+MachineBasicBlock *
+X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+                                 MachineBasicBlock *MBB) const {
+  MachineOperand &AddendOp = MI->getOperand(3);
+
+  // Bail out early if the addend isn't a register - we can't switch these.
+  if (!AddendOp.isReg())
+    return MBB;
+
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Check whether the addend is defined by a PHI:
+  assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
+  MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
+  if (!AddendDef.isPHI())
+    return MBB;
+
+  // Look for the following pattern:
+  // loop:
+  //   %addend = phi [%entry, 0], [%loop, %result]
+  //   ...
+  //   %result = FMA213 %m2, %m1, %addend
+
+  // Replace with:
+  // loop:
+  //   %addend = phi [%entry, 0], [%loop, %result]
+  //   ...
+  //   %result = FMA231 %addend, %m1, %m2
+
+  for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
+    assert(AddendDef.getOperand(i).isReg());
+    MachineOperand PHISrcOp = AddendDef.getOperand(i);
+    MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
+    if (&PHISrcInst == MI) {
+      // Found a matching instruction.
+      unsigned NewFMAOpc = 0;
+      switch (MI->getOpcode()) {
+        case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
+        case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
+        case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
+        case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
+        case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
+        case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
+        case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
+        case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
+        case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
+        case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
+        case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
+        case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
+        case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
+        case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
+        case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
+        case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+        case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
+        case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
+        case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
+        case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
+        case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
+        case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
+        case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
+        case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+        default: llvm_unreachable("Unrecognized FMA variant.");
+      }
+
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      MachineInstrBuilder MIB =
+          BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
+              .addOperand(MI->getOperand(0))
+              .addOperand(MI->getOperand(3))
+              .addOperand(MI->getOperand(2))
+              .addOperand(MI->getOperand(1));
+      MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+      MI->eraseFromParent();
+    }
+  }
+
+  return MBB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
@@ -15999,6 +16961,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::EH_SjLj_LongJmp32:
   case X86::EH_SjLj_LongJmp64:
     return emitEHSjLjLongJmp(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
+
+  case X86::VFMADDPDr213r:
+  case X86::VFMADDPSr213r:
+  case X86::VFMADDSDr213r:
+  case X86::VFMADDSSr213r:
+  case X86::VFMSUBPDr213r:
+  case X86::VFMSUBPSr213r:
+  case X86::VFMSUBSDr213r:
+  case X86::VFMSUBSSr213r:
+  case X86::VFNMADDPDr213r:
+  case X86::VFNMADDPSr213r:
+  case X86::VFNMADDSDr213r:
+  case X86::VFNMADDSSr213r:
+  case X86::VFNMSUBPDr213r:
+  case X86::VFNMSUBPSr213r:
+  case X86::VFNMSUBSDr213r:
+  case X86::VFNMSUBSSr213r:
+  case X86::VFMADDPDr213rY:
+  case X86::VFMADDPSr213rY:
+  case X86::VFMSUBPDr213rY:
+  case X86::VFMSUBPSr213rY:
+  case X86::VFNMADDPDr213rY:
+  case X86::VFNMADDPSr213rY:
+  case X86::VFNMSUBPDr213rY:
+  case X86::VFNMSUBPSr213rY:
+    return emitFMA3Instr(MI, BB);
   }
 }

@@ -16073,8 +17065,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   }
 }

-unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
-                                                            unsigned Depth) const {
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &,
+  unsigned Depth) const {
   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
     return Op.getValueType().getScalarType().getSizeInBits();
@@ -16176,7 +17170,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
       SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
       SDValue ResNode =
         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
-                                array_lengthof(Ops),
                                 Ld->getMemoryVT(),
                                 Ld->getPointerInfo(),
                                 Ld->getAlignment(),
@@ -16254,7 +17247,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));

-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
 }

 /// PerformTruncateCombine - Converts truncate operation to
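
The 213/231 suffixes in the FMA rewrite above encode operand order: in both forms the first operand is also the destination, but only the 231 form merely adds into it, which is what lets a loop-carried accumulator stay in one register. A scalar model of the two encodings (not LLVM code):

// vfmadd213ss dst, m1, a:   dst = m1 * dst + a   (product consumes the old dst)
// vfmadd231ss acc, m1, m2:  acc = m1 * m2 + acc  (accumulator is only added to)
static float fma213(float dst, float m1, float a)  { return m1 * dst + a; }
static float fma231(float acc, float m1, float m2) { return m1 * m2 + acc; }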
@@ -16361,44 +17354,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }

-/// Extract one bit from mask vector, like v16i1 or v8i1.
-/// AVX-512 feature.
-static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
-  SDValue Vec = N->getOperand(0);
-  SDLoc dl(Vec);
-  MVT VecVT = Vec.getSimpleValueType();
-  SDValue Idx = N->getOperand(1);
-  MVT EltVT = N->getSimpleValueType(0);
-
-  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) ||
-         "Unexpected operands in ExtractBitFromMaskVector");
-
-  // variable index
-  if (!isa<ConstantSDNode>(Idx)) {
-    MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
-    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                              ExtVT.getVectorElementType(), Ext);
-    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
-  }
-
-  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
-  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
-  unsigned MaxShift = VecVT.getSizeInBits() - 1;
-  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
-  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
-                    DAG.getConstant(MaxShift - IdxVal, ScalarVT));
-  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
-                    DAG.getConstant(MaxShift, ScalarVT));
-
-  if (VecVT == MVT::v16i1) {
-    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
-    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
-  }
-  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
-}
-
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
@@ -16410,10 +17365,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,

   SDValue InputVector = N->getOperand(0);

-  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
-      !DCI.isBeforeLegalize())
-    return ExtractBitFromMaskVector(N, DAG);
-
   // Detect whether we are trying to convert from mmx to i32 and the bitcast
   // from mmx to v2i32 has a single usage.
   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
@@ -16959,12 +17910,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
       // Check if SETCC has already been promoted
-      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
+      TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
+      // Check that condition value type matches vselect operand type
+      CondVT == VT) {

     assert(Cond.getValueType().isVector() &&
            "vector select expects a vector selector!");

-    EVT IntVT = Cond.getValueType();
     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

@@ -16979,7 +17931,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
         ISD::CondCode NewCC =
           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                Cond.getOperand(0).getValueType().isInteger());
-        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+        Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
         std::swap(LHS, RHS);
         TValIsAllOnes = FValIsAllOnes;
         FValIsAllZeros = TValIsAllZeros;
@@ -16992,16 +17944,91 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       if (TValIsAllOnes && FValIsAllZeros)
         Ret = Cond;
       else if (TValIsAllOnes)
-        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+        Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
       else if (FValIsAllZeros)
-        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+        Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));

       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
     }
   }
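
The folds just above treat an all-ones/all-zeros selector as a plain bit mask: per lane, vselect(C, T, F) is (C & T) | (~C & F), so an all-ones true operand collapses to OR and an all-zeros false operand to AND. A one-lane scalar sketch of that identity:

// One lane of the fold; C is 0 or ~0u.
// vselect(C, ~0u, 0) == C;  vselect(C, ~0u, X) == C | X;  vselect(C, X, 0) == C & X.
static unsigned vselectLane(unsigned C, unsigned T, unsigned F) {
  return (C & T) | (~C & F);
}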
+  // Try to fold this VSELECT into a MOVSS/MOVSD
+  if (N->getOpcode() == ISD::VSELECT &&
+      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
+    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
+        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
+      bool CanFold = false;
+      unsigned NumElems = Cond.getNumOperands();
+      SDValue A = LHS;
+      SDValue B = RHS;
+
+      if (isZero(Cond.getOperand(0))) {
+        CanFold = true;
+
+        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
+        // fold (vselect <0,-1>, A, B) -> (movsd A, B)
+        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+          CanFold = isAllOnes(Cond.getOperand(i));
+      } else if (isAllOnes(Cond.getOperand(0))) {
+        CanFold = true;
+        std::swap(A, B);
+
+        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
+        // fold (vselect <-1,0>, A, B) -> (movsd B, A)
+        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+          CanFold = isZero(Cond.getOperand(i));
+      }
+
+      if (CanFold) {
+        if (VT == MVT::v4i32 || VT == MVT::v4f32)
+          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
+        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
+      }
+
+      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
+        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
+        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
+        //                             (v2i64 (bitcast B)))))
+        //
+        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
+        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
+        //                             (v2f64 (bitcast B)))))
+        //
+        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
+        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
+        //                             (v2i64 (bitcast A)))))
+        //
+        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
+        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
+        //                             (v2f64 (bitcast A)))))

+        CanFold = (isZero(Cond.getOperand(0)) &&
+                   isZero(Cond.getOperand(1)) &&
+                   isAllOnes(Cond.getOperand(2)) &&
+                   isAllOnes(Cond.getOperand(3)));
+
+        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
+            isAllOnes(Cond.getOperand(1)) &&
+            isZero(Cond.getOperand(2)) &&
+            isZero(Cond.getOperand(3))) {
+          CanFold = true;
+          std::swap(LHS, RHS);
+        }
+
+        if (CanFold) {
+          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
+          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
+          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
+          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
+                                                NewB, DAG);
+          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
+        }
+      }
+    }
+  }
+
   // If we know that this node is legal then we know that it is going to be
   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -17014,6 +18041,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     if (BitWidth == 1)
       return SDValue();

+    // Check all uses of that condition operand to check whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits being
+    // set properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -17059,7 +18095,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   SDValue Op2 = Cmp.getOperand(1);
   SDValue SetCC;
-  const ConstantSDNode* C = 0;
+  const ConstantSDNode* C = nullptr;
   bool needOppositeCond = (CC == X86::COND_E);
   bool checkAgainstTrue = false; // Is it a comparison against 1?
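
The movss/movsd folds above rest on those instructions' lane behaviour: the low lane comes from the second source and the rest from the first, which is exactly a <0,-1,...> selector. A hypothetical lane-level model (movssModel is illustrative, not an LLVM helper):

#include <array>
// movss A, B keeps A's upper lanes and takes lane 0 from B; this matches
// vselect <0,-1,-1,-1>, A, B (a 0 lane picks B, an all-ones lane picks A).
static std::array<float, 4> movssModel(const std::array<float, 4> &A,
                                       const std::array<float, 4> &B) {
  return {B[0], A[1], A[2], A[3]};
}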
@@ -17194,8 +18230,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
     SDValue Ops[] = { FalseOp, TrueOp,
                       DAG.getConstant(CC, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
-                       Ops, array_lengthof(Ops));
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   }

   // If this is a select between two integer constants, try to do some
@@ -17310,7 +18345,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   // the DCI.xxxx conditions are provided to postpone the optimization as
   // late as possible.

-  ConstantSDNode *CmpAgainst = 0;
+  ConstantSDNode *CmpAgainst = nullptr;
   if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
       (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
       !isa<ConstantSDNode>(Cond.getOperand(0))) {
@@ -17325,8 +18360,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
         CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
       SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                         DAG.getConstant(CC, MVT::i8), Cond };
-      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
-                         array_lengthof(Ops));
+      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
     }
   }
 }
@@ -17546,18 +18580,42 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
       if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
           (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
-        bool is64BitFP = (CMP00.getValueType() == MVT::f64);
-        X86ISD::NodeType NTOperator = is64BitFP ?
-          X86ISD::FSETCCsd : X86ISD::FSETCCss;
         // FIXME: need symbolic constants for these magic numbers.
         // See X86ATTInstPrinter.cpp:printSSECC().
         unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
-        SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+        if (Subtarget->hasAVX512()) {
+          SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
+                                       CMP01, DAG.getConstant(x86cc, MVT::i8));
+          if (N->getValueType(0) != MVT::i1)
+            return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+                               FSetCC);
+          return FSetCC;
+        }
+        SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+                                            CMP00.getValueType(), CMP00, CMP01,
                                             DAG.getConstant(x86cc, MVT::i8));
-        SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
-                                            OnesOrZeroesF);
-        SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
-                                    DAG.getConstant(1, MVT::i32));
+
+        bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+        MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+        if (is64BitFP && !Subtarget->is64Bit()) {
+          // On a 32-bit target, we cannot bitcast the 64-bit float to a
+          // 64-bit integer, since that's not a legal type. Since
+          // OnesOrZeroesF is all ones or all zeroes, we don't need all the
+          // bits, but can do this little dance to extract the lowest 32 bits
+          // and work with those going forward.
+          SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+                                         OnesOrZeroesF);
+          SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
+                                         Vector64);
+          OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+                                      Vector32, DAG.getIntPtrConstant(0));
+          IntVT = MVT::i32;
+        }
+
+        SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
+        SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+                                    DAG.getConstant(1, IntVT));
         SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
         return OneBitOfTruth;
       }
@@ -17653,7 +18711,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
                        N1->getOperand(0));
       SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
-      N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+      N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
     } else if (RHSTrunc) {
       N1 = N1->getOperand(0);
     }
@@ -17690,64 +18748,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
     if (R.getNode())
       return R;

-  // Create BLSI, BLSR, and BZHI instructions
-  // BLSI is X & (-X)
-  // BLSR is X & (X-1)
-  // BZHI is X & ((1 << Y) - 1)
+  // Create BEXTR instructions
   // BEXTR is ((X >> imm) & (2**size-1))
   if (VT == MVT::i32 || VT == MVT::i64) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     SDLoc DL(N);

-    if (Subtarget->hasBMI()) {
-      // Check LHS for neg
-      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
-          isZero(N0.getOperand(0)))
-        return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
-      // Check RHS for neg
-      if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
-          isZero(N1.getOperand(0)))
-        return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
-
-      // Check LHS for X-1
-      if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
-          isAllOnes(N0.getOperand(1)))
-        return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
-
-      // Check RHS for X-1
-      if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
-          isAllOnes(N1.getOperand(1)))
-        return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
-    }
-
-    if (Subtarget->hasBMI2()) {
-      // Check for (and (add (shl 1, Y), -1), X)
-      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
-        SDValue N00 = N0.getOperand(0);
-        if (N00.getOpcode() == ISD::SHL) {
-          SDValue N001 = N00.getOperand(1);
-          assert(N001.getValueType() == MVT::i8 && "unexpected type");
-          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
-          if (C && C->getZExtValue() == 1)
-            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
-        }
-      }
-
-      // Check for (and X, (add (shl 1, Y), -1))
-      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
-        SDValue N10 = N1.getOperand(0);
-        if (N10.getOpcode() == ISD::SHL) {
-          SDValue N101 = N10.getOperand(1);
-          assert(N101.getValueType() == MVT::i8 && "unexpected type");
-          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
-          if (C && C->getZExtValue() == 1)
-            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
-        }
-      }
-    }
-
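
The BLSI/BLSR/BZHI combines deleted above (after this change they are matched elsewhere) rest on the bit tricks named in the removed comments; as a plain scalar sketch:

static unsigned blsi(unsigned X) { return X & (0u - X); } // isolate lowest set bit
static unsigned blsr(unsigned X) { return X & (X - 1u); } // clear lowest set bit
static unsigned bzhi(unsigned X, unsigned Y) {            // keep low Y bits (Y < 32)
  return X & ((1u << Y) - 1u);
}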
     // Check for BEXTR.
     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -17797,7 +18804,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
-  EVT VT = N->getValueType(0);
   if (DCI.isBeforeLegalizeOps())
     return SDValue();

@@ -17807,6 +18813,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,

   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);

   // look for psign/blend
   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
@@ -17895,12 +18902,12 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
     MachineFunction &MF = DAG.getMachineFunction();
     bool OptForSize = MF.getFunction()->getAttributes().
       hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
-
-    // SHLD/SHRD instructions have lower register pressure, but on some
-    // platforms they have higher latency than the equivalent
-    // series of shifts/or that would otherwise be generated.
+
+    // SHLD/SHRD instructions have lower register pressure, but on some
+    // platforms they have higher latency than the equivalent
+    // series of shifts/or that would otherwise be generated.
     // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
-    // have higer latencies and we are not optimizing for size.
+    // have higher latencies and we are not optimizing for size.
     if (!OptForSize && Subtarget->isSHLDSlow())
       return SDValue();

@@ -17987,8 +18994,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
     SDValue Ops[] = { N0.getOperand(0), Neg,
                       DAG.getConstant(X86::COND_GE, MVT::i8),
                       SDValue(Neg.getNode(), 1) };
-    return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
-                       Ops, array_lengthof(Ops));
+    return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
   }
   return SDValue();
 }
@@ -17997,7 +19003,6 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
-  EVT VT = N->getValueType(0);
   if (DCI.isBeforeLegalizeOps())
     return SDValue();

@@ -18007,28 +19012,6 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
       return RV;
   }

-  // Try forming BMI if it is available.
-  if (!Subtarget->hasBMI())
-    return SDValue();
-
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
-
-  // Create BLSMSK instructions by finding X ^ (X-1)
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
-
-  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
-      isAllOnes(N0.getOperand(1)))
-    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
-
-  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
-      isAllOnes(N1.getOperand(1)))
-    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
-
   return SDValue();
 }

@@ -18168,8 +19151,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   }

-  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
-                           Chains.size());
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

   // Bitcast the loaded value to a vector of the original element type, in
   // the size of the target vector type.
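
performIntegerAbsCombine above matches the classic branchless absolute value, xor(add(X, Y), Y) with Y = sra(X, width-1), and re-emits it as neg+cmov. The identity in scalar form, assuming an arithmetic right shift (as on x86) and X != INT_MIN:

static int absViaShiftXor(int X) {
  int M = X >> 31;    // 0 for X >= 0, -1 for X < 0 (arithmetic shift assumed)
  return (X + M) ^ M; // == |X| in two's complement
}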
@@ -18344,8 +19326,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       Chains.push_back(Ch);
     }

-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
-                       Chains.size());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   }

   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
@@ -18368,7 +19349,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
       St->getChain().hasOneUse() && !St->isVolatile()) {
     SDNode* LdVal = St->getValue().getNode();
-    LoadSDNode *Ld = 0;
+    LoadSDNode *Ld = nullptr;
     int TokenFactorIndex = -1;
     SmallVector<SDValue, 8> Ops;
     SDNode* ChainVal = St->getChain().getNode();
@@ -18411,8 +19392,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       SDValue NewChain = NewLd.getValue(1);
       if (TokenFactorIndex != -1) {
         Ops.push_back(NewChain);
-        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                               Ops.size());
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
       }
       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                           St->getPointerInfo(),
@@ -18439,8 +19419,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     if (TokenFactorIndex != -1) {
       Ops.push_back(LoLd);
       Ops.push_back(HiLd);
-      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                             Ops.size());
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
     }

     LoAddr = St->getBasePtr();
@@ -18832,6 +19811,17 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
     }
   }

+  if (N0.getOpcode() == ISD::TRUNCATE &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, VT));
+    }
+  }
   if (VT.is256BitVector()) {
     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
     if (R.getNode())
@@ -18843,10 +19833,13 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,

 // Optimize x == -y --> x+y == 0
 //          x != -y --> x+y != 0
-static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget* Subtarget) {
   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
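
The header comment above states the rewrite this combine starts from: comparing x against -y is exactly the same as comparing x + y against zero in two's-complement/modular arithmetic, which avoids materializing the negation. A scalar sketch:

static bool eqNeg(unsigned x, unsigned y) { return x == 0u - y; } // before
static bool eqSum(unsigned x, unsigned y) { return x + y == 0u; } // after; identical result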

   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
@@ -18864,17 +19857,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
       return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
                           DAG.getConstant(0, addV.getValueType()), CC);
     }
+
+  if (VT.getScalarType() == MVT::i1) {
+    bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+      (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+    bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
+
+    if (!IsSEXT0 && !IsVZero0)
+      return SDValue();
+
+    bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
+      (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!IsSEXT1 && !IsVZero1)
+      return SDValue();
+
+    if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() && "Unexpected operand type");
+      if (CC == ISD::SETEQ)
+        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+      return LHS.getOperand(0);
+    }
+    if (IsSEXT1 && IsVZero0) {
+      assert(VT == RHS.getOperand(0).getValueType() && "Unexpected operand type");
+      if (CC == ISD::SETEQ)
+        return DAG.getNOT(DL, RHS.getOperand(0), VT);
+      return RHS.getOperand(0);
+    }
+  }
+
   return SDValue();
 }

 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
-  return DAG.getNode(ISD::AND, DL, MVT::i8,
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+                               MVT VT) {
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                   DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+                       DAG.getConstant(1, VT));
+  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
-                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
-                     DAG.getConstant(1, MVT::i8));
+                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
 }

 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
@@ -18899,7 +19926,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                    EFLAGS.getNode()->getVTList(),
                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
-      return MaterializeSETB(DL, NewEFLAGS, DAG);
+      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
     }
   }

@@ -18907,7 +19934,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   // cases.
   if (CC == X86::COND_B)
-    return MaterializeSETB(DL, EFLAGS, DAG);
+    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));

   SDValue Flags;

@@ -19142,7 +20169,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND_INREG:
     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
-  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
@@ -19435,7 +20462,7 @@ TargetLowering::ConstraintWeight
   Value *CallOperandVal = info.CallOperandVal;
     // If we don't have a value, we can't do a match,
     // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
   // Look at the constraint type.
@@ -19553,7 +20580,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result;

   // Only support length 1 constraints for now.
   if (Constraint.length() > 1) return;
@@ -19636,7 +20663,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,

     // If we are in non-pic codegen mode, we allow the address of a global (with
     // an optional displacement) to be used with 'i'.
-    GlobalAddressSDNode *GA = 0;
+    GlobalAddressSDNode *GA = nullptr;
     int64_t Offset = 0;

     // Match either (GA), (GA+C), (GA+C1+C2), etc.
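
MaterializeSETB above prefers sbb because, after a compare leaves the carry flag set, sbb %reg,%reg computes reg - reg - CF and so yields a full-width 0 or all-ones value with no zero-extension, unlike setb's single 0/1 byte. Modeled as scalars:

static int sbbSameReg(bool CF) { return CF ? -1 : 0; } // sbb %eax,%eax
static int setbByte(bool CF)   { return CF ?  1 : 0; } // setb %al (then zext)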
@@ -19792,7 +20819,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

   // Not found as a standard register?
-  if (Res.second == 0) {
+  if (!Res.second) {
     // Map st(0) -> st(7) -> ST0
     if (Constraint.size() == 7 && Constraint[0] == '{' &&
         tolower(Constraint[1]) == 's' &&
@@ -19917,3 +20944,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,

   return Res;
 }
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                            Type *Ty) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having fewer micro-operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
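
Reading the rule at the bottom of getScalingFactorCost as a standalone sketch (illustrative only; the real hook consults isLegalAddressingMode and the full AddrMode):

// -1: unsupported mode; 0: plain (%rsi); 1: any scaled-index form (%rsi,%rdx,S).
static int scalingFactorCostSketch(bool LegalMode, int Scale) {
  if (!LegalMode)
    return -1;
  return Scale != 0;
}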