//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
-#include "X86.h"
+#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <cctype>
using namespace llvm;
+#define DEBUG_TYPE "x86-isel"
+
STATISTIC(NumTailCalls, "Number of tail calls");
// Forward declarations.
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+ makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+ ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
return Result;
-
+
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
bool is64Bit = Subtarget->is64Bit();
- if (Subtarget->isTargetEnvMacho()) {
+ if (Subtarget->isTargetMacho()) {
if (is64Bit)
return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
return new X86LinuxTargetObjectFile();
if (Subtarget->isTargetELF())
return new TargetLoweringObjectFileELF();
- if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ if (Subtarget->isTargetKnownWindowsMSVC())
+ return new X86WindowsTargetObjectFile();
+ if (Subtarget->isTargetCOFF())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
}
addBypassSlowDiv(64, 16);
}
- if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
+ if (Subtarget->isTargetKnownWindowsMSVC()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
// The _ftol2 runtime function has an unusual calling conv, which
// is modeled by a special pseudo-instruction.
- setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
- setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
- setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
- setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
}
if (Subtarget->isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
- } else if (Subtarget->isTargetMingw()) {
+ } else if (Subtarget->isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but longjmp!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
- setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ if (!Subtarget->hasMOVBE())
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Custom);
- else if (TM.Options.EnableSegmentedStacks)
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Custom);
- else
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v4i32, Legal);
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
- setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
setOperationAction(ISD::FABS, MVT::v4f64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
-
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
-
+ // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+ // even though v8i16 is a legal type.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::SRA, MVT::v16i16, Custom);
setOperationAction(ISD::SRA, MVT::v32i8, Custom);
- setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
-
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Legal);
setOperationAction(ISD::MUL, MVT::v16i16, Legal);
// Don't lower v32i8 because there is no 128-bit byte mul
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
- setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Custom);
+ setOperationAction(ISD::XOR, MVT::i1, Legal);
+ setOperationAction(ISD::OR, MVT::i1, Legal);
+ setOperationAction(ISD::AND, MVT::i1, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
- setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
-
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+ }
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
- setOperationAction(ISD::TRUNCATE, MVT::i1, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::AND, MVT::v8i64, Legal);
setOperationAction(ISD::OR, MVT::v8i64, Legal);
setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+ setOperationAction(ISD::AND, MVT::v16i32, Legal);
+ setOperationAction(ISD::OR, MVT::v16i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v16i32, Legal);
// Custom lower several nodes.
for (int i = MVT::FIRST_VECTOR_VALUETYPE;
if (!VT.is512BitVector())
continue;
- if (VT != MVT::v8i64) {
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v8i64);
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v8i64);
- }
if ( EltSize >= 32) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
if (!VT.is512BitVector())
continue;
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v8i64);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, 0);
- setLibcallName(RTLIB::SRL_I128, 0);
- setLibcallName(RTLIB::SRA_I128, 0);
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
}
// Combine sin / cos into one node or libcall if possible.
}
}
+ if (Subtarget->isTargetWin64()) {
+ setOperationAction(ISD::SDIV, MVT::i128, Custom);
+ setOperationAction(ISD::UDIV, MVT::i128, Custom);
+ setOperationAction(ISD::SREM, MVT::i128, Custom);
+ setOperationAction(ISD::UREM, MVT::i128, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+ }
+
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
}
EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
- if (!VT.isVector()) return MVT::i8;
+ if (!VT.isVector())
+ return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
+
+ if (Subtarget->hasAVX512())
+ switch(VT.getVectorNumElements()) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ }
+
return VT.changeVectorElementTypeToInteger();
}
}
bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned,
+ bool *Fast) const {
if (Fast)
*Fast = Subtarget->isUnalignedMemAccessFast();
return true;
// FIXME: Why this routine is here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(MVT VT) const{
- const TargetRegisterClass *RRC = 0;
+ const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return true;
}
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ assert(SrcAS != DestAS && "Expected different address spaces!");
+
+ return SrcAS < 256 && DestAS < 256;
+}
+
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+ return ScratchRegs;
+}
+
SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+ assert(VA.getLocInfo() != CCValAssign::FPExt &&
+ "Unexpected FP-extend for return value.");
+
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(X86ISD::RET_FLAG, dl,
- MVT::Other, &RetOps[0], RetOps.size());
+ return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget->is64Bit();
- bool IsWindows = Subtarget->isTargetWindows();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::i1)
+ RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
// Save the argument into a virtual register so that we can access it
// from the return points.
if (MF.getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
// FIXME: We should really autogenerate these arrays
- static const uint16_t GPR64ArgRegsWin64[] = {
+ static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
- static const uint16_t GPR64ArgRegs64Bit[] = {
+ static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
- static const uint16_t XMMArgRegs64Bit[] = {
+ static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- const uint16_t *GPR64ArgRegs;
+ const MCPhysReg *GPR64ArgRegs;
unsigned NumXMMRegs = 0;
if (IsWin64) {
SaveXMMOps.push_back(Val);
}
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other,
- &SaveXMMOps[0], SaveXMMOps.size()));
+ MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
}
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ !Subtarget->getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
- SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
- unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize,
+ int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- bool IsWindows = Subtarget->isTargetWindows();
StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
if (MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
- if (isTailCall) {
+ bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
- if (isTailCall && !IsSibcall) {
+ if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
X86Info->setTCReturnAddrDelta(FPDiff);
}
+ unsigned NumBytesToPush = NumBytes;
+ unsigned NumBytesToPop = NumBytes;
+
+ // If we have an inalloca argument, all stack space has already been allocated
+ // for us and be right at the top of the stack. We don't support multiple
+ // arguments passed in memory when using inalloca.
+ if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+ NumBytesToPush = 0;
+ assert(ArgLocs.back().getLocMemOffset() == 0 &&
+ "an inalloca argument must be the only memory argument");
+ }
+
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(
+ Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // Skip inalloca arguments, they have already been written.
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (Flags.isInAlloca())
+ continue;
+
CCValAssign &VA = ArgLocs[i];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
bool isByVal = Flags.isByVal();
// Promote the value if needed.
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
- if (StackPtr.getNode() == 0)
+ if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
}
if (!MemOpChains.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains[0], MemOpChains.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
- static const uint16_t XMMArgRegs[] = {
+ static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- // For tail calls lower the arguments to the 'real' stack slot.
- if (isTailCall) {
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.isRegLoc())
- continue;
- assert(VA.isMemLoc());
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- // Create frame index.
- int32_t Offset = VA.getLocMemOffset()+FPDiff;
- uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
- FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
-
- if (Flags.isByVal()) {
- // Copy relative to framepointer.
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
- if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
- MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
- ArgChain,
- Flags, DAG, dl));
- } else {
- // Store relative to framepointer.
- MemOpChains2.push_back(
- DAG.getStore(ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
- }
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
}
}
if (!MemOpChains2.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains2[0], MemOpChains2.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
const GlobalValue *GV = G->getGlobal();
- if (!GV->hasDLLImportLinkage()) {
+ if (!GV->hasDLLImportStorageClass()) {
unsigned char OpFlags = 0;
bool ExtraLoad = false;
unsigned WrapperKind = ISD::DELETED_NODE;
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall) {
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, true),
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
- return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
- Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
- unsigned NumBytesForCalleeToPush;
+ unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
getTargetMachine().Options.GuaranteedTailCallOpt))
- NumBytesForCalleeToPush = NumBytes; // Callee pops everything
- else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ !Subtarget->getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
- NumBytesForCalleeToPush = 4;
+ NumBytesForCalleeToPop = 4;
else
- NumBytesForCalleeToPush = 0; // Callee pops nothing.
+ NumBytesForCalleeToPop = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ DAG.getIntPtrConstant(NumBytesToPop, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
if (isCalleeStructRet || isCallerStructRet)
return false;
- // An stdcall caller is expected to clean up its arguments; the callee
- // isn't going to do that.
- if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
+ // An stdcall/thiscall caller is expected to clean up its arguments; the
+ // callee isn't going to do that.
+ // FIXME: this is more restrictive than needed. We could produce a tailcall
+ // when the stack adjustment matches. For example, with a thiscall that takes
+ // only one argument.
+ if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
+ CallerCC == CallingConv::X86_ThisCall))
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
}
}
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+ switch (X86CC) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case X86::COND_E: return true;
+ case X86::COND_G: return false;
+ case X86::COND_GE: return false;
+ case X86::COND_L: return false;
+ case X86::COND_LE: return false;
+ case X86::COND_NE: return true;
+ case X86::COND_B: return true;
+ case X86::COND_A: return true;
+ case X86::COND_BE: return true;
+ case X86::COND_AE: return true;
+ }
+ llvm_unreachable("covered switch fell through?!");
+}
+
/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
return false;
}
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0 || BitSize > 64)
+ return false;
+ return true;
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return false;
unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
// Do not handle 64-bit element shuffles with palignr.
/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
- bool Commuted = false) {
- if (!HasFp256 && VT.is256BitVector())
- return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
if (NumLaneElems != 2 && NumLaneElems != 4)
return false;
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ bool symetricMaskRequired =
+ (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
// VSHUFPSY divides the resulting vector into 4 chunks.
// The sources are also splitted into 4 chunks, and each destination
// chunk must come from a different source chunk.
//
// DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
//
+ SmallVector<int, 4> MaskVal(NumLaneElems, -1);
unsigned HalfLaneElems = NumLaneElems/2;
for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
for (unsigned i = 0; i != NumLaneElems; ++i) {
// For VSHUFPSY, the mask of the second half must be the same as the
// first but with the appropriate offsets. This works in the same way as
// VPERMILPS works with masks.
- if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ if (!symetricMaskRequired || Idx < 0)
continue;
- if (!isUndefOrEqual(Idx, Mask[i]+l))
+ if (MaskVal[i] < 0) {
+ MaskVal[i] = Idx - l;
+ continue;
+ }
+ if ((signed)(Idx - l) != MaskVal[i])
return false;
}
}
return true;
}
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+ // TODO: Deal with AVX's VINSERTPS
+ if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+ return false;
+
+ unsigned CorrectPosV1 = 0;
+ unsigned CorrectPosV2 = 0;
+ for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+ if (Mask[i] == i)
+ ++CorrectPosV1;
+ else if (Mask[i] == i + 4)
+ ++CorrectPosV2;
+
+ if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+ // We have 3 elements from one vector, and one from another.
+ return true;
+
+ return false;
+}
+
//
// Some special combinations that can be optimized.
//
static
SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
SDLoc dl(SVOp);
if (VT != MVT::v8i32 && VT != MVT::v8f32)
static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
bool HasInt256, bool V2IsSplat = false) {
- if (VT.is512BitVector())
- return false;
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unsupported vector type for unpckh");
+ assert(VT.getSizeInBits() >= 128 &&
+ "Unsupported vector type for unpckl");
+ // AVX defines UNPCK* to operate independently on 128-bit lanes.
+ unsigned NumLanes;
+ unsigned NumOf256BitLanes;
unsigned NumElts = VT.getVectorNumElements();
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ if (VT.is256BitVector()) {
+ if (NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
+ NumLanes = 2;
+ NumOf256BitLanes = 1;
+ } else if (VT.is512BitVector()) {
+ assert(VT.getScalarType().getSizeInBits() >= 32 &&
+ "Unsupported vector type for unpckh");
+ NumLanes = 2;
+ NumOf256BitLanes = 2;
+ } else {
+ NumLanes = 1;
+ NumOf256BitLanes = 1;
+ }
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
+ unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+ unsigned NumLaneElts = NumEltsInStride/NumLanes;
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (!isUndefOrEqual(BitI1, NumElts))
+ for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+ for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+ for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l256*NumEltsInStride+l+i];
+ int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+ if (!isUndefOrEqual(BitI, j+l256*NumElts))
return false;
- } else {
- if (!isUndefOrEqual(BitI1, j + NumElts))
+ if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+ return false;
+ if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
return false;
}
}
}
-
return true;
}
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
bool HasInt256, bool V2IsSplat = false) {
- unsigned NumElts = VT.getVectorNumElements();
-
- if (VT.is512BitVector())
- return false;
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert(VT.getSizeInBits() >= 128 &&
"Unsupported vector type for unpckh");
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ // AVX defines UNPCK* to operate independently on 128-bit lanes.
+ unsigned NumLanes;
+ unsigned NumOf256BitLanes;
+ unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is256BitVector()) {
+ if (NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
+ NumLanes = 2;
+ NumOf256BitLanes = 1;
+ } else if (VT.is512BitVector()) {
+ assert(VT.getScalarType().getSizeInBits() >= 32 &&
+ "Unsupported vector type for unpckh");
+ NumLanes = 2;
+ NumOf256BitLanes = 2;
+ } else {
+ NumLanes = 1;
+ NumOf256BitLanes = 1;
+ }
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
+ unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+ unsigned NumLaneElts = NumEltsInStride/NumLanes;
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (isUndefOrEqual(BitI1, NumElts))
+ for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+ for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+ for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l256*NumEltsInStride+l+i];
+ int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+ if (!isUndefOrEqual(BitI, j+l256*NumElts))
return false;
- } else {
- if (!isUndefOrEqual(BitI1, j+NumElts))
+ if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+ return false;
+ if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
return false;
}
}
return true;
}
+// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
+// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+ if (!VT.is512BitVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfSize = NumElts/2;
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+ *Imm = 1;
+ return true;
+ }
+ }
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+ *Imm = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from the
/// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
if (!HasFp256 || !VT.is256BitVector())
return false;
/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned HalfSize = VT.getVectorNumElements()/2;
}
// Symetric in-lane mask. Each lane has 4 elements (for imm8)
-static bool isPermImmMask(ArrayRef<int> Mask, EVT VT, unsigned& Imm8) {
+static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
if (EltSize < 32)
return false;
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
- if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (VT.getSizeInBits() < 256 || EltSize < 32)
return false;
-
+ bool symetricMaskRequired = (EltSize == 32);
unsigned NumElts = VT.getVectorNumElements();
- // Only match 256-bit with 32/64-bit types
- if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
- return false;
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned LaneSize = NumElts/NumLanes;
+ // 2 or 4 elements in one lane
+
+ SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
for (unsigned l = 0; l != NumElts; l += LaneSize) {
for (unsigned i = 0; i != LaneSize; ++i) {
if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
- if (NumElts != 8 || l == 0)
- continue;
- // VPERMILPS handling
- if (Mask[i] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
- return false;
+ if (symetricMaskRequired) {
+ if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+ ExpectedMaskVal[i] = Mask[i+l] - l;
+ continue;
+ }
+ if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+ return false;
+ }
}
}
-
return true;
}
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
bool Result = (Index * ElSize) % vecWidth == 0;
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
bool Result = (Index * ElSize) % vecWidth == 0;
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert((VT.getSizeInBits() >= 128) &&
"Unsupported vector type for PSHUF/SHUFP");
// Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- assert((NumLaneElts == 2 || NumLaneElts == 4) &&
- "Only supports 2 or 4 elements per lane");
+ assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+ "Only supports 2, 4 or 8 elements per lane");
- unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+ unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
unsigned Mask = 0;
for (unsigned i = 0; i != NumElts; ++i) {
int Elt = N->getMaskElt(i);
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
"Unsupported vector type for PSHUFHW");
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
"Unsupported vector type for PSHUFHW");
/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
- unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+ MVT VT = SVOp->getSimpleValueType(0);
+ unsigned EltSize = VT.is512BitVector() ? 1 :
+ VT.getVectorElementType().getSizeInBits() >> 3;
unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
int Val = 0;
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+ MVT VecVT = N->getOperand(0).getSimpleValueType();
MVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
- MVT VecVT = N->getValueType(0).getSimpleVT();
+ MVT VecVT = N->getSimpleValueType(0);
MVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> MaskVec;
/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
return false;
N = N->getOperand(0).getNode();
if (Subtarget->hasInt256()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
}
+ } else if (VT.is512BitVector()) { // AVX-512
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+ } else if (VT.getScalarType() == MVT::i1) {
+ assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
llvm_unreachable("Unexpected vector type");
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
}
/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
}
/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target suported shuffles.
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
- EVT VT = V.getValueType();
+ MVT VT = V.getSimpleValueType();
int NumElems = VT.getVectorNumElements();
SDLoc dl(V);
/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
- MVT VT = V.getValueType().getSimpleVT();
+ MVT VT = V.getSimpleValueType();
SDLoc dl(V);
if (VT.is128BitVector()) {
/// PromoteSplat - Splat is promoted to target supported vector shuffles.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- MVT SrcVT = SV->getValueType(0).getSimpleVT();
+ MVT SrcVT = SV->getSimpleValueType(0);
SDValue V1 = SV->getOperand(0);
SDLoc dl(SV);
bool IsZero,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- MVT VT = V2.getValueType().getSimpleVT();
+ MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- MVT ShufVT = V.getValueType().getSimpleVT();
+ MVT ShufVT = V.getSimpleValueType();
unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
bool IsUnary;
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems =
- SVOp->getValueType(0).getSimpleVT().getVectorNumElements();
+ SVOp->getSimpleValueType(0).getVectorNumElements();
unsigned NumZeros = getNumOfConsecutiveZeros(
SVOp, NumElems, false /* check zeros from right */, DAG,
SVOp->getMaskElt(0));
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems =
- SVOp->getValueType(0).getSimpleVT().getVectorNumElements();
+ SVOp->getSimpleValueType(0).getVectorNumElements();
unsigned NumZeros = getNumOfConsecutiveZeros(
SVOp, NumElems, true /* check zeros from left */, DAG,
NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
// Although the logic below support any bitwidth size, there are no
// shift instructions which handle more than 128-bit vectors.
- if (!SVOp->getValueType(0).getSimpleVT().is128BitVector())
+ if (!SVOp->getSimpleValueType(0).is128BitVector())
return false;
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
return SDValue();
SDLoc dl(Op);
- SDValue V(0, 0);
+ SDValue V;
bool First = true;
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
}
if ((i & 1) != 0) {
- SDValue ThisElt(0, 0), LastElt(0, 0);
+ SDValue ThisElt, LastElt;
bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
if (LastIsNonZero) {
LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
return SDValue();
SDLoc dl(Op);
- SDValue V(0, 0);
+ SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
}
-SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
- SelectionDAG &DAG) const {
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load. And if
// the address is "base + cst" see if the cst can be "absorbed" into
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
- SDLoc &DL, SelectionDAG &DAG) {
+ SDLoc &DL, SelectionDAG &DAG,
+ bool isAfterLegalize) {
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
- LoadSDNode *LDBase = NULL;
+ LoadSDNode *LDBase = nullptr;
unsigned LastLoadedElt = -1U;
// For each element in the initializer, see if we've found a load or an undef.
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+
+ if (isAfterLegalize &&
+ !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
SDValue NewLd = SDValue();
+
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
- array_lengthof(Ops), MVT::i64,
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
if (!Subtarget->hasFp256())
return SDValue();
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
unsigned ScalarSize = CVT.getSizeInBits();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
- const Constant *C = 0;
+ const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
return SDValue();
}
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+ SDValue ExtIdx) {
+ int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+ if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+ return Idx;
+
+ // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+ // lowered this:
+ // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+ // to:
+ // (extract_vector_elt (vector_shuffle<2,u,u,u>
+ // (extract_subvector (v8f32 %vreg0), Constant<4>),
+ // undef)
+ // Constant<0>)
+ // In this case the vector is the extract_subvector expression and the index
+ // is 2, as specified by the shuffle.
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+ SDValue ShuffleVec = SVOp->getOperand(0);
+ MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+ assert(ShuffleVecVT.getVectorElementType() ==
+ ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+ int ShuffleIdx = SVOp->getMaskElt(Idx);
+ if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+ ExtractedFromVec = ShuffleVec;
+ return ShuffleIdx;
+ }
+ return Idx;
+}
+
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
// Skip if insert_vec_elt is not supported.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
+ return SDValue();
+ int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
- // Quit if non-constant index.
- if (!isa<ConstantSDNode>(ExtIdx))
- return SDValue();
-
- if (VecIn1.getNode() == 0)
+ if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
- if (VecIn2.getNode() == 0)
+ if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors to shuffle
return SDValue();
}
- unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
-
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
- if (VecIn1.getNode() == 0)
+ if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
bool AllContants = true;
uint64_t Immediate = 0;
+ int NonConstIdx = -1;
+ bool IsSplat = true;
+ unsigned NumNonConsts = 0;
+ unsigned NumConsts = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.getOpcode() == ISD::UNDEF)
continue;
if (!isa<ConstantSDNode>(In)) {
AllContants = false;
- break;
+ NonConstIdx = idx;
+ NumNonConsts++;
}
- if (cast<ConstantSDNode>(In)->getZExtValue())
+ else {
+ NumConsts++;
+ if (cast<ConstantSDNode>(In)->getZExtValue())
Immediate |= (1ULL << idx);
+ }
+ if (In != Op.getOperand(0))
+ IsSplat = false;
}
if (AllContants) {
DAG.getIntPtrConstant(0));
}
- if (!isSplatVector(Op.getNode()))
- llvm_unreachable("Unsupported predicate operation");
-
- SDValue In = Op.getOperand(0);
- SDValue EFLAGS, X86CC;
- if (In.getOpcode() == ISD::SETCC) {
- SDValue Op0 = In.getOperand(0);
- SDValue Op1 = In.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
- bool isFP = Op1.getValueType().isFloatingPoint();
- unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
-
- assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
-
- X86CC = DAG.getConstant(X86CCVal, MVT::i8);
- EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
- EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- } else if (In.getOpcode() == X86ISD::SETCC) {
- X86CC = In.getOperand(0);
- EFLAGS = In.getOperand(1);
- } else {
- // The algorithm:
- // Bit1 = In & 0x1
- // if (Bit1 != 0)
- // ZF = 0
- // else
- // ZF = 1
- // if (ZF == 0)
- // res = allOnes ### CMOVNE -1, %res
- // else
- // res = allZero
- MVT InVT = In.getValueType().getSimpleVT();
- SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
- EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
- X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- }
-
- if (VT == MVT::v16i1) {
- SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
- SDValue Cst0 = DAG.getConstant(0, MVT::i16);
- SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
- Cst0, Cst1, X86CC, EFLAGS);
- return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
- }
-
- if (VT == MVT::v8i1) {
- SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
- SDValue Cst0 = DAG.getConstant(0, MVT::i32);
- SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
- Cst0, Cst1, X86CC, EFLAGS);
- CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
- return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
- }
- llvm_unreachable("Unsupported predicate operation");
+ if (NumNonConsts == 1 && NonConstIdx != 0) {
+ SDValue DstVec;
+ if (NumConsts) {
+ SDValue VecAsImm = DAG.getConstant(Immediate,
+ MVT::getIntegerVT(VT.getSizeInBits()));
+ DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
+ }
+ else
+ DstVec = DAG.getUNDEF(VT);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ Op.getOperand(NonConstIdx),
+ DAG.getIntPtrConstant(NonConstIdx));
+ }
+ if (!IsSplat && (NonConstIdx != 0))
+ llvm_unreachable("Unsupported BUILD_VECTOR operation");
+ MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
+ SDValue Select;
+ if (IsSplat)
+ Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+ DAG.getConstant(-1, SelectVT),
+ DAG.getConstant(0, SelectVT));
+ else
+ Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+ DAG.getConstant((Immediate | 1), SelectVT),
+ DAG.getConstant(Immediate, SelectVT));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Select);
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
return Op;
- return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+ if (!VT.is512BitVector())
+ return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
}
SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.is256BitVector()) {
+ if (VT.is256BitVector() || VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.is256BitVector()) {
- SmallVector<SDValue, 32> V;
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> V;
for (unsigned i = 0; i != NumElems; ++i)
V.push_back(Op.getOperand(i));
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
- NumElems/2);
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[0], NumElems/2));
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[NumElems / 2], NumElems/2));
// Recreate the wider vector with the lower and upper part.
- return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ if (VT.is256BitVector())
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
V[i] = Op.getOperand(i);
// Check for elements which are consecutive loads.
- SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+ SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
if (LD.getNode())
return LD;
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
- MVT ResVT = Op.getValueType().getSimpleVT();
+ MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
if(ResVT.is256BitVector())
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ if (Op.getNumOperands() == 4) {
+ MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+ ResVT.getVectorNumElements()/2);
+ SDValue V3 = Op.getOperand(2);
+ SDValue V4 = Op.getOperand(3);
+ return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
+ Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
+ }
return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getNumOperands() == 2);
+ MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+ assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+ (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+ Op.getNumOperands() == 4)));
- // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
+ // AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
+
+ // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return SDValue();
+
if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
return SDValue();
if (!Subtarget->hasInt256() && VT == MVT::v16i16)
return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls on a different 128-bit lane than \p OutputIdx.
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+ unsigned OutputIdx) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
+
+/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
+/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+ SelectionDAG &DAG) {
+ MVT VT = V1.getSimpleValueType();
+ assert(VT.is128BitVector() || VT.is256BitVector());
+
+ MVT EltVT = VT.getVectorElementType();
+ unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 32> PshufbMask;
+ for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+ int InputIdx = MaskVals[OutputIdx];
+ unsigned InputByteIdx;
+
+ if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+ InputByteIdx = 0x80;
+ else {
+ // Cross lane is not allowed.
+ if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+ return SDValue();
+ InputByteIdx = InputIdx * EltSizeInBytes;
+ // Index is an byte offset within the 128-bit lane.
+ InputByteIdx &= 0xf;
+ }
+
+ for (unsigned j = 0; j < EltSizeInBytes; ++j) {
+ PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
+ if (InputByteIdx != 0x80)
+ ++InputByteIdx;
+ }
+ }
+
+ MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
+ if (ShufVT != VT)
+ V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
+ return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
+}
+
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
+ //
+ // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
+ // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
unsigned LoQuad[] = { 0, 0, 0, 0 };
unsigned HiQuad[] = { 0, 0, 0, 0 };
+ // Indices of quads used.
std::bitset<4> InputQuads;
for (unsigned i = 0; i < 8; ++i) {
unsigned *Quad = i < 4 ? LoQuad : HiQuad;
// For SSSE3, If all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
- // single pshufb instruction is necessary. If There are more than 2 input
+ // single pshufb instruction is necessary. If there are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
bool TwoInputs = V1Used && V2Used;
- for (unsigned i = 0; i != 8; ++i) {
- int EltIdx = MaskVals[i] * 2;
- int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
- int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
- pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
- }
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
- V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ V1 = getPSHUFB(MaskVals, V1, dl, DAG);
if (!TwoInputs)
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
- pshufbMask.clear();
- for (unsigned i = 0; i != 8; ++i) {
- int EltIdx = MaskVals[i] * 2;
- int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
- int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
- pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
- }
- V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
- V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ CommuteVectorShuffleMask(MaskVals, 8);
+ V2 = getPSHUFB(MaskVals, V2, dl, DAG);
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
}
return NewV;
}
+/// \brief v16i16 shuffles
+///
+/// FIXME: We only support generation of a single pshufb currently. We can
+/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
+/// well (e.g 2 x pshufb + 1 x por).
+static SDValue
+LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ SDLoc dl(SVOp);
+
+ if (V2.getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+ return getPSHUFB(MaskVals, V1, dl, DAG);
+}
+
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
}
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ MVT::v16i8, pshufbMask));
// As PSHUFB will zero elements with negative indices, it's safe to ignore
// the 2nd operand if it's undefined or zero.
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ MVT::v16i8, pshufbMask));
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
}
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
CommuteVectorShuffleMask(MaskVals, 32);
V1 = V2;
}
- SmallVector<SDValue, 32> pshufbMask;
- for (unsigned i = 0; i != 32; i++) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0 || EltIdx >= 32)
- EltIdx = 0x80;
- else {
- if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
- // Cross lane is not allowed.
- return SDValue();
- EltIdx &= 0xf;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- }
- return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v32i8, &pshufbMask[0], 32));
+ return getPSHUFB(MaskVals, V1, dl, DAG);
}
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
SDLoc dl(SVOp);
unsigned NumElems = VT.getVectorNumElements();
MVT NewVT;
/// getVZextMovL - Return a zero-extending vector move low node.
///
-static SDValue getVZextMovL(MVT VT, EVT OpVT,
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, SDLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
- LoadSDNode *LD = NULL;
+ LoadSDNode *LD = nullptr;
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
LD = dyn_cast<LoadSDNode>(SrcOp);
if (!LD) {
if (NewOp.getNode())
return NewOp;
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLaneElems = NumElems / 2;
}
// Construct the output using a BUILD_VECTOR.
- Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
- SVOps.size());
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
} else if (InputUsed[0] < 0) {
// No input vectors were used! The result is undefined.
Output[l] = DAG.getUNDEF(NVT);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
assert(VT.is128BitVector() && "Unsupported vector size");
static
SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Canonizalize to v2f64.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
"unsupported shuffle type");
SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
// Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
getShuffleSHUFImmediate(SVOp), DAG);
}
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+ SelectionDAG &DAG) {
+ // Generate an insertps instruction when inserting an f32 from memory onto a
+ // v4f32 or when copying a member from one v4f32 to another.
+ // We also use it for transferring i32 from one register to another,
+ // since it simply copies the same bits.
+ // If we're transfering an i32 from memory to a specific element in a
+ // register, we output a generic DAG that will match the PINSRD
+ // instruction.
+ // TODO: Optimize for AVX cases too (VINSERTPS)
+ MVT VT = SVOp->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ auto Mask = SVOp->getMask();
+ assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+ "unsupported vector type for insertps/pinsrd");
+
+ int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i < 4; });
+
+ SDValue From;
+ SDValue To;
+ unsigned DestIndex;
+ if (FromV1 == 1) {
+ From = V1;
+ To = V2;
+ DestIndex = std::find_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i < 4; }) -
+ Mask.begin();
+ } else {
+ From = V2;
+ To = V1;
+ DestIndex = std::find_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i >= 4; }) -
+ Mask.begin();
+ }
+
+ if (MayFoldLoad(From)) {
+ // Trivial case, when From comes from a load and is only used by the
+ // shuffle. Make it use insertps from the vector that we need from that
+ // load.
+ SDValue Addr = From.getOperand(1);
+ SDValue NewAddr =
+ DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+ DAG.getConstant(DestIndex * EVT.getStoreSize(),
+ Addr.getSimpleValueType()));
+
+ LoadSDNode *Load = cast<LoadSDNode>(From);
+ SDValue NewLoad =
+ DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Load->getMemOperand(), 0, EVT.getStoreSize()));
+
+ if (EVT == MVT::f32) {
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+ SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+ InsertpsMask);
+ } else { // EVT == MVT::i32
+ // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+ // instruction, to match the PINSRD instruction, which loads an i32 to a
+ // certain vector element.
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+ DAG.getConstant(DestIndex, MVT::i32));
+ }
+ }
+
+ // Vector-element-to-vector
+ unsigned SrcIndex = Mask[DestIndex] % 4;
+ SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
if (!Subtarget->hasSSE41())
return SDValue();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Only AVX2 support 256-bit vector integer extending.
if (!Subtarget->hasInt256() && VT.is256BitVector())
return SDValue();
}
- LLVMContext *Context = DAG.getContext();
unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
- EVT NeVT = EVT::getIntegerVT(*Context, NBits);
- EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+ MVT NeVT = MVT::getIntegerVT(NBits);
+ MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
return SDValue();
if (V1.getOpcode() == ISD::BITCAST &&
V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- V1.getOperand(0)
- .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+ V1.getOperand(0).getOperand(0)
+ .getSimpleValueType().getSizeInBits() == SignificantBits) {
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
ConstantSDNode *CIdx =
// selection to fold it. Otherwise, we will short the conversion sequence.
if (CIdx && CIdx->getZExtValue() == 0 &&
(!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
- if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+ MVT FullVT = V.getSimpleValueType();
+ MVT V1VT = V1.getSimpleValueType();
+ if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
// The "ext_vec_elt" node is wider than the result node.
// In this case we should extract subvector from V.
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
- unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
- EVT FullVT = V.getValueType();
- EVT SubVecVT = EVT::getVectorVT(*Context,
- FullVT.getVectorElementType(),
+ unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+ MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
FullVT.getVectorNumElements()/Ratio);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
DAG.getIntPtrConstant(0));
}
- V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+ V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
}
}
NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
- MVT NewVT = NewOp.getValueType().getSimpleVT();
+ MVT NewVT = NewOp.getSimpleValueType();
if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
NewVT, true, false))
return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
- MVT NewVT = NewOp.getValueType().getSimpleVT();
+ MVT NewVT = NewOp.getSimpleValueType();
if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
DAG, Subtarget, dl);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
unsigned NumElems = VT.getVectorNumElements();
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
- assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
+ // When we create a shuffle node we put the UNDEF node to second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return CommuteVectorShuffle(SVOp, DAG);
// Vector shuffle lowering takes 3 steps:
//
CommuteVectorShuffleMask(M, NumElems);
std::swap(V1, V2);
std::swap(V1IsSplat, V2IsSplat);
- Commuted = false;
if (isUNPCKLMask(M, VT, HasInt256))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
}
// Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+ if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
return CommuteVectorShuffle(SVOp, DAG);
// The checks below are all present in isShuffleMaskLegal, but they are
getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT, HasFp256))
+ if (isSHUFPMask(M, VT))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
// Handle VPERMILPS/D* permutations
- if (isVPERMILPMask(M, VT, HasFp256)) {
- if (HasInt256 && VT == MVT::v8i32)
+ if (isVPERMILPMask(M, VT)) {
+ if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
}
+ unsigned Idx;
+ if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+ return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+ Idx*(NumElems/2), DAG, dl);
+
// Handle VPERM2F128/VPERM2I128 permutations
if (isVPERM2X128Mask(M, VT, HasFp256))
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
if (BlendOp.getNode())
return BlendOp;
+ if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+ return getINSERTPS(SVOp, dl, DAG);
+
unsigned Imm8;
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
VT.is512BitVector()) {
- EVT MaskEltVT = EVT::getIntegerVT(*DAG.getContext(),
- VT.getVectorElementType().getSizeInBits());
- EVT MaskVectorVT =
- EVT::getVectorVT(*DAG.getContext(),MaskEltVT, NumElems);
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+ MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
SmallVector<SDValue, 16> permclMask;
for (unsigned i = 0; i != NumElems; ++i) {
permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
}
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
- &permclMask[0], NumElems);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
if (V2IsUndef)
// Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
return DAG.getNode(X86ISD::VPERMV, dl, VT,
DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
- return DAG.getNode(X86ISD::VPERMV3, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+ return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
}
//===--------------------------------------------------------------------===//
return NewOp;
}
+ if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
if (VT == MVT::v16i8) {
SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
if (NewOp.getNode())
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+ if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
return SDValue();
}
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDLoc dl(Vec);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ MVT EltVT = Op.getSimpleValueType();
+
+ assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+
+ // variable index can't be handled in mask registers,
+ // extend vector to VR512
+ if (!isa<ConstantSDNode>(Idx)) {
+ MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ ExtVT.getVectorElementType(), Ext, Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ unsigned MaxSift = rc->getSize()*8 - 1;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+ DAG.getIntPtrConstant(0));
+}
+
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
- if (!isa<ConstantSDNode>(Op.getOperand(1)))
- return SDValue();
-
SDValue Vec = Op.getOperand(0);
- MVT VecVT = Vec.getValueType().getSimpleVT();
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+
+ if (Op.getSimpleValueType() == MVT::i1)
+ return ExtractBitFromMaskVector(Op, DAG);
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ if (VecVT.is512BitVector() ||
+ (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+ VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+ MVT MaskEltVT =
+ MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+ MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+ MaskEltVT.getSizeInBits());
+
+ Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+ getZeroVector(MaskVT, Subtarget, DAG, dl),
+ Idx, DAG.getConstant(0, getPointerTy()));
+ SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+ Perm, DAG.getConstant(0, getPointerTy()));
+ }
+ return SDValue();
+ }
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
- SDValue Idx = Op.getOperand(1);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
- EVT EltVT = VecVT.getVectorElementType();
+ MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
return Res;
}
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
SDValue Vec = Op.getOperand(0);
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
- MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+ MVT VVT = Op.getOperand(0).getSimpleValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
- MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+ MVT VVT = Op.getOperand(0).getSimpleValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
}
static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
SDLoc dl(Op);
return SDValue();
}
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ // Non constant index. Extend source and destination,
+ // insert element and then truncate the result.
+ MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, MVT::i8));
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ unsigned MaxSift = rc->getSize()*8 - 1;
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(MaxSift, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+ DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+
+ if (EltVT == MVT::i1)
+ return InsertBitToMaskVector(Op, DAG);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
}
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
- LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
- MVT OpVT = Op.getValueType().getSimpleVT();
+ MVT OpVT = Op.getSimpleValueType();
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits()/128;
- EVT VT128 = EVT::getVectorVT(*Context,
- OpVT.getVectorElementType(),
+ MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
SDValue In = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- EVT ResVT = Op.getValueType();
- EVT InVT = In.getValueType();
+ MVT ResVT = Op.getSimpleValueType();
+ MVT InVT = In.getSimpleValueType();
if (Subtarget->hasFp256()) {
if (ResVT.is128BitVector() &&
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if ((Op.getNode()->getValueType(0).is256BitVector() ||
- Op.getNode()->getValueType(0).is512BitVector()) &&
- SubVec.getNode()->getValueType(0).is128BitVector() &&
+ if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
+ Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
+ SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
- if (Op.getNode()->getValueType(0).is512BitVector() &&
- SubVec.getNode()->getValueType(0).is256BitVector() &&
+ if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
+ SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
- return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
SDValue Base;
if (is64Bit) {
- Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
- SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- DAG.getIntPtrConstant(0),
- MachinePointerInfo(Ptr),
- false, false, false, 0);
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+ MachinePointerInfo(Ptr), false, false, false, 0);
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
- SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
- GA->getValueType(0),
- GA->getOffset(), OperandFlags);
+ SDValue TGA =
+ DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false,
- 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
}
// The address of the thread local variable is the add of the thread
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Args[] = { Chain, Offset };
- Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
Chain.getValue(1));
}
- if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
+ if (Subtarget->isTargetKnownWindowsMSVC() ||
+ Subtarget->isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
// Need to generate someting similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// If GV is an alias then use the aliasee for determining
// thread-localness.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
+ GV = GA->getAliasedGlobal();
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
: Type::getInt32PtrTy(*DAG.getContext(),
257));
- SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
- (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
- DAG.getExternalSymbol("_tls_array", getPointerTy()));
+ SDValue TlsArray =
+ Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58)
+ : (Subtarget->isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C)
+ : DAG.getExternalSymbol("_tls_array", getPointerTy()));
- SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
- MachinePointerInfo(Ptr),
- false, false, false, 0);
+ SDValue ThreadPointer =
+ DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+ MachinePointerInfo(Ptr), false, false, false, 0);
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
+ // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+ // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+ // during isel.
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits - 1, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, MVT::i8))
: DAG.getConstant(0, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
- Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
- Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
+ // If the shift amount is larger or equal than the width of a part we can't
+ // rely on the results of shld/shrd. Insert a test and select the appropriate
+ // values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, MVT::i8));
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
if (Op.getOpcode() == ISD::SHL_PARTS) {
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
} else {
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
}
SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+ return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- EVT SrcVT = Op.getOperand(0).getValueType();
+ MVT SrcVT = Op.getOperand(0).getSimpleValueType();
if (SrcVT.isVector())
return SDValue();
- assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+ assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
// These are really Legal; return the operand so the caller accepts it as
SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
X86ISD::FILD, DL,
- Tys, Ops, array_lengthof(Ops),
- SrcVT, MMO);
+ Tys, Ops, SrcVT, MMO);
if (useSSE) {
Chain = Result.getValue(1);
MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
- Ops, array_lengthof(Ops),
- Op.getValueType(), MMO);
+ Ops, Op.getValueType(), MMO);
Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
false, false, false, 0);
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
- EVT SVT = N0.getValueType();
+ MVT SVT = N0.getSimpleValueType();
SDLoc dl(Op);
assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
"Custom UINT_TO_FP is not supported!");
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- SVT.getVectorNumElements());
+ MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
- EVT SrcVT = N0.getValueType();
- EVT DstVT = Op.getValueType();
+ MVT SrcVT = N0.getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
- array_lengthof(Ops), MVT::i64, MMO);
+ MVT::i64, MMO);
APInt FF(32, 0x5F800000ULL);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
- array_lengthof(Ops), DstTy, MMO);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, array_lengthof(Ops), DstTy,
- MMO);
+ Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
} else {
SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
MVT::i32, eax.getValue(2));
SDValue Ops[] = { eax, edx };
SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
- : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+ : DAG.getMergeValues(Ops, DL);
return std::make_pair(pair, SDValue());
}
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
- MVT VT = Op->getValueType(0).getSimpleVT();
+ MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
- MVT InVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
// Optimize vectors in AVX mode:
// Concat upper and lower parts.
//
- if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+ if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+ ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
return SDValue();
if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
+ return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
SDValue Undef = DAG.getUNDEF(InVT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
-SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ SDLoc DL(Op);
+ unsigned int NumElts = VT.getVectorNumElements();
+ if (NumElts != 8 && NumElts != 16)
+ return SDValue();
+
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+ EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Now we have only mask extension
+ assert(InVT.getVectorElementType() == MVT::i1);
+ SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
+ const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+ SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+
+ SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+ if (VT.is512BitVector())
+ return Brcst;
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
if (Subtarget->hasFp256()) {
SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
if (Res.getNode())
return SDValue();
}
-SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
- MVT SVT = In.getValueType().getSimpleVT();
+ MVT SVT = In.getSimpleValueType();
+
+ if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+ return LowerZERO_EXTEND_AVX512(Op, DAG);
if (Subtarget->hasFp256()) {
SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
return Res;
}
- if (!VT.is256BitVector() || !SVT.is128BitVector() ||
- VT.getVectorNumElements() != SVT.getVectorNumElements())
- return SDValue();
-
- assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
-
- // AVX2 has better support of integer extending.
- if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
-
- SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
- static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
- SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
- DAG.getVectorShuffle(MVT::v8i16, DL, In,
- DAG.getUNDEF(MVT::v8i16),
- &Mask[0]));
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+ assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+ VT.getVectorNumElements() != SVT.getVectorNumElements());
+ return SDValue();
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
- MVT SVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
- if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+ if (VT == MVT::i1) {
+ assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+ "Invalid scalar TRUNCATE operation");
+ if (InVT == MVT::i32)
+ return SDValue();
+ if (InVT.getSizeInBits() == 64)
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
+ else if (InVT.getSizeInBits() < 32)
+ In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
+ }
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Invalid TRUNCATE operation");
+
+ if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
+ if (VT.getVectorElementType().getSizeInBits() >=8)
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+ unsigned NumElts = InVT.getVectorNumElements();
+ assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
+ if (InVT.getSizeInBits() < 512) {
+ MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+ InVT = ExtVT;
+ }
+
+ SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
+ const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+ SDValue CP = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+ SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+ SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
+ return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
+ }
+
+ if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget->hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
DAG.getIntPtrConstant(0));
}
- // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2));
-
OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
-
- // The PSHUFD mask:
- static const int ShufMask1[] = {0, 2, 0, 0};
- SDValue Undef = DAG.getUNDEF(VT);
- OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
-
- // The MOVLHPS mask:
- static const int ShufMask2[] = {0, 1, 4, 5};
- return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
+ static const int ShufMask[] = {0, 2, 4, 6};
+ return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
- if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+ if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomed PSHUFB.
if (Subtarget->hasInt256()) {
In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
for (unsigned j = 0; j < 8; ++j)
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
- &pshufbMask[0], 32);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
}
// Handle truncation of V256 to V128 using shuffles.
- if (!VT.is128BitVector() || !SVT.is256BitVector())
+ if (!VT.is128BitVector() || !InVT.is256BitVector())
return SDValue();
- assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
- "Invalid op");
assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- NumElems * 2);
+ MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
SmallVector<int, 16> MaskVec(NumElems * 2, -1);
// Prepare truncation shuffle mask
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
- if (VT.isVector()) {
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
- DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
- MVT::v8i32, Op.getOperand(0)));
- return SDValue();
- }
+ assert(!Op.getSimpleValueType().isVector());
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (FIST.getNode() == 0) return Op;
+ if (!FIST.getNode()) return Op;
if (StackSlot.getNode())
// Load the result.
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
- MVT SVT = In.getValueType().getSimpleVT();
+ MVT SVT = In.getSimpleValueType();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
In, DAG.getUNDEF(SVT)));
}
-SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
if (VT.isVector()) {
C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
APInt(32, ~(1U << 31))));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
-SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
if (VT.isVector()) {
C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
APInt(32, 1U << 31)));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, Alignment);
if (VT.isVector()) {
- MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::XOR, dl, XORVT,
DAG.getNode(ISD::BITCAST, dl, XORVT,
return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}
-SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Context = DAG.getContext();
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
- MVT SrcVT = Op1.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
+ MVT SrcVT = Op1.getSimpleValueType();
// If second operand is smaller, extend it first.
if (SrcVT.bitsLT(VT)) {
CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
}
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
}
C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
// Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, unsigned> VecInMap;
+ SmallVector<SDValue, 8> VecIns;
EVT VT = MVT::Other;
// Recognize a special case where a vector is casted into wide integer to
VT != VecInMap.begin()->first.getValueType())
return SDValue();
M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+ VecIns.push_back(ExtractedFromVec);
}
M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
}
"Not extracted from 128-/256-bit vector.");
unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
- SmallVector<SDValue, 8> VecIns;
for (DenseMap<SDValue, unsigned>::const_iterator
I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
// Quit if not all elements are used.
if (I->second != FullMask)
return SDValue();
- VecIns.push_back(I->first);
}
EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
VecIns.back(), VecIns.back());
}
+/// \brief return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+ for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+ // Look pass truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
+ if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+ return true;
+ }
+ return false;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- SDLoc dl(Op);
+ if (Op.getValueType() == MVT::i1)
+ // KORTEST instruction should be selected
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
// CF and OF aren't always set the way we want. Determine which
// of these we need.
NeedOF = true;
break;
}
-
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
- if (Op.getResNo() != 0 || NeedOF || NeedCF)
+ if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
+ //if (Op.getValueType() == MVT::i1)
+ // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
+ // DAG.getConstant(0, MVT::i1));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
-
+ }
unsigned Opcode = 0;
unsigned NumOperands = 0;
// Truncate operations may prevent the merge of the SETCC instruction
- // and the arithmetic intruction before it. Attempt to truncate the operands
+ // and the arithmetic instruction before it. Attempt to truncate the operands
// of the arithmetic instruction and use a reduced bit-width instruction.
bool NeedTruncation = false;
SDValue ArithOp = Op;
Opcode = X86ISD::ADD;
NumOperands = 2;
break;
- case ISD::AND: {
- // If the primary and result isn't used, don't bother using X86ISD::AND,
- // because a TEST instruction will be better.
- bool NonFlagUse = false;
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
- unsigned UOpNo = UI.getOperandNo();
- if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
- // Look pass truncate.
- UOpNo = User->use_begin().getOperandNo();
- User = *User->use_begin();
- }
-
- if (User->getOpcode() != ISD::BRCOND &&
- User->getOpcode() != ISD::SETCC &&
- !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
- NonFlagUse = true;
+ case ISD::SHL:
+ case ISD::SRL:
+ // If we have a constant logical shift that's only used in a comparison
+ // against zero turn it into an equivalent AND. This allows turning it into
+ // a TEST instruction later.
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned ShAmt = Op->getConstantOperandVal(1);
+ if (ShAmt >= BitWidth) // Avoid undefined shifts.
break;
- }
+ APInt Mask = ArithOp.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+ : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ break;
+ SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, VT));
+ DAG.ReplaceAllUsesWith(Op, New);
+ Op = New;
}
+ break;
- if (!NonFlagUse)
+ case ISD::AND:
+ // If the primary and result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op))
break;
- }
// FALL THROUGH
case ISD::SUB:
case ISD::OR:
for (unsigned i = 0; i != NumOperands; ++i)
Ops.push_back(Op.getOperand(i));
- SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- SelectionDAG &DAG) const {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+ SDLoc dl, SelectionDAG &DAG) const {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
if (C->getAPIntValue() == 0)
- return EmitTest(Op0, X86CC, DAG);
+ return EmitTest(Op0, X86CC, dl, DAG);
- SDLoc dl(Op0);
+ if (Op0.getValueType() == MVT::i1)
+ llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
+ }
+
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+ // Do the comparison at i32 if it's smaller, besides the Atom case.
+ // This avoids subregister aliasing issues. Keep the smaller reference
+ // if we're optimizing for size, however, as that'll allow better folding
+ // of memory operations.
+ if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+ !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize) &&
+ !Subtarget->isAtom()) {
+ unsigned ExtendOp =
+ isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+ }
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
-static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
- SDValue Cond;
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
Op.getValueType().getScalarType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- SDLoc dl(Op);
-
+ unsigned Opc = 0;
bool Unsigned = false;
+ bool Swap = false;
unsigned SSECC;
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: SSECC = 4; break;
- case ISD::SETEQ: SSECC = 0; break;
- case ISD::SETUGT: Unsigned = true;
- case ISD::SETGT: SSECC = 6; break; // NLE
- case ISD::SETULT: Unsigned = true;
- case ISD::SETLT: SSECC = 1; break;
- case ISD::SETUGE: Unsigned = true;
- case ISD::SETGE: SSECC = 5; break; // NLT
- case ISD::SETULE: Unsigned = true;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
+ case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
+ case ISD::SETLT: Swap = true; //fall-through
+ case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
+ case ISD::SETULT: SSECC = 1; Unsigned = true; break;
+ case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
+ case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
+ case ISD::SETULE: Unsigned = true; //fall-through
case ISD::SETLE: SSECC = 2; break;
}
- unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+
+ if (Swap)
+ std::swap(Op0, Op1);
+ if (Opc)
+ return DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, MVT::i8));
+}
+
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1. If non-trivial (for example because it's not constant)
+/// return an empty value.
+static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
+{
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+ if (!BV)
+ return SDValue();
+
+ MVT VT = Op1.getSimpleValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned n = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ULTOp1;
+
+ for (unsigned i = 0; i < n; ++i) {
+ ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
+ return SDValue();
+
+ // Avoid underflow.
+ APInt Val = Elt->getAPIntValue();
+ if (Val == 0)
+ return SDValue();
+
+ ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- SDValue Cond;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
+ bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
- MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
+ MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
if (Subtarget->hasAVX512()) {
if (Op1.getValueType().is512BitVector() ||
(MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
- return LowerIntVSETCC_AVX512(Op, DAG);
+ return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements.
// operations may be required for some comparisons.
unsigned Opc;
bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
-
+ bool Subus = false;
+
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ case ISD::SETLE: Opc = X86ISD::PCMPGT;
Invert = true; break;
case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT;
FlipSigns = true; break;
case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ case ISD::SETULE: Opc = X86ISD::PCMPGT;
FlipSigns = true; Invert = true; break;
}
-
+
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
bool hasMinMax =
(Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
|| (Subtarget->hasSSE2() && (VET == MVT::i8));
-
+
if (hasMinMax) {
switch (SetCCOpcode) {
default: break;
case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
}
-
+
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
}
-
+
+ bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ if (!MinMax && hasSubus) {
+ // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+ // Op0 u<= Op1:
+ // t = psubus Op0, Op1
+ // pcmpeq t, <0..0>
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // destructed as the destination so it can be hoisted out of a loop.
+ // Only do this pre-AVX since vpcmp* is no longer destructive.
+ if (Subtarget->hasAVX())
+ break;
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+ if (ULEOp1.getNode()) {
+ Op1 = ULEOp1;
+ Subus = true; Invert = false; Swap = false;
+ }
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
+ case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+ }
+
+ if (Subus) {
+ Opc = X86ISD::SUBUS;
+ FlipSigns = false;
+ }
+ }
+
if (Swap)
std::swap(Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
-
+
if (MinMax)
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+ if (Subus)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+
return Result;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+ assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+ && "SetCC type must be 8-bit or 1-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
bool Invert = (CC == ISD::SETNE) ^
cast<ConstantSDNode>(Op1)->isNullValue();
- if (!Invert) return Op0;
+ if (!Invert)
+ return Op0;
CCode = X86::GetOppositeBranchCondition(CCode);
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, MVT::i8),
+ Op0.getOperand(1));
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
}
}
+ if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
+ (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
+ }
- bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
if (X86CC == X86::COND_INVALID)
return SDValue();
- SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
}
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (SSECC != 8) {
- unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
- SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
+ if (Subtarget->hasAVX512()) {
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, MVT::i8));
+ return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
+ }
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, MVT::i8));
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
Res = DAG.getNOT(DL, Res, Res.getValueType());
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
- if (N2C == 0 || !N2C->isNullValue())
+ if (!N2C || !N2C->isNullValue())
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
}
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = { Op2, Op1, CC, Cond };
- return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
-SDValue X86TargetLowering::LowerSIGN_EXTEND_AVX512(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op->getValueType(0);
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
- EVT InVT = In.getValueType();
+ MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
- if (InVT.getVectorElementType().getSizeInBits() >=8 &&
- VT.getVectorElementType().getSizeInBits() >= 32)
+ unsigned int NumElts = VT.getVectorNumElements();
+ if (NumElts != 8 && NumElts != 16)
+ return SDValue();
+
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
- if (InVT.getVectorElementType() == MVT::i1) {
- unsigned int NumElts = InVT.getVectorNumElements();
- assert ((NumElts == 8 || NumElts == 16) &&
- "Unsupported SIGN_EXTEND operation");
- if (VT.getVectorElementType().getSizeInBits() >= 32) {
- Constant *C =
- ConstantInt::get(*DAG.getContext(),
- (NumElts == 8)? APInt(64, ~0ULL): APInt(32, ~0U));
- SDValue CP = DAG.getConstantPool(C, getPointerTy());
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- SDValue Ld = DAG.getLoad(VT.getScalarType(), dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
- return DAG.getNode(X86ISD::VBROADCASTM, dl, VT, In, Ld);
- }
- }
- return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+
+ MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
+ Constant *C = ConstantInt::get(*DAG.getContext(),
+ APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
+
+ SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+ SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+ if (VT.is512BitVector())
+ return Brcst;
+ return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
}
-SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
- MVT VT = Op->getValueType(0).getSimpleVT();
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
- MVT InVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_AVX512(Op, DAG);
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16))
+ (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i16 || InVT != MVT::v16i8))
return SDValue();
if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// Optimize vectors in AVX mode
// Sign extend v8i16 to v8i32 and
MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
VT.getVectorNumElements()/2);
- OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+ OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
+ OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
unsigned X86Opcode;
unsigned X86Cond;
SDVTList VTs;
+ // Keep this in sync with LowerXALUO, otherwise we might create redundant
+ // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
+ // X86ISD::INC).
switch (CondOpcode) {
case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
- case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::SADDO:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->isOne()) {
+ X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
- case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::SSUBO:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->isOne()) {
+ X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
default: llvm_unreachable("unexpected overflowing operator");
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
- getTargetMachine().Options.EnableSegmentedStacks) &&
- "This should be used only on Windows targets or when segmented stacks "
- "are being used");
- assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool SplitStack = MF.shouldSplitStack();
+ bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+ SplitStack;
SDLoc dl(Op);
+ if (!Lower) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDNode* Node = Op.getNode();
+
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp1 = SDValue(Node, 0);
+ SDValue Tmp2 = SDValue(Node, 1);
+ SDValue Tmp3 = Node->getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+ SDLoc(Node));
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+ const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ unsigned StackAlign = TFI.getStackAlignment();
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Align, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+ Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true), SDValue(),
+ SDLoc(Node));
+
+ SDValue Ops[2] = { Tmp1, Tmp2 };
+ return DAG.getMergeValues(Ops, dl);
+ }
+
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- // FIXME: Ensure alignment here
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- if (getTargetMachine().Options.EnableSegmentedStacks) {
- MachineFunction &MF = DAG.getMachineFunction();
+ if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
SDValue Ops1[2] = { Value, Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ return DAG.getMergeValues(Ops1, dl);
} else {
SDValue Flag;
unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- Flag = Chain.getValue(1);
const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
- Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
- SPTy).getValue(1);
+ unsigned SPReg = RegInfo->getStackRegister();
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+ Chain = SP.getValue(1);
+
+ if (Align) {
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+ }
- SDValue Ops1[2] = { Chain.getValue(0), Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ SDValue Ops1[2] = { SP, Chain };
+ return DAG.getMergeValues(Ops1, dl);
}
}
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, 16), false, false, 0);
MemOps.push_back(Store);
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- &MemOps[0], MemOps.size());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
InstOps.push_back(DAG.getConstant(Align, MVT::i32));
SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
- VTs, &InstOps[0], InstOps.size(),
- MVT::i64,
+ VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
/*Volatile=*/false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
+// getTargetVShiftByConstNode - Handle vector element shifts where the shift
+// amount is a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
+ SDValue SrcOp, uint64_t ShiftAmt,
+ SelectionDAG &DAG) {
+ MVT ElementType = VT.getVectorElementType();
+
+ // Check for ShiftAmt >= element width
+ if (ShiftAmt >= ElementType.getSizeInBits()) {
+ if (Opc == X86ISD::VSRAI)
+ ShiftAmt = ElementType.getSizeInBits() - 1;
+ else
+ return DAG.getConstant(0, VT);
+ }
+
+ assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+ && "Unknown target vector shift-by-constant node");
+
+ // Fold this packed vector shift into a build vector if SrcOp is a
+ // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
+ if (VT == SrcOp.getSimpleValueType() &&
+ ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = SrcOp->getNumOperands();
+ ConstantSDNode *ND;
+
+ switch(Opc) {
+ default: llvm_unreachable(nullptr);
+ case X86ISD::VSHLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRAI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+ }
+ break;
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ }
+
+ return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
+}
+
// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
- if (isa<ConstantSDNode>(ShAmt)) {
- // Constant may be a TargetConstant. Use a regular constant.
- uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
- switch (Opc) {
- default: llvm_unreachable("Unknown target vector shift node");
- case X86ISD::VSHLI:
- case X86ISD::VSRLI:
- case X86ISD::VSRAI:
- return DAG.getNode(Opc, dl, VT, SrcOp,
- DAG.getConstant(ShiftAmt, MVT::i32));
- }
- }
+ // Catch shift-by-constant.
+ if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+ return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+ CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version
switch (Opc) {
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, MVT::i32);
ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
// The return type has to be a 128-bit type with the same element
// type as the input type.
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pmulhu_w:
+ case Intrinsic::x86_avx2_pmulhu_w:
+ return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pmulh_w:
+ case Intrinsic::x86_avx2_pmulh_w:
+ return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// SSE2/AVX2 sub with unsigned saturation intrinsics
case Intrinsic::x86_sse2_psubus_b:
case Intrinsic::x86_sse2_psubus_w:
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
- // but second operand for node/intruction.
+ // but second operand for node/instruction.
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_kortestz_w:
+ case Intrinsic::x86_avx512_kortestc_w: {
+ unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
+ SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+ SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
// SSE/AVX shift intrinsics
case Intrinsic::x86_sse2_psll_w:
Opcode = X86ISD::VSRAI;
break;
}
- return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
+ return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
}
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8),
SDValue(PCMP.getNode(), 1));
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_fma_vfmadd_ps:
case Intrinsic::x86_fma_vfmadd_pd:
case Intrinsic::x86_fma_vfmaddsub_ps_256:
case Intrinsic::x86_fma_vfmaddsub_pd_256:
case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ case Intrinsic::x86_fma_vfmadd_ps_512:
+ case Intrinsic::x86_fma_vfmadd_pd_512:
+ case Intrinsic::x86_fma_vfmsub_ps_512:
+ case Intrinsic::x86_fma_vfmsub_pd_512:
+ case Intrinsic::x86_fma_vfnmadd_ps_512:
+ case Intrinsic::x86_fma_vfnmadd_pd_512:
+ case Intrinsic::x86_fma_vfnmsub_ps_512:
+ case Intrinsic::x86_fma_vfnmsub_pd_512:
+ case Intrinsic::x86_fma_vfmaddsub_ps_512:
+ case Intrinsic::x86_fma_vfmaddsub_pd_512:
+ case Intrinsic::x86_fma_vfmsubadd_ps_512:
+ case Intrinsic::x86_fma_vfmsubadd_pd_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_fma_vfmadd_pd:
case Intrinsic::x86_fma_vfmadd_ps_256:
case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_vfmadd_ps_512:
+ case Intrinsic::x86_fma_vfmadd_pd_512:
Opc = X86ISD::FMADD;
break;
case Intrinsic::x86_fma_vfmsub_ps:
case Intrinsic::x86_fma_vfmsub_pd:
case Intrinsic::x86_fma_vfmsub_ps_256:
case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_vfmsub_ps_512:
+ case Intrinsic::x86_fma_vfmsub_pd_512:
Opc = X86ISD::FMSUB;
break;
case Intrinsic::x86_fma_vfnmadd_ps:
case Intrinsic::x86_fma_vfnmadd_pd:
case Intrinsic::x86_fma_vfnmadd_ps_256:
case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_vfnmadd_ps_512:
+ case Intrinsic::x86_fma_vfnmadd_pd_512:
Opc = X86ISD::FNMADD;
break;
case Intrinsic::x86_fma_vfnmsub_ps:
case Intrinsic::x86_fma_vfnmsub_pd:
case Intrinsic::x86_fma_vfnmsub_ps_256:
case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_vfnmsub_ps_512:
+ case Intrinsic::x86_fma_vfnmsub_pd_512:
Opc = X86ISD::FNMSUB;
break;
case Intrinsic::x86_fma_vfmaddsub_ps:
case Intrinsic::x86_fma_vfmaddsub_pd:
case Intrinsic::x86_fma_vfmaddsub_ps_256:
case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_vfmaddsub_ps_512:
+ case Intrinsic::x86_fma_vfmaddsub_pd_512:
Opc = X86ISD::FMADDSUB;
break;
case Intrinsic::x86_fma_vfmsubadd_ps:
case Intrinsic::x86_fma_vfmsubadd_pd:
case Intrinsic::x86_fma_vfmsubadd_ps_256:
case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ case Intrinsic::x86_fma_vfmsubadd_ps_512:
+ case Intrinsic::x86_fma_vfmsubadd_pd_512:
Opc = X86ISD::FMSUBADD;
break;
}
}
}
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget * Subtarget) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
+static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget * Subtarget) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ if (Src.getOpcode() == ISD::UNDEF)
+ Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ return SDValue(Res, 1);
+}
+
+static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ return SDValue(Res, 1);
+}
+
+// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
+// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
+// also used to custom lower READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+ SDValue LO, HI;
+
+ // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+ // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+ // and the EAX register is loaded with the low-order 32 bits.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ SDValue Chain = HI.getValue(1);
+
+ if (Opcode == X86ISD::RDTSCP_DAG) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+ // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+ // the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+ HI.getValue(2));
+ // Explicitly store the content of ECX at the location passed in input
+ // to the 'rdtscp' intrinsic.
+ Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+ MachinePointerInfo(), false, false, 0);
+ }
+
+ if (Subtarget->is64Bit()) {
+ // The EDX register is loaded with the high-order 32 bits of the MSR, and
+ // the EAX register is loaded with the low-order 32 bits.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 2> Results;
+ SDLoc DL(Op);
+ getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, DL);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
- Ops, array_lengthof(Ops));
+ Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
-
+ //int_gather(index, base, scale);
+ case Intrinsic::x86_avx512_gather_qpd_512:
+ case Intrinsic::x86_avx512_gather_qps_512:
+ case Intrinsic::x86_avx512_gather_dpd_512:
+ case Intrinsic::x86_avx512_gather_qpi_512:
+ case Intrinsic::x86_avx512_gather_qpq_512:
+ case Intrinsic::x86_avx512_gather_dpq_512:
+ case Intrinsic::x86_avx512_gather_dps_512:
+ case Intrinsic::x86_avx512_gather_dpi_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
+ case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
+ case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Index = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Scale = Op.getOperand(4);
+ return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
+ }
+ //int_gather_mask(v1, mask, index, base, scale);
+ case Intrinsic::x86_avx512_gather_qps_mask_512:
+ case Intrinsic::x86_avx512_gather_qpd_mask_512:
+ case Intrinsic::x86_avx512_gather_dpd_mask_512:
+ case Intrinsic::x86_avx512_gather_dps_mask_512:
+ case Intrinsic::x86_avx512_gather_qpi_mask_512:
+ case Intrinsic::x86_avx512_gather_qpq_mask_512:
+ case Intrinsic::x86_avx512_gather_dpi_mask_512:
+ case Intrinsic::x86_avx512_gather_dpq_mask_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_gather_qps_mask_512:
+ Opc = X86::VGATHERQPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpd_mask_512:
+ Opc = X86::VGATHERQPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpd_mask_512:
+ Opc = X86::VGATHERDPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dps_mask_512:
+ Opc = X86::VGATHERDPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpi_mask_512:
+ Opc = X86::VPGATHERQDZrm; break;
+ case Intrinsic::x86_avx512_gather_qpq_mask_512:
+ Opc = X86::VPGATHERQQZrm; break;
+ case Intrinsic::x86_avx512_gather_dpi_mask_512:
+ Opc = X86::VPGATHERDDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpq_mask_512:
+ Opc = X86::VPGATHERDQZrm; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Base = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+ Subtarget);
+ }
+ //int_scatter(base, index, v1, scale);
+ case Intrinsic::x86_avx512_scatter_qpd_512:
+ case Intrinsic::x86_avx512_scatter_qps_512:
+ case Intrinsic::x86_avx512_scatter_dpd_512:
+ case Intrinsic::x86_avx512_scatter_qpi_512:
+ case Intrinsic::x86_avx512_scatter_qpq_512:
+ case Intrinsic::x86_avx512_scatter_dpq_512:
+ case Intrinsic::x86_avx512_scatter_dps_512:
+ case Intrinsic::x86_avx512_scatter_dpi_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_scatter_qpd_512:
+ Opc = X86::VSCATTERQPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qps_512:
+ Opc = X86::VSCATTERQPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpd_512:
+ Opc = X86::VSCATTERDPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_dps_512:
+ Opc = X86::VSCATTERDPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpi_512:
+ Opc = X86::VPSCATTERQDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpq_512:
+ Opc = X86::VPSCATTERQQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpq_512:
+ Opc = X86::VPSCATTERDQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpi_512:
+ Opc = X86::VPSCATTERDDZmr; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Base = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Src = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
+ }
+ //int_scatter_mask(base, mask, index, v1, scale);
+ case Intrinsic::x86_avx512_scatter_qps_mask_512:
+ case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+ case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+ case Intrinsic::x86_avx512_scatter_dps_mask_512:
+ case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+ case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+ case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+ case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+ Opc = X86::VSCATTERQPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qps_mask_512:
+ Opc = X86::VSCATTERQPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+ Opc = X86::VSCATTERDPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_dps_mask_512:
+ Opc = X86::VSCATTERDPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+ Opc = X86::VPSCATTERQDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+ Opc = X86::VPSCATTERQQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpq_mask_512:
+ Opc = X86::VPSCATTERDQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+ Opc = X86::VPSCATTERDDZmr; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Base = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Src = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ }
+ // Read Time Stamp Counter (RDTSC).
+ case Intrinsic::x86_rdtsc:
+ // Read Time Stamp Counter and Processor ID (RDTSCP).
+ case Intrinsic::x86_rdtscp: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_rdtsc:
+ Opc = X86ISD::RDTSC_DAG; break;
+ case Intrinsic::x86_rdtscp:
+ Opc = X86ISD::RDTSCP_DAG; break;
+ }
+ SmallVector<SDValue, 2> Results;
+ getReadTimeStampCounter(Op.getNode(), dl, Opc, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
+ }
// XTEST intrinsics.
case Intrinsic::x86_xtest: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
EVT PtrVT = getPointerTy();
return FrameAddr;
}
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("esp", X86::ESP)
+ .Case("rsp", X86::RSP)
+ .Default(0);
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo =
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
const TargetMachine &TM = MF.getTarget();
const TargetFrameLowering &TFI = *TM.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
- Ops, array_lengthof(Ops), MVT::i16,
- MMO);
+ Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
}
static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
// Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
}
static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
}
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
Op = Op.getOperand(0);
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
// ones, and then concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
}
static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getValueType().is256BitVector() &&
- Op.getValueType().isInteger() &&
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getValueType().is256BitVector() &&
- Op.getValueType().isInteger() &&
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget->hasInt256())
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
- assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
- "Only know how to lower V2I64/V4I64 multiply");
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+ "Only know how to lower V2I64/V4I64/V8I64 multiply");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
// AhiBlo = psllqi(AhiBlo, 32);
// return AloBlo + AloBhi + AhiBlo;
- SDValue ShAmt = DAG.getConstant(32, MVT::i32);
-
- SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
- SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
+ SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+ SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
// Bit cast to 32-bit vectors for MULUDQ
- EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+ (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
- AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
- AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
+ AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+ AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
-SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
- EVT EltTy = VT.getVectorElementType();
- unsigned NumElts = VT.getVectorNumElements();
- SDValue N0 = Op.getOperand(0);
+ assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+ "Unexpected return type for lowering");
+
+ RTLIB::Libcall LC;
+ bool isSigned;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
+ case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
+ case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
+ case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
+ case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
+ case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+ }
+
SDLoc dl(Op);
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Op->getOperand(i).getValueType();
+ assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+ "Unexpected argument type for lowering");
+ SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ Entry.Node = StackPtr;
+ InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
+ false, false, 16);
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Ty = PointerType::get(ArgTy,0);
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy());
- // Lower sdiv X, pow2-const.
- BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
- if (!C)
- return SDValue();
+ TargetLowering::CallLoweringInfo CLI(
+ InChain, static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+ isSigned, !isSigned, false, true, 0, getLibcallCallingConv(LC),
+ /*isTailCall=*/false,
+ /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG,
+ dl);
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
- APInt SplatValue, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
- HasAnyUndefs) ||
- EltTy.getSizeInBits() < SplatBitSize)
- return SDValue();
+ return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+}
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ EVT VT = Op0.getValueType();
+ SDLoc dl(Op);
- if ((SplatValue != 0) &&
- (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
- unsigned lg2 = SplatValue.countTrailingZeros();
- // Splat the sign bit.
- SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
- SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
- // Add (N0 < 0) ? abs2 - 1 : 0;
- SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
- SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
- SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
- SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
- SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
-
- // If we're dividing by a positive value, we're done. Otherwise, we must
- // negate the result.
- if (SplatValue.isNonNegative())
- return SRA;
-
- SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
- SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
- return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+ assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget->hasInt256()));
+
+ // Get the high parts.
+ const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
+ SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+ SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+
+ // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+ // ints.
+ MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+ bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+
+ // Shuffle it back into the right order.
+ const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
+ SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
+ SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+
+ // If we have a signed multiply but no PMULDQ fix up the high parts of a
+ // unsigned multiply.
+ if (IsSigned && !Subtarget->hasSSE41()) {
+ SDValue ShAmt =
+ DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
}
- return SDValue();
+
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
- (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+ (Subtarget->hasAVX512() &&
+ (VT == MVT::v8i64 || VT == MVT::v16i32))) {
if (Op.getOpcode() == ISD::SHL)
- return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+ DAG);
if (Op.getOpcode() == ISD::SRL)
- return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+ DAG);
if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
- return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+ DAG);
}
if (VT == MVT::v16i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+ MVT::v8i16, R, ShiftAmt,
+ DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
SmallVector<SDValue, 16> V(16,
DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+ MVT::v8i16, R, ShiftAmt,
+ DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
SmallVector<SDValue, 16> V(16,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+ MVT::v16i16, R, ShiftAmt,
+ DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
SmallVector<SDValue, 32> V(32,
DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+ MVT::v16i16, R, ShiftAmt,
+ DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
SmallVector<SDValue, 32> V(32,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
- unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
VT.getVectorNumElements();
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
for (unsigned i = 0; i != Ratio; ++i) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
- if (C == 0)
+ if (!C)
return SDValue();
// 6 == Log2(64)
ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
for (unsigned j = 0; j != Ratio; ++j) {
ConstantSDNode *C =
dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
- if (C == 0)
+ if (!C)
return SDValue();
// 6 == Log2(64)
ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
default:
llvm_unreachable("Unknown shift opcode!");
case ISD::SHL:
- return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+ DAG);
case ISD::SRL:
- return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+ DAG);
case ISD::SRA:
- return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+ DAG);
}
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget* Subtarget) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
- VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+ (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
BaseShAmt = InVec.getOperand(1);
}
}
- if (BaseShAmt.getNode() == 0)
+ if (!BaseShAmt.getNode())
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
DAG.getIntPtrConstant(0));
}
default:
llvm_unreachable("Unknown shift opcode!");
case ISD::SHL:
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v2i64:
case MVT::v4i32:
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRA:
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v4i32:
case MVT::v8i16:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRL:
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v2i64:
case MVT::v4i32:
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
}
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
if (!Subtarget->is64Bit() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+ (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
- unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
VT.getVectorNumElements();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
return SDValue();
}
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+ SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
if (V.getNode())
return V;
+ if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+ return Op;
// AVX2 has VPSLLV/VPSRAV/VPSRLV.
if (Subtarget->hasInt256()) {
if (Op.getOpcode() == ISD::SRL &&
return Op;
}
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ // Do this only if the vector shift count is a constant build_vector.
+ if (Op.getOpcode() == ISD::SHL &&
+ (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ EVT SVT = VT.getScalarType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ const APInt &One = APInt(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i=0; i !=NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+ }
+
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
+
+ // If possible, lower this shift as a sequence of two shifts by
+ // constant plus a MOVSS/MOVSD instead of scalarizing it.
+ // Example:
+ // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+ //
+ // Could be rewritten as:
+ // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+ //
+ // The advantage is that the two shifts from the example would be
+ // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+ // the vector shift into four scalar shifts plus four pairs of vector
+ // insert/extract.
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ unsigned TargetOpcode = X86ISD::MOVSS;
+ bool CanBeSimplified;
+ // The splat value for the first packed shift (the 'X' from the example).
+ SDValue Amt1 = Amt->getOperand(0);
+ // The splat value for the second packed shift (the 'Y' from the example).
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
+ Amt->getOperand(2);
+
+ // See if it is possible to replace this node with a sequence of
+ // two shifts followed by a MOVSS/MOVSD
+ if (VT == MVT::v4i32) {
+ // Check if it is legal to use a MOVSS.
+ CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+ Amt2 == Amt->getOperand(3);
+ if (!CanBeSimplified) {
+ // Otherwise, check if we can still simplify this node using a MOVSD.
+ CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+ Amt->getOperand(2) == Amt->getOperand(3);
+ TargetOpcode = X86ISD::MOVSD;
+ Amt2 = Amt->getOperand(2);
+ }
+ } else {
+ // Do similar checks for the case where the machine value type
+ // is MVT::v8i16.
+ CanBeSimplified = Amt1 == Amt->getOperand(1);
+ for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+ if (!CanBeSimplified) {
+ TargetOpcode = X86ISD::MOVSD;
+ CanBeSimplified = true;
+ Amt2 = Amt->getOperand(4);
+ for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt1 == Amt->getOperand(i);
+ for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+ CanBeSimplified = Amt2 == Amt->getOperand(j);
+ }
+ }
+
+ if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+ isa<ConstantSDNode>(Amt2)) {
+ // Replace this node with two shifts followed by a MOVSS/MOVSD.
+ EVT CastVT = MVT::v4i32;
+ SDValue Splat1 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
+ SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+ SDValue Splat2 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
+ SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+ if (TargetOpcode == X86ISD::MOVSD)
+ CastVT = MVT::v2i64;
+ SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
+ SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+ SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
+ BitCast1, DAG);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+ }
+
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
// r = VSELECT(r, psllw(r & (char16)15, 4), a);
SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
- M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
- DAG.getConstant(4, MVT::i32), DAG);
+ M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
M = DAG.getNode(ISD::BITCAST, dl, VT, M);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
// r = VSELECT(r, psllw(r & (char16)63, 2), a);
M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
- M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
- DAG.getConstant(2, MVT::i32), DAG);
+ M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
M = DAG.getNode(ISD::BITCAST, dl, VT, M);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
return R;
}
+ // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+ // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+ // solution better.
+ if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+ MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, NewVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
+ }
+
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
for (unsigned i = NumElems/2; i != NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt1Csts[0], NumElems/2);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt2Csts[0], NumElems/2);
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
} else {
// Variable shift amount
Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
if (!Subtarget->hasSSE2() || !VT.isVector())
return SDValue();
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
- SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v8i32:
case MVT::v16i16:
SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
EVT ExtraEltVT = ExtraVT.getVectorElementType();
// fall through
case MVT::v4i32:
case MVT::v8i16: {
- // (sext (vzext x)) -> (vsext x)
SDValue Op0 = Op.getOperand(0);
SDValue Op00 = Op0.getOperand(0);
SDValue Tmp1;
// Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
if (Op0.getOpcode() == ISD::BITCAST &&
- Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
+ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ // (sext (vzext x)) -> (vsext x)
Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
- if (Tmp1.getNode()) {
- SDValue Tmp1Op0 = Tmp1.getOperand(0);
- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
- "This optimization is invalid without a VZEXT.");
- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+ if (Tmp1.getNode()) {
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ // This folding is only valid when the in-reg type is a vector of i8,
+ // i16, or i32.
+ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+ ExtraEltVT == MVT::i32) {
+ SDValue Tmp1Op0 = Tmp1.getOperand(0);
+ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+ "This optimization is invalid without a VZEXT.");
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+ }
+ Op0 = Tmp1;
+ }
}
// If the above didn't work, then just use Shift-Left + Shift-Right.
- Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
- return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+ Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
+ DAG);
+ return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+ DAG);
}
}
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- EVT T = Op.getValueType();
+ MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
- switch(T.getSimpleVT().SimpleTy) {
+ switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
- Ops, array_lengthof(Ops), T, MMO);
+ Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
return cpOut;
}
-static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- assert(Subtarget->is64Bit() && "Result not type legalized?");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue TheChain = Op.getOperand(0);
- SDLoc dl(Op);
- SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
- SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
- SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
- rax.getValue(2));
- SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
- DAG.getConstant(32, MVT::i8));
- SDValue Ops[] = {
- DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
- rdx.getValue(1)
- };
- return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
-}
-
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
- EVT SrcVT = Op.getOperand(0).getValueType();
- EVT DstVT = Op.getValueType();
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
cast<AtomicSDNode>(Node)->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), negOp,
- cast<AtomicSDNode>(Node)->getSrcValue(),
- cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
cast<AtomicSDNode>(Node)->getOrdering(),
cast<AtomicSDNode>(Node)->getSynchScope());
}
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getValueType(0);
+ EVT VT = Op.getNode()->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
Op.getOperand(1), Op.getOperand(2));
}
-SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- ArgListTy Args;
- ArgListEntry Entry;
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
Type *RetTy = isF64
? (Type*)StructType::get(ArgTy, ArgTy, NULL)
CallingConv::C, /*isTaillCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed*/true,
Callee, Args, DAG, dl);
- std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
- case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
- case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
- case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+ case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
- case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
- case ISD::SHL: return LowerShift(Op, DAG);
+ case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
- case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
- case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
}
}
Node->getOperand(1), Zero, Zero,
cast<AtomicSDNode>(Node)->getMemOperand(),
cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
cast<AtomicSDNode>(Node)->getSynchScope());
Results.push_back(Swap.getValue(0));
Results.push_back(Swap.getValue(1));
SDValue Ops[] = { Chain, In1, In2L, In2H };
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
cast<MemSDNode>(Node)->getMemOperand());
SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
Results.push_back(Result.getValue(2));
}
case ISD::SUBE:
// We don't want to expand or promote these.
return;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
- if (FIST.getNode() != 0) {
+ if (FIST.getNode()) {
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
- if (StackSlot.getNode() != 0)
+ if (StackSlot.getNode())
Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
MachinePointerInfo(),
false, false, false, 0));
Results.push_back(V);
return;
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default : llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
+ case Intrinsic::x86_rdtsc:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdtscp:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ Results);
+ }
+ }
case ISD::READCYCLECOUNTER: {
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue TheChain = N->getOperand(0);
- SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
- SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
- rd.getValue(1));
- SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
- eax.getValue(2));
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { eax, edx };
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
- array_lengthof(Ops)));
- Results.push_back(edx.getValue(1));
- return;
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
}
case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0);
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
X86ISD::LCMPXCHG8_DAG;
- SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
- Ops, array_lengthof(Ops), T, MMO);
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(cpOutH.getValue(1));
return;
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
- default: return NULL;
+ default: return nullptr;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
case X86ISD::CMPMU: return "X86ISD::CMPMU";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
- case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
- case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
+ case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
- case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
case X86ISD::VSEXT: return "X86ISD::VSEXT";
+ case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
+ case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
+ case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BLSI: return "X86ISD::BLSI";
- case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
- case X86ISD::BLSR: return "X86ISD::BLSR";
+ case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
+ case X86ISD::TESTM: return "X86ISD::TESTM";
+ case X86ISD::TESTNM: return "X86ISD::TESTNM";
+ case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
+ case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
+ case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
+ case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
Reloc::Model R = getTargetMachine().getRelocationModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
- if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
return true;
}
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+ unsigned Bits = Ty->getScalarSizeInBits();
+
+ // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+ // particularly cheaper than those without.
+ if (Bits == 8)
+ return false;
+
+ // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
+ // variable shifts just as cheap as scalar ones.
+ if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+ return false;
+
+ // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+ // fully general vector.
+ return true;
+}
+
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
- isSHUFPMask(M, SVT, Subtarget->hasFp256()) ||
+ isSHUFPMask(M, SVT) ||
isPSHUFDMask(M, SVT) ||
isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
if (NumElts == 4 && SVT.is128BitVector()) {
return (isMOVLMask(Mask, SVT) ||
isCommutedMOVLMask(Mask, SVT, true) ||
- isSHUFPMask(Mask, SVT, Subtarget->hasFp256()) ||
- isSHUFPMask(Mask, SVT, Subtarget->hasFp256(), /* Commuted */ true));
+ isSHUFPMask(Mask, SVT) ||
+ isSHUFPMask(Mask, SVT, /* Commuted */ true));
}
return false;
}
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
- offsetMBB = NULL;
+ offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- thisMBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
MBB->addSuccessor(EndMBB);
}
+ // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+ // that was just emitted, but clearly shouldn't be "saved".
+ assert((MI->getNumOperands() <= 3 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
+ MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
+ && "Expected last argument to be EFLAGS");
unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
- for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+ for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO =
F->getMachineMemOperand(
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
// Scan forward through BB for a use/def of EFLAGS.
- MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+ MachineBasicBlock::iterator miI(std::next(SelectItr));
for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
const MachineInstr& mi = *miI;
if (mi.readsRegister(X86::EFLAGS))
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- assert(getTargetMachine().Options.EnableSegmentedStacks);
+ assert(MF->shouldSplitStack());
unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
- continueMBB->splice(continueMBB->begin(), BB, llvm::next
- (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->splice(continueMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
- assert(!Subtarget->isTargetEnvMacho());
+ assert(!Subtarget->isTargetMacho());
// The lowering is pretty easy: we're just emitting the call to _alloca. The
// non-trivial part is impdef of ESP.
}
} else {
const char *StackProbeSymbol =
- Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
+ Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
return MBB;
}
+// Replace 213-type (isel default) FMA3 instructions with 231-type for
+// accumulator loops. Writing back to the accumulator allows the coalescer
+// to remove extra copies in the loop.
+MachineBasicBlock *
+X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ MachineOperand &AddendOp = MI->getOperand(3);
+
+ // Bail out early if the addend isn't a register - we can't switch these.
+ if (!AddendOp.isReg())
+ return MBB;
+
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Check whether the addend is defined by a PHI:
+ assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
+ MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
+ if (!AddendDef.isPHI())
+ return MBB;
+
+ // Look for the following pattern:
+ // loop:
+ // %addend = phi [%entry, 0], [%loop, %result]
+ // ...
+ // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
+
+ // Replace with:
+ // loop:
+ // %addend = phi [%entry, 0], [%loop, %result]
+ // ...
+ // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
+
+ for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
+ assert(AddendDef.getOperand(i).isReg());
+ MachineOperand PHISrcOp = AddendDef.getOperand(i);
+ MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
+ if (&PHISrcInst == MI) {
+ // Found a matching instruction.
+ unsigned NewFMAOpc = 0;
+ switch (MI->getOpcode()) {
+ case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
+ case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
+ case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
+ case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
+ case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
+ case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
+ case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
+ case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
+ case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
+ case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
+ case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
+ case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
+ case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
+ case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
+ case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
+ case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+ case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
+ case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
+ case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
+ case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
+ case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
+ case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
+ case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
+ case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+ default: llvm_unreachable("Unrecognized FMA variant.");
+ }
+
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(3))
+ .addOperand(MI->getOperand(2))
+ .addOperand(MI->getOperand(1));
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI->eraseFromParent();
+ }
+ }
+
+ return MBB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
case X86::CMOV_V8F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+
+ case X86::VFMADDPDr213r:
+ case X86::VFMADDPSr213r:
+ case X86::VFMADDSDr213r:
+ case X86::VFMADDSSr213r:
+ case X86::VFMSUBPDr213r:
+ case X86::VFMSUBPSr213r:
+ case X86::VFMSUBSDr213r:
+ case X86::VFMSUBSSr213r:
+ case X86::VFNMADDPDr213r:
+ case X86::VFNMADDPSr213r:
+ case X86::VFNMADDSDr213r:
+ case X86::VFNMADDSSr213r:
+ case X86::VFNMSUBPDr213r:
+ case X86::VFNMSUBPSr213r:
+ case X86::VFNMSUBSDr213r:
+ case X86::VFNMSUBSSr213r:
+ case X86::VFMADDPDr213rY:
+ case X86::VFMADDPSr213rY:
+ case X86::VFMSUBPDr213rY:
+ case X86::VFMSUBPSr213rY:
+ case X86::VFNMADDPDr213rY:
+ case X86::VFNMADDPSr213rY:
+ case X86::VFNMSUBPDr213rY:
+ case X86::VFNMSUBPSr213rY:
+ return emitFMA3Instr(MI, BB);
}
}
}
}
-unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
- unsigned Depth) const {
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op,
+ const SelectionDAG &,
+ unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
return Op.getValueType().getScalarType().getSizeInBits();
SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- array_lengthof(Ops),
Ld->getMemoryVT(),
Ld->getPointerInfo(),
Ld->getAlignment(),
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
- return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+ return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
}
/// PerformTruncateCombine - Converts truncate operation to
return NewOp;
SDValue InputVector = N->getOperand(0);
+
// Detect whether we are trying to convert from mmx to i32 and the bitcast
// from mmx to v2i32 has a single usage.
if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
}
/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
- SDValue RHS, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static std::pair<unsigned, bool>
+matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget) {
if (!VT.isVector())
- return 0;
+ return std::make_pair(0, false);
+ bool NeedSplit = false;
switch (VT.getSimpleVT().SimpleTy) {
- default: return 0;
+ default: return std::make_pair(0, false);
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
if (!Subtarget->hasAVX2())
- return 0;
+ NeedSplit = true;
+ if (!Subtarget->hasAVX())
+ return std::make_pair(0, false);
+ break;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
if (!Subtarget->hasSSE2())
- return 0;
+ return std::make_pair(0, false);
}
// SSE2 has only a small subset of the operations.
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ unsigned Opc = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
default: break;
case ISD::SETULT:
case ISD::SETULE:
- return hasUnsigned ? X86ISD::UMIN : 0;
+ Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
case ISD::SETUGT:
case ISD::SETUGE:
- return hasUnsigned ? X86ISD::UMAX : 0;
+ Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
case ISD::SETLT:
case ISD::SETLE:
- return hasSigned ? X86ISD::SMIN : 0;
+ Opc = hasSigned ? X86ISD::SMIN : 0; break;
case ISD::SETGT:
case ISD::SETGE:
- return hasSigned ? X86ISD::SMAX : 0;
+ Opc = hasSigned ? X86ISD::SMAX : 0; break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
default: break;
case ISD::SETULT:
case ISD::SETULE:
- return hasUnsigned ? X86ISD::UMAX : 0;
+ Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
case ISD::SETUGT:
case ISD::SETUGE:
- return hasUnsigned ? X86ISD::UMIN : 0;
+ Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
case ISD::SETLT:
case ISD::SETLE:
- return hasSigned ? X86ISD::SMAX : 0;
+ Opc = hasSigned ? X86ISD::SMAX : 0; break;
case ISD::SETGT:
case ISD::SETGE:
- return hasSigned ? X86ISD::SMIN : 0;
+ Opc = hasSigned ? X86ISD::SMIN : 0; break;
}
}
- return 0;
+ return std::make_pair(Opc, NeedSplit);
}
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ VT != MVT::f80 && TLI.isTypeLegal(VT) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
+ EVT CondVT = Cond.getValueType();
+ if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+ // lowering on AVX-512. In this case we convert it to
+ // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+ // The same situation for all 128 and 256-bit vectors of i8 and i16
+ EVT OpVT = LHS.getValueType();
+ if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+ (OpVT.getVectorElementType() == MVT::i8 ||
+ OpVT.getVectorElementType() == MVT::i16)) {
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+ DCI.AddToWorklist(Cond.getNode());
+ return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+ }
+ }
// If this is a select between two integer constants, try to do some
// optimizations.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
}
}
+ // Early exit check
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
// Match VSELECTs into subs with unsigned saturation.
- if (!DCI.isBeforeLegalize() &&
- N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
(Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
}
// Try to match a min/max vector operation.
- if (!DCI.isBeforeLegalize() &&
- N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
- if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
- return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
+ std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
+ unsigned Opc = ret.first;
+ bool NeedSplit = ret.second;
+
+ if (Opc && NeedSplit) {
+ unsigned NumElems = VT.getVectorNumElements();
+ // Extract the LHS vectors
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
+
+ // Extract the RHS vectors
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
+
+ // Create min/max for each subvector
+ LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
+ RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
+
+ // Merge the result
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
+ } else if (Opc)
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ }
// Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
- if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
- Cond.getOpcode() == ISD::SETCC) {
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // Check if SETCC has already been promoted
+ TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
+ // Check that condition value type matches vselect operand type
+ CondVT == VT) {
assert(Cond.getValueType().isVector() &&
"vector select expects a vector selector!");
- EVT IntVT = Cond.getValueType();
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
ISD::CondCode NewCC =
ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
Cond.getOperand(0).getValueType().isInteger());
- Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
if (TValIsAllOnes && FValIsAllZeros)
Ret = Cond;
else if (TValIsAllOnes)
- Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
- DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+ Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
else if (FValIsAllZeros)
- Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
- DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+ Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
}
}
+ // Try to fold this VSELECT into a MOVSS/MOVSD
+ if (N->getOpcode() == ISD::VSELECT &&
+ Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
+ if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
+ (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
+ bool CanFold = false;
+ unsigned NumElems = Cond.getNumOperands();
+ SDValue A = LHS;
+ SDValue B = RHS;
+
+ if (isZero(Cond.getOperand(0))) {
+ CanFold = true;
+
+ // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
+ // fold (vselect <0,-1> -> (movsd A, B)
+ for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+ CanFold = isAllOnes(Cond.getOperand(i));
+ } else if (isAllOnes(Cond.getOperand(0))) {
+ CanFold = true;
+ std::swap(A, B);
+
+ // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
+ // fold (vselect <-1,0> -> (movsd B, A)
+ for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+ CanFold = isZero(Cond.getOperand(i));
+ }
+
+ if (CanFold) {
+ if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
+ return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
+ }
+
+ if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
+ // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
+ // (v2i64 (bitcast B)))))
+ //
+ // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
+ // (v2f64 (bitcast B)))))
+ //
+ // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
+ // (v2i64 (bitcast A)))))
+ //
+ // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
+ // (v2f64 (bitcast A)))))
+
+ CanFold = (isZero(Cond.getOperand(0)) &&
+ isZero(Cond.getOperand(1)) &&
+ isAllOnes(Cond.getOperand(2)) &&
+ isAllOnes(Cond.getOperand(3)));
+
+ if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
+ isAllOnes(Cond.getOperand(1)) &&
+ isZero(Cond.getOperand(2)) &&
+ isZero(Cond.getOperand(3))) {
+ CanFold = true;
+ std::swap(LHS, RHS);
+ }
+
+ if (CanFold) {
+ EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
+ SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
+ SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
+ SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
+ NewB, DAG);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Select);
+ }
+ }
+ }
+ }
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
// to simplify previous instructions.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
if (BitWidth == 1)
return SDValue();
+ // Check all uses of that condition operand to check whether it will be
+ // consumed by non-BLEND instructions, which may depend on all bits are set
+ // properly.
+ for (SDNode::use_iterator I = Cond->use_begin(),
+ E = Cond->use_end(); I != E; ++I)
+ if (I->getOpcode() != ISD::VSELECT)
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
- const ConstantSDNode* C = 0;
+ const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
(FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
- Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
// If this is a select between two integer constants, try to do some
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
- ConstantSDNode *CmpAgainst = 0;
+ ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, MVT::i8), Cond };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
- array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
}
}
}
}
/// \brief Returns a vector of 0s if the node in input is a vector logical
-/// shift by a constant amount which is known to be bigger than or equal
+/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
- // if the shift amount is bigger than or equal to
+ // if the shift amount is bigger than or equal to
// the element size. The constant shift amount will be
// encoded as a 8-bit immediate.
if (ShiftAmt.trunc(8).uge(MaxAmount))
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
- bool is64BitFP = (CMP00.getValueType() == MVT::f64);
- X86ISD::NodeType NTOperator = is64BitFP ?
- X86ISD::FSETCCsd : X86ISD::FSETCCss;
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
- SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+ if (Subtarget->hasAVX512()) {
+ SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
+ CMP01, DAG.getConstant(x86cc, MVT::i8));
+ if (N->getValueType(0) != MVT::i1)
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+ FSetCC);
+ return FSetCC;
+ }
+ SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+ CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, MVT::i8));
- SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
- OnesOrZeroesF);
- SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
- DAG.getConstant(1, MVT::i32));
+
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+ if (is64BitFP && !Subtarget->is64Bit()) {
+ // On a 32-bit target, we cannot bitcast the 64-bit float to a
+ // 64-bit integer, since that's not a legal type. Since
+ // OnesOrZeroesF is all ones of all zeroes, we don't need all the
+ // bits, but can do this little dance to extract the lowest 32 bits
+ // and work with those going forward.
+ SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ OnesOrZeroesF);
+ SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
+ Vector64);
+ OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ Vector32, DAG.getIntPtrConstant(0));
+ IntVT = MVT::i32;
+ }
+
+ SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+ DAG.getConstant(1, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
return OneBitOfTruth;
}
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
N1->getOperand(0));
SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
if (R.getNode())
return R;
- // Create BLSI, and BLSR instructions
- // BLSI is X & (-X)
- // BLSR is X & (X-1)
- if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+ // Create BEXTR instructions
+ // BEXTR is ((X >> imm) & (2**size-1))
+ if (VT == MVT::i32 || VT == MVT::i64) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- // Check LHS for neg
- if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
- isZero(N0.getOperand(0)))
- return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
- // Check RHS for neg
- if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
- isZero(N1.getOperand(0)))
- return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
-
- // Check LHS for X-1
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
- isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
-
- // Check RHS for X-1
- if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
- isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+ // Check for BEXTR.
+ if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+ (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+ ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (MaskNode && ShiftNode) {
+ uint64_t Mask = MaskNode->getZExtValue();
+ uint64_t Shift = ShiftNode->getZExtValue();
+ if (isMask_64(Mask)) {
+ uint64_t MaskSize = CountPopulation_64(Mask);
+ if (Shift + MaskSize <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Shift | (MaskSize << 8), VT));
+ }
+ }
+ } // BEXTR
return SDValue();
}
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
// look for psign/blend
if (VT == MVT::v2i64 || VT == MVT::v4i64) {
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool OptForSize = MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ // SHLD/SHRD instructions have lower register pressure, but on some
+ // platforms they have higher latency than the equivalent
+ // series of shifts/or that would otherwise be generated.
+ // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+ // have higher latencies and we are not optimizing for size.
+ if (!OptForSize && Subtarget->isSHLDSlow())
+ return SDValue();
+
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
SDValue Ops[] = { N0.getOperand(0), Neg,
DAG.getConstant(X86::COND_GE, MVT::i8),
SDValue(Neg.getNode(), 1) };
- return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
- Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
}
return SDValue();
}
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
return RV;
}
- // Try forming BMI if it is available.
- if (!Subtarget->hasBMI())
- return SDValue();
-
- if (VT != MVT::i32 && VT != MVT::i64)
- return SDValue();
-
- assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
-
- // Create BLSMSK instructions by finding X ^ (X-1)
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
- isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
-
- if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
- isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
-
return SDValue();
}
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
}
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
- Chains.size());
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
Chains.push_back(Ch);
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
- Chains.size());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
SDNode* LdVal = St->getValue().getNode();
- LoadSDNode *Ld = 0;
+ LoadSDNode *Ld = nullptr;
int TokenFactorIndex = -1;
SmallVector<SDValue, 8> Ops;
SDNode* ChainVal = St->getChain().getNode();
SDValue NewChain = NewLd.getValue(1);
if (TokenFactorIndex != -1) {
Ops.push_back(NewChain);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
- Ops.size());
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
St->getPointerInfo(),
if (TokenFactorIndex != -1) {
Ops.push_back(LoLd);
Ops.push_back(HiLd);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
- Ops.size());
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
LoAddr = St->getBasePtr();
RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
- MVT VT = LHS.getValueType().getSimpleVT();
+ MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
}
}
+ if (N0.getOpcode() == ISD::TRUNCATE &&
+ N0.hasOneUse() &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, VT));
+ }
+ }
if (VT.is256BitVector()) {
SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
if (R.getNode())
// Optimize x == -y --> x+y == 0
// x != -y --> x+y != 0
-static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
}
+
+ if (VT.getScalarType() == MVT::i1) {
+ bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
+ if (!IsSEXT0 && !IsVZero0)
+ return SDValue();
+ bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (!IsSEXT1 && !IsVZero1)
+ return SDValue();
+
+ if (IsSEXT0 && IsVZero1) {
+ assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
+ if (CC == ISD::SETEQ)
+ return DAG.getNOT(DL, LHS.getOperand(0), VT);
+ return LHS.getOperand(0);
+ }
+ if (IsSEXT1 && IsVZero0) {
+ assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
+ if (CC == ISD::SETEQ)
+ return DAG.getNOT(DL, RHS.getOperand(0), VT);
+ return RHS.getOperand(0);
+ }
+ }
+
return SDValue();
}
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
- return DAG.getNode(ISD::AND, DL, MVT::i8,
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+ MVT VT) {
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+ DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+ DAG.getConstant(1, VT));
+ assert (VT == MVT::i1 && "Unexpected type for SECCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
- DAG.getConstant(1, MVT::i8));
+ DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return MaterializeSETB(DL, NewEFLAGS, DAG);
+ return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
}
}
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
if (CC == X86::COND_B)
- return MaterializeSETB(DL, EFLAGS, DAG);
+ return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
SDValue Flags;
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!XTLI->getSubtarget()->is64Bit() &&
- !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ VT == MVT::i64) {
SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
- case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
}
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+ if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+ if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+ if (AsmPieces.size() == 3)
+ return true;
+ else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+ return true;
+ }
+ }
+ return false;
+}
+
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (AsmPieces.size() == 4 &&
- AsmPieces[0] == "~{cc}" &&
- AsmPieces[1] == "~{dirflag}" &&
- AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}")
- return IntrinsicLowering::LowerToByteSwap(CI);
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (AsmPieces.size() == 4 &&
- AsmPieces[0] == "~{cc}" &&
- AsmPieces[1] == "~{dirflag}" &&
- AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}")
+ if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
- if (CallOperandVal == NULL)
+ if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
- SDValue Result(0, 0);
+ SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
- GlobalAddressSDNode *GA = 0;
+ GlobalAddressSDNode *GA = nullptr;
int64_t Offset = 0;
// Match either (GA), (GA+C), (GA+C1+C2), etc.
Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
// Not found as a standard register?
- if (Res.second == 0) {
+ if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' &&
return Res;
}
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+ Type *Ty) const {
+ // Scaling factors are not free at all.
+ // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+ // will take 2 allocations in the out of order engine instead of 1
+ // for plain addressing mode, i.e. inst (reg1).
+ // E.g.,
+ // vaddps (%rsi,%drx), %ymm0, %ymm1
+ // Requires two allocations (one for the load, one for the computation)
+ // whereas:
+ // vaddps (%rsi), %ymm0, %ymm1
+ // Requires just 1 allocation, i.e., freeing allocations for other operations
+ // and having less micro operations to execute.
+ //
+ // For some X86 architectures, this is even worse because for instance for
+ // stores, the complex addressing mode forces the instruction to use the
+ // "load" ports instead of the dedicated "store" port.
+ // E.g., on Haswell:
+ // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+ if (isLegalAddressingMode(AM, Ty))
+ // Scale represents reg2 * scale, thus account for 1
+ // as soon as we use a second register.
+ return AM.Scale != 0;
+ return -1;
+}