#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
- } else if (!UseSoftFloat) {
+ } else if (!TM.Options.UseSoftFloat) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
- if (!UseSoftFloat) {
+ if (!TM.Options.UseSoftFloat) {
// SSE has no i16 to fp conversion, only i32
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
- } else if (!UseSoftFloat) {
+ } else if (!TM.Options.UseSoftFloat) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
// Expand FP_TO_UINT into a select.
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i64 , Expand);
if (Subtarget->hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
} else {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
}
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i16 , Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i64 , Expand);
if (Subtarget->hasLZCNT()) {
setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
} else {
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
MVT::i64 : MVT::i32, Custom);
- else if (EnableSegmentedStacks)
+ else if (TM.Options.EnableSegmentedStacks)
setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
MVT::i64 : MVT::i32, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
MVT::i64 : MVT::i32, Expand);
- if (!UseSoftFloat && X86ScalarSSEf64) {
+ if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, X86::FR32RegisterClass);
// cases we handle.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
addLegalFPImmediate(APFloat(+0.0f)); // xorps
- } else if (!UseSoftFloat && X86ScalarSSEf32) {
+ } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, X86::FR32RegisterClass);
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
- if (!UnsafeFPMath) {
+ if (!TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
}
- } else if (!UseSoftFloat) {
+ } else if (!TM.Options.UseSoftFloat) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- if (!UnsafeFPMath) {
+ if (!TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
}
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87.
- if (!UseSoftFloat) {
+ if (!TM.Options.UseSoftFloat) {
addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
- if (!UnsafeFPMath) {
+ if (!TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FSIN , MVT::f80 , Expand);
setOperationAction(ISD::FCOS , MVT::f80 , Expand);
}
+ setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+ setOperationAction(ISD::FRINT, MVT::f80, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
}
setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
- if (!UseSoftFloat && Subtarget->hasMMX()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
- if (!UseSoftFloat && Subtarget->hasXMM()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) {
addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
- if (!UseSoftFloat && Subtarget->hasXMMInt()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) {
addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
}
- if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
+ if (Subtarget->hasSSE41orAVX()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- // Can turn SHL into an integer multiply.
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
- setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-
setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ // FIXME: these should be Legal but that's only for the case where
+ // the index is constant. For now custom expand to deal with that.
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
}
}
if (Subtarget->hasXMMInt()) {
- setOperationAction(ISD::SRL, MVT::v2i64, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Custom);
- setOperationAction(ISD::SRL, MVT::v16i8, Custom);
setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
- setOperationAction(ISD::SHL, MVT::v2i64, Custom);
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i8, Custom);
- setOperationAction(ISD::SRA, MVT::v4i32, Custom);
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+ if (Subtarget->hasAVX2()) {
+ setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+ setOperationAction(ISD::SRL, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+ setOperationAction(ISD::SHL, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Legal);
+ } else {
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ }
}
- if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+ if (Subtarget->hasSSE42orAVX())
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
- if (!UseSoftFloat && Subtarget->hasAVX()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
- setOperationAction(ISD::SRL, MVT::v4i64, Custom);
- setOperationAction(ISD::SRL, MVT::v8i32, Custom);
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
- setOperationAction(ISD::SHL, MVT::v4i64, Custom);
- setOperationAction(ISD::SHL, MVT::v8i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i16, Custom);
setOperationAction(ISD::SHL, MVT::v32i8, Custom);
- setOperationAction(ISD::SRA, MVT::v8i32, Custom);
setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+ if (Subtarget->hasAVX2()) {
+ setOperationAction(ISD::ADD, MVT::v4i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v32i8, Legal);
- setOperationAction(ISD::ADD, MVT::v4i64, Custom);
- setOperationAction(ISD::ADD, MVT::v8i32, Custom);
- setOperationAction(ISD::ADD, MVT::v16i16, Custom);
- setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i8, Legal);
- setOperationAction(ISD::SUB, MVT::v4i64, Custom);
- setOperationAction(ISD::SUB, MVT::v8i32, Custom);
- setOperationAction(ISD::SUB, MVT::v16i16, Custom);
- setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+ // Don't lower v32i8 because there is no 128-bit byte mul
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, Custom);
- setOperationAction(ISD::MUL, MVT::v16i16, Custom);
- // Don't lower v32i8 because there is no 128-bit byte mul
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Legal);
+ setOperationAction(ISD::SRL, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Legal);
+ setOperationAction(ISD::SHL, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Legal);
+ } else {
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ }
// Custom lower several nodes for 256-bit types.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
setTargetDAGCombine(ISD::SINT_TO_FP);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
+ if (Subtarget->hasBMI())
+ setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
- setPrefLoopAlignment(16);
+ setPrefLoopAlignment(4); // 2^4 bytes.
benefitFromCodePlacementOpt = true;
- setPrefFunctionAlignment(4);
+ setPrefFunctionAlignment(4); // 2^4 bytes.
}
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If
-/// 'NonScalarIntSafe' is true, that means it's safe to return a
+/// 'IsZeroVal' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
- bool NonScalarIntSafe,
+ bool IsZeroVal,
bool MemcpyStrSrc,
MachineFunction &MF) const {
// FIXME: This turns off use of xmm stores for memset/memcpy on targets like
// linux. This is because the stack realignment code can't handle certain
// cases like PR2962. This should be removed when PR2962 is fixed.
const Function *F = MF.getFunction();
- if (NonScalarIntSafe &&
+ if (IsZeroVal &&
!F->hasFnAttr(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
-static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+ bool GuaranteedTailCallOpt) { // flag is now supplied by the caller
return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}
unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
+ bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
+ getTargetMachine().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
return DAG.getLoad(ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
- false, false, 0);
+ false, false, false, 0);
}
}
// places.
assert(VA.getValNo() != LastVal &&
"Don't support value assigned to multiple locs yet");
+ (void)LastVal;
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect)
ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
InVals.push_back(ArgValue);
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
- if (FuncIsMadeTailCallSafe(CallConv))
+ if (FuncIsMadeTailCallSafe(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
"SSE register cannot be used when SSE is disabled!");
- assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
+ assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+ NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
- if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
+ if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+ !Subtarget->hasXMM())
// Kernel mode asks for SSE to be disabled, so don't push them
// on the stack.
TotalNumXMMRegs = 0;
}
// Some CCs need callee pop.
- if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) {
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// Load the "old" Return address.
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
return SDValue(OutRetAddr.getNode(), 1);
}
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
- if (!GuaranteedTailCallOpt && isTailCall)
+ if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
IsSibcall = true;
if (isTailCall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
+ else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ IsTailCallConvention(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
int FI = 0;
// Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDValue();
- if (GuaranteedTailCallOpt) {
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isRegLoc())
if (ExtraLoad)
Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(),
- false, false, 0);
+ false, false, false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush;
- if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt))
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ getTargetMachine().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
// If this is a call to a struct-return function, the callee
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
- if (GuaranteedTailCallOpt) {
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return false;
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case X86ISD::UNPCKLPS:
- case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPSY:
- case X86ISD::VUNPCKLPDY:
- case X86ISD::PUNPCKLWD:
- case X86ISD::PUNPCKLBW:
- case X86ISD::PUNPCKLDQ:
- case X86ISD::PUNPCKLQDQ:
- case X86ISD::UNPCKHPS:
- case X86ISD::UNPCKHPD:
- case X86ISD::VUNPCKHPSY:
- case X86ISD::VUNPCKHPDY:
- case X86ISD::PUNPCKHWD:
- case X86ISD::PUNPCKHBW:
- case X86ISD::PUNPCKHDQ:
- case X86ISD::PUNPCKHQDQ:
- case X86ISD::VPERMILPS:
- case X86ISD::VPERMILPSY:
- case X86ISD::VPERMILPD:
- case X86ISD::VPERMILPDY:
- case X86ISD::VPERM2F128:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ case X86ISD::VPERMILP:
+ case X86ISD::VPERM2X128:
return true;
}
return false;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
- case X86ISD::VPERMILPS:
- case X86ISD::VPERMILPSY:
- case X86ISD::VPERMILPD:
- case X86ISD::VPERMILPDY:
+ case X86ISD::VPERMILP:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
case X86ISD::PALIGN:
case X86ISD::SHUFPD:
case X86ISD::SHUFPS:
- case X86ISD::VPERM2F128:
+ case X86ISD::VPERM2X128:
return DAG.getNode(Opc, dl, VT, V1, V2,
DAG.getConstant(TargetMask, MVT::i8));
}
case X86ISD::MOVLPD:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case X86ISD::UNPCKLPS:
- case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPSY:
- case X86ISD::VUNPCKLPDY:
- case X86ISD::PUNPCKLWD:
- case X86ISD::PUNPCKLBW:
- case X86ISD::PUNPCKLDQ:
- case X86ISD::PUNPCKLQDQ:
- case X86ISD::UNPCKHPS:
- case X86ISD::UNPCKHPD:
- case X86ISD::VUNPCKHPSY:
- case X86ISD::VUNPCKHPDY:
- case X86ISD::PUNPCKHWD:
- case X86ISD::PUNPCKHBW:
- case X86ISD::PUNPCKHDQ:
- case X86ISD::PUNPCKHQDQ:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
return DAG.getNode(Opc, dl, VT, V1, V2);
}
return SDValue();
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool hasSSSE3OrAVX) {
int i, e = VT.getVectorNumElements();
- if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+ if (VT.getSizeInBits() != 128)
return false;
// Do not handle v2i64 / v2f64 shuffles with palignr.
return true;
}
-/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// VSHUFPSY or VSHUFPDY.
-static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
+static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool HasAVX, bool Commuted = false) {
int NumElems = VT.getVectorNumElements();
- if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ if (!HasAVX || VT.getSizeInBits() != 256)
return false;
- if (NumElems != 8)
+ if (NumElems != 4 && NumElems != 8)
return false;
// VSHUFPSY divides the resulting vector into 4 chunks.
// DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
// Y3..Y0, Y3..Y0, X3..X0, X3..X0
//
- int QuarterSize = NumElems/4;
- int HalfSize = QuarterSize*2;
- for (int i = 0; i < QuarterSize; ++i)
- if (!isUndefOrInRange(Mask[i], 0, HalfSize))
- return false;
- for (int i = QuarterSize; i < QuarterSize*2; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
- return false;
-
- // The mask of the second half must be the same as the first but with
- // the appropriate offsets. This works in the same way as VPERMILPS
- // works with masks.
- for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
- if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
- return false;
- int FstHalfIdx = i-HalfSize;
- if (Mask[FstHalfIdx] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
- return false;
- }
- for (int i = QuarterSize*3; i < NumElems; ++i) {
- if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
- return false;
- int FstHalfIdx = i-HalfSize;
- if (Mask[FstHalfIdx] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
- return false;
-
- }
-
- return true;
-}
-
-/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPSY instruction.
-static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
-
- assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
- "Only supports v8i32 and v8f32 types");
-
- int HalfSize = NumElems/2;
- unsigned Mask = 0;
- for (int i = 0; i != NumElems ; ++i) {
- if (SVOp->getMaskElt(i) < 0)
- continue;
- // The mask of the first half must be equal to the second one.
- unsigned Shamt = (i%HalfSize)*2;
- unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
- Mask |= Elt << Shamt;
- }
-
- return Mask;
-}
-
-/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
-/// version and the mask of the second half isn't binded with the first
-/// one.
-static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
- int NumElems = VT.getVectorNumElements();
-
- if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
- return false;
-
- if (NumElems != 4)
- return false;
-
- // VSHUFPSY divides the resulting vector into 4 chunks.
+ // VSHUFPDY divides the resulting vector into 4 chunks.
// The sources are also split into 4 chunks, and each destination
// chunk must come from a different source chunk.
//
// SRC1 => X3 X2 X1 X0
// SRC2 => Y3 Y2 Y1 Y0
//
- // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0
+ // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
//
- int QuarterSize = NumElems/4;
- int HalfSize = QuarterSize*2;
- for (int i = 0; i < QuarterSize; ++i)
- if (!isUndefOrInRange(Mask[i], 0, HalfSize))
- return false;
- for (int i = QuarterSize; i < QuarterSize*2; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
- return false;
- for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
- if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
- return false;
- for (int i = QuarterSize*3; i < NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
- return false;
+ unsigned QuarterSize = NumElems/4;
+ unsigned HalfSize = QuarterSize*2;
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned LaneStart = l*HalfSize;
+ for (unsigned s = 0; s != 2; ++s) {
+ unsigned QuarterStart = s*QuarterSize;
+ unsigned Src = (Commuted) ? (1-s) : s; // Commuted swaps which source feeds each quarter
+ unsigned SrcStart = Src*NumElems + LaneStart; // first mask index of that source within this lane
+ for (unsigned i = 0; i != QuarterSize; ++i) {
+ int Idx = Mask[i+QuarterStart+LaneStart];
+ if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
+ return false;
+ // For VSHUFPSY, the mask of the second half must be the same as the first
+ // but with the appropriate offsets. This works in the same way as
+ // VPERMILPS works with masks.
+ if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
+ continue;
+ if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize))
+ return false;
+ }
+ }
+ }
return true;
}
-/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPDY instruction.
-static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
+/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions.
+static unsigned getShuffleVSHUFPYImmediate(SDNode *N) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
EVT VT = SVOp->getValueType(0);
int NumElems = VT.getVectorNumElements();
- assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
- "Only supports v4i64 and v4f64 types");
+ assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types");
+ assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types");
int HalfSize = NumElems/2;
+ unsigned Mul = (NumElems == 8) ? 2 : 1; // shift multiplier: 2 for 8-element types, otherwise 1
unsigned Mask = 0;
- for (int i = 0; i != NumElems ; ++i) {
- if (SVOp->getMaskElt(i) < 0)
+ for (int i = 0; i != NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (Elt < 0) // negative mask entries contribute no bits
continue;
- int Elt = SVOp->getMaskElt(i) % HalfSize;
- Mask |= Elt << i;
+ Elt %= HalfSize; // keep only the index modulo half the element count
+ unsigned Shamt = i;
+ // For VSHUFPSY, the mask of the first half must be equal to the second one.
+ if (NumElems == 8) Shamt %= HalfSize;
+ Mask |= Elt << (Shamt*Mul); // OR the reduced index in at bit Shamt*Mul
}
return Mask;
}
+/// CommuteVectorShuffleMask - Rewrite the first NumElems entries of Mask in
+/// place, as if the two vector operands of the shuffle had swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+ unsigned NumElems) {
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = Mask[i];
+ if (idx < 0) // negative entries are left unchanged
+ continue;
+ else if (idx < (int)NumElems)
+ Mask[i] = idx + NumElems; // index below NumElems: move up by NumElems
+ else
+ Mask[i] = idx - NumElems; // index at or above NumElems: move down by NumElems
+ }
+}
+
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 128-bit
-/// SHUFPS and SHUFPD.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool Commuted = false) {
+ unsigned NumElems = VT.getVectorNumElements();
if (VT.getSizeInBits() != 128)
return false;
if (NumElems != 2 && NumElems != 4)
return false;
- int Half = NumElems / 2;
- for (int i = 0; i < Half; ++i)
- if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ unsigned Half = NumElems / 2;
+ unsigned SrcStart = Commuted ? NumElems : 0;
+ for (unsigned i = 0; i != Half; ++i)
+ if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
return false;
- for (int i = Half; i < NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ SrcStart = Commuted ? 0 : NumElems;
+ for (unsigned i = Half; i != NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
return false;
return true;
return ::isSHUFPMask(M, N->getValueType(0));
}
-/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
-
- int Half = NumElems / 2;
- for (int i = 0; i < Half; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
- return false;
- for (int i = Half; i < NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], 0, NumElems))
- return false;
- return true;
-}
-
-static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return isCommutedSHUFPMask(M, N->getValueType(0));
-}
-
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool V2IsSplat = false) {
- int NumElts = VT.getVectorNumElements();
+ bool HasAVX2, bool V2IsSplat = false) {
+ unsigned NumElts = VT.getVectorNumElements();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
return false;
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- unsigned Start = 0;
- unsigned End = NumLaneElts;
- for (unsigned s = 0; s < NumLanes; ++s) {
- for (unsigned i = Start, j = s * NumLaneElts;
- i != End;
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
return false;
}
}
- // Process the next 128 bits.
- Start += NumLaneElts;
- End += NumLaneElts;
}
return true;
}
-bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
SmallVector<int, 8> M;
N->getMask(M);
- return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
+ return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat); // defer to the file-static mask overload
}
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool V2IsSplat = false) {
- int NumElts = VT.getVectorNumElements();
+ bool HasAVX2, bool V2IsSplat = false) {
+ unsigned NumElts = VT.getVectorNumElements();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
return false;
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- unsigned Start = 0;
- unsigned End = NumLaneElts;
for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
- i != End; i += 2, ++j) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
if (!isUndefOrEqual(BitI, j))
return false;
}
}
- // Process the next 128 bits.
- Start += NumLaneElts;
- End += NumLaneElts;
}
return true;
}
-bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
SmallVector<int, 8> M;
N->getMask(M);
- return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
+ return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat);
}
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
- if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool HasAVX2) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
return false;
// For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
// FIXME: Need a better way to get rid of this, there's no latency difference
// between UNPCKLPD and MOVDDUP, the later should always be checked first and
// the former later. We should also remove the "_undef" special mask.
- if (NumElems == 4 && VT.getSizeInBits() == 256)
+ if (NumElts == 4 && VT.getSizeInBits() == 256)
return false;
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElems / NumLanes;
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned s = 0; s < NumLanes; ++s) {
- for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
- i != NumLaneElts * (s + 1);
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
return true;
}
-bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
+bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) {
SmallVector<int, 8> M;
N->getMask(M);
- return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
+ return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0), HasAVX2);
}
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
- if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool HasAVX2) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
return false;
- for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (!isUndefOrEqual(BitI1, j))
- return false;
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
}
return true;
}
-bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
+bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) {
SmallVector<int, 8> M;
N->getMask(M);
- return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
+ return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0), HasAVX2);
}
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
return ::isMOVLMask(M, N->getValueType(0));
}
-/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
/// as permutations between 128-bit chunks or halves. As an example: this
/// shuffle bellow:
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from the
/// the second half of V2.
-static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool HasAVX) {
+ if (!HasAVX || VT.getSizeInBits() != 256)
return false;
// The shuffle result is divided into half A and half B. In total the two
return MatchA && MatchB;
}
-/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
-static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
int HalfSize = VT.getVectorNumElements()/2;
return (FstHalf | (SndHalf << 4));
}
-/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching is different depending whether theunderlying
/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed.
-static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
+static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool HasAVX) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits()/128;
- if (!Subtarget->hasAVX())
+ if (!HasAVX)
return false;
- // Only match 256-bit with 64-bit types
- if (VT.getSizeInBits() != 256 || NumElts != 4)
+ // Only match 256-bit with 32/64-bit types
+ if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
return false;
- // The mask on the high lane is independent of the low. Both can match
- // any element in inside its own lane, but can't cross.
int LaneSize = NumElts/NumLanes;
- for (int l = 0; l < NumLanes; ++l)
- for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
- int LaneStart = l*LaneSize;
- if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+ for (int l = 0; l != NumLanes; ++l) {
+ int LaneStart = l*LaneSize;
+ for (int i = 0; i != LaneSize; ++i) {
+ if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
+ return false;
+ if (NumElts == 4 || l == 0)
+ continue;
+ // VPERMILPS handling
+ if (Mask[i] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize))
return false;
}
-
- return true;
-}
-
-/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
-/// Note that VPERMIL mask matching is different depending whether theunderlying
-/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
-/// to the same elements of the low, but to the higher half of the source.
-/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
-static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
-
- if (!Subtarget->hasAVX())
- return false;
-
- // Only match 256-bit with 32-bit types
- if (VT.getSizeInBits() != 256 || NumElts != 8)
- return false;
-
- // The mask on the high lane should be the same as the low. Actually,
- // they can differ if any of the corresponding index in a lane is undef
- // and the other stays in range.
- int LaneSize = NumElts/NumLanes;
- for (int i = 0; i < LaneSize; ++i) {
- int HighElt = i+LaneSize;
- bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
- bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
-
- if (!HighValid || !LowValid)
- return false;
- if (Mask[i] < 0 || Mask[HighElt] < 0)
- continue;
- if (Mask[HighElt]-Mask[i] != LaneSize)
- return false;
}
return true;
}
-/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
-static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
+static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
int NumElts = VT.getVectorNumElements();
// where a mask will match because the same mask element is undef on the
// first half but valid on the second. This would get pathological cases
// such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
+ unsigned Shift = (LaneSize == 4) ? 2 : 1;
unsigned Mask = 0;
- for (int l = 0; l < NumLanes; ++l) {
- for (int i = 0; i < LaneSize; ++i) {
- int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
- if (MaskElt < 0)
- continue;
- if (MaskElt >= LaneSize)
- MaskElt -= LaneSize;
- Mask |= MaskElt << (i*2);
- }
+ for (int i = 0; i != NumElts; ++i) {
+ int MaskElt = SVOp->getMaskElt(i);
+ if (MaskElt < 0)
+ continue;
+ MaskElt %= LaneSize;
+ unsigned Shamt = i;
+ // VPERMILPSY, the mask of the first half must be equal to the second one
+ if (NumElts == 8) Shamt %= LaneSize;
+ Mask |= MaskElt << (Shamt*Shift);
}
return Mask;
}
-/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
-static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- EVT VT = SVOp->getValueType(0);
-
- int NumElts = VT.getVectorNumElements();
- int NumLanes = VT.getSizeInBits()/128;
-
- unsigned Mask = 0;
- int LaneSize = NumElts/NumLanes;
- for (int l = 0; l < NumLanes; ++l)
- for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
- int MaskElt = SVOp->getMaskElt(i);
- if (MaskElt < 0)
- continue;
- Mask |= (MaskElt-l*LaneSize) << i;
- }
-
- return Mask;
-}
-
/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+ if (!Subtarget->hasSSE3orAVX())
return false;
// The second vector must be undef
/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+ if (!Subtarget->hasSSE3orAVX())
return false;
// The second vector must be undef
/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// version of MOVDDUP.
-static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
- const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
+static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool HasAVX) {
int NumElts = VT.getVectorNumElements();
- bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
- if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
- !V2IsUndef || NumElts != 4)
+ if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
return false;
for (int i = 0; i != NumElts/2; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i), 0))
+ if (!isUndefOrEqual(Mask[i], 0))
return false;
for (int i = NumElts/2; i != NumElts; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+ if (!isUndefOrEqual(Mask[i], NumElts/2))
return false;
return true;
}
/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
-unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- EVT VVT = N->getValueType(0);
- unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
int Val = 0;
unsigned i, e;
- for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
+ for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
Val = SVOp->getMaskElt(i);
if (Val >= 0)
break;
SVOp->getOperand(0), &MaskVec[0]);
}
-/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
-/// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
- unsigned NumElems = VT.getVectorNumElements();
- for (unsigned i = 0; i != NumElems; ++i) {
- int idx = Mask[i];
- if (idx < 0)
- continue;
- else if (idx < (int)NumElems)
- Mask[i] = idx + NumElems;
- else
- Mask[i] = idx - NumElems;
- }
-}
-
/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
return true;
}
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // Check for any non-constant elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ switch (N->getOperand(i).getNode()->getOpcode()) {
+ case ISD::UNDEF:
+ case ISD::ConstantFP:
+ case ISD::Constant:
+ break;
+ default:
+ return false;
+ }
+
+ // Vectors of all-zeros and all-ones are materialized with special
+ // instructions rather than being loaded.
+ return !ISD::isBuildVectorAllZeros(N) &&
+ !ISD::isBuildVectorAllOnes(N);
+}
+
/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
return false;
// Is V2 is a vector load, don't do this transformation. We will try to use
// load folding shufps op.
- if (ISD::isNON_EXTLoad(V2))
+ if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
return false;
unsigned NumElems = VT.getVectorNumElements();
}
/// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
-/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
-/// original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
+ DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
assert((VT.is128BitVector() || VT.is256BitVector())
&& "Expected a 128-bit or 256-bit vector type");
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
- Cst, Cst, Cst, Cst);
-
- if (VT.is256BitVector()) {
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
- Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
- Vec = Insert128BitVector(InsV, Vec,
- DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ SDValue Vec;
+ if (VT.getSizeInBits() == 256) {
+ if (HasAVX2) { // AVX2
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else { // AVX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+ Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Vec = Insert128BitVector(InsV, Vec,
+ DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ }
+ } else {
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
case X86ISD::SHUFPS:
case X86ISD::SHUFPD:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeSHUFPSMask(NumElems,
- cast<ConstantSDNode>(ImmN)->getZExtValue(),
- ShuffleMask);
- break;
- case X86ISD::PUNPCKHBW:
- case X86ISD::PUNPCKHWD:
- case X86ISD::PUNPCKHDQ:
- case X86ISD::PUNPCKHQDQ:
- DecodePUNPCKHMask(NumElems, ShuffleMask);
- break;
- case X86ISD::UNPCKHPS:
- case X86ISD::UNPCKHPD:
- case X86ISD::VUNPCKHPSY:
- case X86ISD::VUNPCKHPDY:
- DecodeUNPCKHPMask(NumElems, ShuffleMask);
+ DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
break;
- case X86ISD::PUNPCKLBW:
- case X86ISD::PUNPCKLWD:
- case X86ISD::PUNPCKLDQ:
- case X86ISD::PUNPCKLQDQ:
- DecodePUNPCKLMask(VT, ShuffleMask);
+ case X86ISD::UNPCKH:
+ DecodeUNPCKHMask(VT, ShuffleMask);
break;
- case X86ISD::UNPCKLPS:
- case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPSY:
- case X86ISD::VUNPCKLPDY:
- DecodeUNPCKLPMask(VT, ShuffleMask);
+ case X86ISD::UNPCKL:
+ DecodeUNPCKLMask(VT, ShuffleMask);
break;
case X86ISD::MOVHLPS:
DecodeMOVHLPSMask(NumElems, ShuffleMask);
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
- case X86ISD::VPERMILPS:
- ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- ShuffleMask);
- break;
- case X86ISD::VPERMILPSY:
- ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- ShuffleMask);
- break;
- case X86ISD::VPERMILPD:
- ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- ShuffleMask);
- break;
- case X86ISD::VPERMILPDY:
+ case X86ISD::VPERMILP:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
break;
- case X86ISD::VPERM2F128:
+ case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
- false, false, 0);
+ false, false, false, 0);
// Canonicalize it to a v4i32 or v8i32 shuffle.
SmallVector<int, 8> Mask;
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->getAlignment());
+ LDBase->isInvariant(), LDBase->getAlignment());
} else if (NumElems == 4 && LastLoadedElt == 1 &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
return SDValue();
}
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. We support two patterns:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load.
+/// The scalar load node is returned when a pattern is found,
+/// or SDValue() otherwise.
+static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
+ EVT VT = Op.getValueType();
+ SDValue V = Op;
+
+ if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ // A suspected load to be broadcast.
+ SDValue Ld;
+
+ switch (V.getOpcode()) {
+ default:
+ // Unknown pattern found.
+ return SDValue();
+
+ case ISD::BUILD_VECTOR: {
+ // The BUILD_VECTOR node must be a splat.
+ if (!isSplatVector(V.getNode()))
+ return SDValue();
+
+ Ld = V.getOperand(0);
+
+ // The suspected load node has several users. Make sure that all
+ // of its users are from the BUILD_VECTOR node.
+ if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ return SDValue();
+ break;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
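+ // Shuffles must have a splat mask where the first element is
+ // broadcast. NOTE(review): this case reads Op rather than V, so it assumes
+ // no BITCAST was peeled off above -- confirm against callers.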
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+ return SDValue();
+
+ SDValue Sc = Op.getOperand(0);
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return SDValue();
+
+ Ld = Sc.getOperand(0);
+
+ // The scalar_to_vector node and the suspected
+ // load node must have exactly one user.
+ if (!Sc.hasOneUse() || !Ld.hasOneUse())
+ return SDValue();
+ break;
+ }
+ }
+
+ // The scalar source must be a normal load.
+ if (!ISD::isNormalLoad(Ld.getNode()))
+ return SDValue();
+
+ bool Is256 = VT.getSizeInBits() == 256;
+ bool Is128 = VT.getSizeInBits() == 128;
+ unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+
+ if (hasAVX2) {
+ // VBroadcast to YMM
+ if (Is256 && (ScalarSize == 8 || ScalarSize == 16 ||
+ ScalarSize == 32 || ScalarSize == 64 ))
+ return Ld;
+
+ // VBroadcast to XMM
+ if (Is128 && (ScalarSize == 8 || ScalarSize == 32 ||
+ ScalarSize == 16 || ScalarSize == 64 ))
+ return Ld;
+ }
+
+ // VBroadcast to YMM
+ if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
+ return Ld;
+
+ // VBroadcast to XMM
+ if (Is128 && (ScalarSize == 32))
+ return Ld;
+
+
+ // Unsupported broadcast.
+ return SDValue();
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
- // vectors or broken into v4i32 operations on 256-bit vectors.
+ // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+ // vpcmpeqd on 256-bit vectors.
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (Op.getValueType() == MVT::v4i32)
+ if (Op.getValueType() == MVT::v4i32 ||
+ (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2()))
return Op;
- return getOnesVector(Op.getValueType(), DAG, dl);
+ return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl);
}
+ SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
+ if (Subtarget->hasAVX() && LD.getNode())
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumZero = 0;
DAG);
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
- EVT MiddleVT = MVT::v4i32;
+ unsigned NumBits = VT.getSizeInBits();
+ assert((NumBits == 128 || NumBits == 256) &&
+ "Expected an SSE or AVX value type!");
+ EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasXMMInt(), DAG);
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) {
+ if (getSubtarget()->hasSSE41orAVX()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
- SmallVector<unsigned, 4> LoQuad(4);
- SmallVector<unsigned, 4> HiQuad(4);
+ unsigned LoQuad[] = { 0, 0, 0, 0 };
+ unsigned HiQuad[] = { 0, 0, 0, 0 };
BitVector InputQuads(4);
for (unsigned i = 0; i < 8; ++i) {
- SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
+ unsigned *Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = SVOp->getMaskElt(i);
MaskVals.push_back(EltIdx);
if (EltIdx < 0) {
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
+ if (Subtarget->hasSSSE3orAVX()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
- if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
+ if (Subtarget->hasSSSE3orAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
- (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
- (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) {
+ if (TLI.getSubtarget()->hasSSSE3orAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
// from X.
if (NumHi == 3) {
// Normalize it so the 3 elements come from V1.
- CommuteVectorShuffleMask(PermMask, VT);
+ CommuteVectorShuffleMask(PermMask, 4);
std::swap(V1, V2);
}
V = V.getOperand(0);
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
V = V.getOperand(0);
+ if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+ V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+ // BUILD_VECTOR (load), undef
+ V = V.getOperand(0);
if (MayFoldLoad(V))
return true;
return false;
// turns into:
// (MOVLPSmr addr:$src1, VR128:$src2)
// So, recognize this potential and also use MOVLPS or MOVLPD
- if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+ else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
CanFoldLoad = true;
- // Both of them can't be memory operations though.
- if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
- CanFoldLoad = false;
-
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
if (CanFoldLoad) {
if (HasXMMInt && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
- return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+ // If we don't care about the second element, proceed to use movss.
+ if (SVOp->getMaskElt(1) != -1)
+ return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
}
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
// movl and movlp will both match v2i64, but v2i64 is never matched by
// movl earlier because we make it strict to avoid messing with the movlp load
// folding logic (see the code above getMOVLP call). Match it here then,
X86::getShuffleSHUFImmediate(SVOp), DAG);
}
-static inline unsigned getUNPCKLOpcode(EVT VT) {
- switch(VT.getSimpleVT().SimpleTy) {
- case MVT::v4i32: return X86ISD::PUNPCKLDQ;
- case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
- case MVT::v4f32: return X86ISD::UNPCKLPS;
- case MVT::v2f64: return X86ISD::UNPCKLPD;
- case MVT::v8i32: // Use fp unit for int unpack.
- case MVT::v8f32: return X86ISD::VUNPCKLPSY;
- case MVT::v4i64: // Use fp unit for int unpack.
- case MVT::v4f64: return X86ISD::VUNPCKLPDY;
- case MVT::v16i8: return X86ISD::PUNPCKLBW;
- case MVT::v8i16: return X86ISD::PUNPCKLWD;
- default:
- llvm_unreachable("Unknown type for unpckl");
- }
- return 0;
-}
-
-static inline unsigned getUNPCKHOpcode(EVT VT) {
- switch(VT.getSimpleVT().SimpleTy) {
- case MVT::v4i32: return X86ISD::PUNPCKHDQ;
- case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
- case MVT::v4f32: return X86ISD::UNPCKHPS;
- case MVT::v2f64: return X86ISD::UNPCKHPD;
- case MVT::v8i32: // Use fp unit for int unpack.
- case MVT::v8f32: return X86ISD::VUNPCKHPSY;
- case MVT::v4i64: // Use fp unit for int unpack.
- case MVT::v4f64: return X86ISD::VUNPCKHPDY;
- case MVT::v16i8: return X86ISD::PUNPCKHBW;
- case MVT::v8i16: return X86ISD::PUNPCKHWD;
- default:
- llvm_unreachable("Unknown type for unpckh");
- }
- return 0;
-}
-
-static inline unsigned getVPERMILOpcode(EVT VT) {
- switch(VT.getSimpleVT().SimpleTy) {
- case MVT::v4i32:
- case MVT::v4f32: return X86ISD::VPERMILPS;
- case MVT::v2i64:
- case MVT::v2f64: return X86ISD::VPERMILPD;
- case MVT::v8i32:
- case MVT::v8f32: return X86ISD::VPERMILPSY;
- case MVT::v4i64:
- case MVT::v4f64: return X86ISD::VPERMILPDY;
- default:
- llvm_unreachable("Unknown type for vpermil");
- }
- return 0;
-}
-
-/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
-/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
-/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
-static bool isVectorBroadcast(SDValue &Op) {
- EVT VT = Op.getValueType();
- bool Is256 = VT.getSizeInBits() == 256;
-
- assert((VT.getSizeInBits() == 128 || Is256) &&
- "Unsupported type for vbroadcast node");
-
- SDValue V = Op;
- if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
- if (Is256 && !(V.hasOneUse() &&
- V.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V.getOperand(0).getOpcode() == ISD::UNDEF))
- return false;
-
- if (Is256)
- V = V.getOperand(1);
-
- if (!V.hasOneUse())
- return false;
-
- // Check the source scalar_to_vector type. 256-bit broadcasts are
- // supported for 32/64-bit sizes, while 128-bit ones are only supported
- // for 32-bit scalars.
- if (V.getOpcode() != ISD::SCALAR_TO_VECTOR)
- return false;
-
- unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
- if (ScalarSize != 32 && ScalarSize != 64)
- return false;
- if (!Is256 && ScalarSize == 64)
- return false;
-
- V = V.getOperand(0);
- if (!MayFoldLoad(V))
- return false;
-
- // Return the load node
- Op = V;
- return true;
-}
-
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
return Op;
// Use vbroadcast whenever the splat comes from a foldable load
- if (Subtarget->hasAVX() && isVectorBroadcast(V1))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+ SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
+ if (Subtarget->hasAVX() && LD.getNode())
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
// Handle splats by matching through known shuffle masks
if ((Size == 128 && NumElem <= 4) ||
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
- bool isMMX = VT.getSizeInBits() == 64;
- bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
bool HasXMMInt = Subtarget->hasXMMInt();
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX2 = Subtarget->hasAVX2();
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
- // Shuffle operations on MMX not supported.
- if (isMMX)
- return Op;
+ assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+ assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef");
// Vector shuffle lowering takes 3 steps:
//
// so the shuffle can be broken into other shuffles and the legalizer can
// try the lowering again.
//
- // The general ideia is that no vector_shuffle operation should be left to
+ // The general idea is that no vector_shuffle operation should be left to
// be matched during isel, all of them must be converted to a target specific
// node here.
// NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
// unpckh_undef). Only use pshufd if speed is more important than size.
- if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
- if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+ if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
- if (X86::isMOVDDUPMask(SVOp) &&
- (Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() &&
V2IsUndef && RelaxedMayFoldVectorLoad(V1))
return getMOVDDup(Op, dl, V1, DAG);
return getMOVHighToLow(Op, dl, DAG);
// Use to match splats
- if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
+ if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef &&
(VT == MVT::v2f64 || VT == MVT::v2i64))
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
if (X86::isPSHUFDMask(SVOp)) {
// The actual implementation will match the mask in the if above and then
bool isLeft = false;
unsigned ShAmt = 0;
SDValue ShVal;
- bool isShift = getSubtarget()->hasXMMInt() &&
- isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
}
if (X86::isMOVLMask(SVOp)) {
- if (V1IsUndef)
- return V2;
if (ISD::isBuildVectorAllZeros(V1.getNode()))
return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
if (!X86::isMOVLPMask(SVOp)) {
}
// FIXME: fold these into legal mask.
- if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
+ if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2))
return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
if (X86::isMOVHLPSMask(SVOp))
V2IsSplat = isSplatVector(V2.getNode());
// Canonicalize the splat or undef, if present, to be on the RHS.
- if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+ if (V1IsSplat && !V2IsSplat) {
Op = CommuteVectorShuffle(SVOp, DAG);
SVOp = cast<ShuffleVectorSDNode>(Op);
V1 = SVOp->getOperand(0);
V2 = SVOp->getOperand(1);
std::swap(V1IsSplat, V2IsSplat);
- std::swap(V1IsUndef, V2IsUndef);
Commuted = true;
}
- if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
+ SmallVector<int, 32> M;
+ SVOp->getMask(M);
+
+ if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
// Shuffling low element of v1 into undef, just return v1.
if (V2IsUndef)
return V1;
return getMOVL(DAG, dl, VT, V2, V1);
}
- if (X86::isUNPCKLMask(SVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
+ if (isUNPCKLMask(M, VT, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- if (X86::isUNPCKHMask(SVOp))
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
+ if (isUNPCKHMask(M, VT, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
if (V2IsSplat) {
// Normalize mask so all entries that point to V2 points to its first
SDValue NewMask = NormalizeMask(SVOp, DAG);
ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
if (NSVOp != SVOp) {
- if (X86::isUNPCKLMask(NSVOp, true)) {
+ if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) {
return NewMask;
- } else if (X86::isUNPCKHMask(NSVOp, true)) {
+ } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) {
return NewMask;
}
}
SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
- if (X86::isUNPCKLMask(NewSVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
+ if (X86::isUNPCKLMask(NewSVOp, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG);
- if (X86::isUNPCKHMask(NewSVOp))
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
+ if (X86::isUNPCKHMask(NewSVOp, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG);
}
// Normalize the node to match x86 shuffle ops if needed
- if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
+ if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) ||
+ isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true)))
return CommuteVectorShuffle(SVOp, DAG);
// The checks below are all present in isShuffleMaskLegal, but they are
// inlined here right now to enable us to directly emit target specific
// nodes, and remove one by one until they don't return Op anymore.
- SmallVector<int, 16> M;
- SVOp->getMask(M);
- if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()))
+ if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()))
return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
- X86::getShufflePALIGNRImmediate(SVOp),
+ getShufflePALIGNRImmediate(SVOp),
DAG);
if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
SVOp->getSplatIndex() == 0 && V2IsUndef) {
- if (VT == MVT::v2f64)
- return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG);
- if (VT == MVT::v2i64)
- return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG);
+ if (VT == MVT::v2f64 || VT == MVT::v2i64)
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
}
if (isPSHUFHWMask(M, VT))
return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
X86::getShuffleSHUFImmediate(SVOp), DAG);
- if (X86::isUNPCKL_v_undef_Mask(SVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
- if (X86::isUNPCKH_v_undef_Mask(SVOp))
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+ if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
//===--------------------------------------------------------------------===//
// Generate target specific nodes for 128 or 256-bit shuffles only
//
// Handle VMOVDDUPY permutations
- if (isMOVDDUPYMask(SVOp, Subtarget))
+ if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
- // Handle VPERMILPS* permutations
- if (isVPERMILPSMask(M, VT, Subtarget))
- return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
- getShuffleVPERMILPSImmediate(SVOp), DAG);
-
- // Handle VPERMILPD* permutations
- if (isVPERMILPDMask(M, VT, Subtarget))
- return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
- getShuffleVPERMILPDImmediate(SVOp), DAG);
-
- // Handle VPERM2F128 permutations
- if (isVPERM2F128Mask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
- getShuffleVPERM2F128Immediate(SVOp), DAG);
+ // Handle VPERMILPS/D* permutations
+ if (isVPERMILPMask(M, VT, HasAVX))
+ return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+ getShuffleVPERMILPImmediate(SVOp), DAG);
- // Handle VSHUFPSY permutations
- if (isVSHUFPSYMask(M, VT, Subtarget))
- return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
- getShuffleVSHUFPSYImmediate(SVOp), DAG);
+ // Handle VPERM2F128/VPERM2I128 permutations
+ if (isVPERM2X128Mask(M, VT, HasAVX))
+ return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
+ V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- // Handle VSHUFPDY permutations
- if (isVSHUFPDYMask(M, VT, Subtarget))
+ // Handle VSHUFPS/DY permutations
+ if (isVSHUFPYMask(M, VT, HasAVX))
return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
- getShuffleVSHUFPDYImmediate(SVOp), DAG);
+ getShuffleVSHUFPYImmediate(SVOp), DAG);
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
Op.getOperand(0)),
Op.getOperand(1));
return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
- } else if (VT == MVT::i32) {
- // ExtractPS works with constant index.
+ } else if (VT == MVT::i32 || VT == MVT::i64) {
+ // ExtractPS/pextrq works with constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
}
assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
- if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
+ if (Subtarget->hasSSE41orAVX()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
if (Res.getNode())
return Res;
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
- } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
+ } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
+ isa<ConstantSDNode>(N2)) {
// PINSR* works with constant index.
return Op;
}
return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
}
- if (Subtarget->hasSSE41() || Subtarget->hasAVX())
+ if (Subtarget->hasSSE41orAVX())
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
if (EltVT == MVT::i8)
// load.
if (isGlobalStubReference(OpFlag))
Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
return Result;
}
// load.
if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getIntPtrConstant(0),
- MachinePointerInfo(Ptr), false, false, 0);
+ MachinePointerInfo(Ptr),
+ false, false, false, 0);
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
if (model == TLSModel::InitialExec)
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+ return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+ Chain.getValue(1));
}
assert(false &&
Op.getValueType(), MMO);
Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
- false, false, 0);
+ false, false, false, 0);
}
return Result;
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
- std::vector<Constant*> CV0;
+ SmallVector<Constant*,4> CV0;
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
Constant *C0 = ConstantVector::get(CV0);
SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
- std::vector<Constant*> CV1;
+ SmallVector<Constant*,2> CV1;
CV1.push_back(
ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
CV1.push_back(
SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
// Add the halves; easiest way is to swap them into another reg first.
// Load the result.
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
- FIST, StackSlot, MachinePointerInfo(), false, false, 0);
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
}
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
// Load the result.
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
- FIST, StackSlot, MachinePointerInfo(), false, false, 0);
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
}
SDValue X86TargetLowering::LowerFABS(SDValue Op,
EVT EltVT = VT;
if (VT.isVector())
EltVT = VT.getVectorElementType();
- std::vector<Constant*> CV;
+ SmallVector<Constant*,4> CV;
if (EltVT == MVT::f64) {
Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
- CV.push_back(C);
- CV.push_back(C);
+ CV.assign(2, C);
} else {
Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
- CV.push_back(C);
- CV.push_back(C);
- CV.push_back(C);
- CV.push_back(C);
+ CV.assign(4, C);
}
Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
EVT EltVT = VT;
- if (VT.isVector())
+ unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ if (VT.isVector()) {
EltVT = VT.getVectorElementType();
- std::vector<Constant*> CV;
+ NumElts = VT.getVectorNumElements();
+ }
+ SmallVector<Constant*,8> CV;
if (EltVT == MVT::f64) {
Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
- CV.push_back(C);
- CV.push_back(C);
+ CV.assign(NumElts, C);
} else {
Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
- CV.push_back(C);
- CV.push_back(C);
- CV.push_back(C);
- CV.push_back(C);
+ CV.assign(NumElts, C);
}
Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
if (VT.isVector()) {
+ MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(ISD::XOR, dl, MVT::v2i64,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+ DAG.getNode(ISD::XOR, dl, XORVT,
+ DAG.getNode(ISD::BITCAST, dl, XORVT,
Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask)));
+ DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
} else {
return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}
// type, and that won't be f80 since that is not custom lowered.
// First get the sign bit of second operand.
- std::vector<Constant*> CV;
+ SmallVector<Constant*,4> CV;
if (SrcVT == MVT::f64) {
CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
// Shift sign bit right or left if the two operands have different types.
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
// Or the value with the sign bit.
// climbing the DAG back to the root, and it doesn't seem to be worth the
// effort.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI)
- if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::CopyToReg &&
+ UI->getOpcode() != ISD::SETCC &&
+ UI->getOpcode() != ISD::STORE)
goto default_case;
if (ConstantSDNode *C =
}
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+ uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
- if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+
+ if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
LHS = AndLHS.getOperand(0);
RHS = AndLHS.getOperand(1);
}
+
+ // Use BT if the immediate can't be encoded in a TEST instruction.
+ if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+ LHS = AndLHS;
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
+ }
}
if (LHS.getNode()) {
UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
- }
- else if (SetCCOpcode == ISD::SETONE) {
+ } else if (SetCCOpcode == ISD::SETONE) {
SDValue ORD, NEQ;
ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
}
// Break 256-bit integer vector compare into smaller ones.
- if (!isFP && VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
return Lower256IntVSETCC(Op, DAG);
// We are handling one of the integer comparisons here. Since SSE only has
unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
bool Swap = false, Invert = false, FlipSigns = false;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
default: break;
- case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
- case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
- case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
- case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
+ case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+ case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+ case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
+ case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
}
switch (SetCCOpcode) {
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX())
+ if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX())
return SDValue();
- if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX())
+ if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX())
return SDValue();
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
- if (Cond.getOpcode() == X86ISD::SETCC ||
- Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
Cond = Cmp;
addTest = false;
}
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ addTest = false;
}
if (addTest) {
SDValue Dest = Op.getOperand(2);
DebugLoc dl = Op.getDebugLoc();
SDValue CC;
+ bool Inverted = false;
if (Cond.getOpcode() == ISD::SETCC) {
- SDValue NewCond = LowerSETCC(Cond, DAG);
- if (NewCond.getNode())
- Cond = NewCond;
+ // Check for setcc([su]{add,sub,mul}o == 0).
+ if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ isa<ConstantSDNode>(Cond.getOperand(1)) &&
+ cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+ Cond.getOperand(0).getResNo() == 1 &&
+ (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+ Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+ Inverted = true;
+ Cond = Cond.getOperand(0);
+ } else {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
}
#if 0
// FIXME: LowerXALUO doesn't handle these!!
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
- if (Cond.getOpcode() == X86ISD::SETCC ||
- Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
break;
}
}
+ }
+ CondOpcode = Cond.getOpcode();
+ if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (Inverted)
+ X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ addTest = false;
} else {
unsigned CondOpc;
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
CC = DAG.getConstant(CCode, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ // For FCMP_OEQ, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_P, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ // For FCMP_UNE, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_UNE.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_NP, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ Dest = FalseBB;
+ }
+ }
}
}
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
- EnableSegmentedStacks) &&
+ getTargetMachine().Options.EnableSegmentedStacks) &&
"This should be used only on Windows targets or when segmented stacks "
"are being used");
assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
bool Is64Bit = Subtarget->is64Bit();
EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- if (EnableSegmentedStacks) {
+ if (getTargetMachine().Options.EnableSegmentedStacks) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
- assert(!UseSoftFloat &&
+ assert(!getTargetMachine().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
.getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
Subtarget->hasXMM());
Chain,
VAARG,
MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
}
SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
case Intrinsic::x86_avx_hsub_pd_256:
return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
// Fix vector shift instructions where the last operand is a non-immediate
// i32 value.
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_psrai_d:
NewIntNo = Intrinsic::x86_sse2_psra_d;
break;
+ case Intrinsic::x86_avx2_pslli_w:
+ NewIntNo = Intrinsic::x86_avx2_psll_w;
+ break;
+ case Intrinsic::x86_avx2_pslli_d:
+ NewIntNo = Intrinsic::x86_avx2_psll_d;
+ break;
+ case Intrinsic::x86_avx2_pslli_q:
+ NewIntNo = Intrinsic::x86_avx2_psll_q;
+ break;
+ case Intrinsic::x86_avx2_psrli_w:
+ NewIntNo = Intrinsic::x86_avx2_psrl_w;
+ break;
+ case Intrinsic::x86_avx2_psrli_d:
+ NewIntNo = Intrinsic::x86_avx2_psrl_d;
+ break;
+ case Intrinsic::x86_avx2_psrli_q:
+ NewIntNo = Intrinsic::x86_avx2_psrl_q;
+ break;
+ case Intrinsic::x86_avx2_psrai_w:
+ NewIntNo = Intrinsic::x86_avx2_psra_w;
+ break;
+ case Intrinsic::x86_avx2_psrai_d:
+ NewIntNo = Intrinsic::x86_avx2_psra_d;
+ break;
default: {
ShAmtVT = MVT::v2i32;
switch (IntNo) {
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, getPointerTy(),
FrameAddr, Offset),
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- RetAddrFI, MachinePointerInfo(), false, false, 0);
+ RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
return FrameAddr;
}
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
// Transform as necessary
SDValue CWD1 =
return Lower256IntArith(Op, DAG);
}
-SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
+SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+ return Lower256IntArith(Op, DAG);
+
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ if (VT == MVT::v4i64) {
+ assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
- // Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.getSizeInBits() == 256)
- return Lower256IntArith(Op, DAG);
+ // ulong4 Ahi = __builtin_ia32_psrlqi256( a, 32);
+ // ulong4 Bhi = __builtin_ia32_psrlqi256( b, 32);
+ // ulong4 AloBlo = __builtin_ia32_pmuludq256( a, b );
+ // ulong4 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
+ // ulong4 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
+ //
+ // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
+ // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
+ // return AloBlo + AloBhi + AhiBlo;
+
+ SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ A, DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ B, DAG.getConstant(32, MVT::i32));
+ SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
+ A, B);
+ SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
+ A, Bhi);
+ SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
+ Ahi, B);
+ AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ AloBhi, DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ AhiBlo, DAG.getConstant(32, MVT::i32));
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+ return Res;
+ }
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
- DebugLoc dl = Op.getDebugLoc();
// ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
// ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
// AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
// return AloBlo + AloBhi + AhiBlo;
- SDValue A = Op.getOperand(0);
- SDValue B = Op.getOperand(1);
-
SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
A, DAG.getConstant(32, MVT::i32));
if (!Subtarget->hasXMMInt())
return SDValue();
- // Decompose 256-bit shifts into smaller 128-bit shifts.
- if (VT.getSizeInBits() == 256) {
- int NumElems = VT.getVectorNumElements();
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- // Extract the two vectors
- SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
-
- // Recreate the shift amount vectors
- SDValue Amt1, Amt2;
- if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
- // Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (int i = 0; i < NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (int i = NumElems/2; i < NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
-
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt1Csts[0], NumElems/2);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt2Csts[0], NumElems/2);
- } else {
- // Variable shift amount
- Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
- Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
- }
-
- // Issue new vector shifts for the smaller types
- V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
- V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
-
- // Concatenate the result back
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
- }
-
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
SDValue SclrAmt = Amt->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
uint64_t ShiftAmt = C->getZExtValue();
+ if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+
if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
+ if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+
if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
+ SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ }
+
+ // R s>> a === ((R u>> a) ^ m) - m
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+
+ if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ }
+ if (Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
+ SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ }
+
+ // R s>> a === ((R u>> a) ^ m) - m
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+ }
}
}
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
+ assert((Subtarget->hasSSE2() || Subtarget->hasAVX()) &&
+ "Need SSE2 for pslli/pcmpeq.");
+
// a = a << 5;
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
Op.getOperand(1), DAG.getConstant(5, MVT::i32));
- ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
- ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
+ // Turn 'a' into a mask suitable for VSELECT
+ SDValue VSelM = DAG.getConstant(0x80, VT);
+ SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
+ OpVSel, VSelM);
- std::vector<Constant*> CVM1(16, CM1);
- std::vector<Constant*> CVM2(16, CM2);
- Constant *C = ConstantVector::get(CVM1);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
- SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ SDValue CM1 = DAG.getConstant(0x0f, VT);
+ SDValue CM2 = DAG.getConstant(0x3f, VT);
- // r = pblendv(r, psllw(r & (char16)15, 4), a);
- M = DAG.getNode(ISD::AND, dl, VT, R, M);
+ // r = VSELECT(r, psllw(r & (char16)15, 4), a);
+ SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(4, MVT::i32));
- R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+ OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
+ OpVSel, VSelM);
- C = ConstantVector::get(CVM2);
- CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
- M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, 16);
-
- // r = pblendv(r, psllw(r & (char16)63, 2), a);
- M = DAG.getNode(ISD::AND, dl, VT, R, M);
+ // r = VSELECT(r, psllw(r & (char16)63, 2), a);
+ M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(2, MVT::i32));
- R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
-
- // return pblendv(r, r+r, a);
- R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
- R, DAG.getNode(ISD::ADD, dl, VT, R, R));
+ OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
+ OpVSel, VSelM);
+
+ // return VSELECT(r, r+r, a);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
+ DAG.getNode(ISD::ADD, dl, VT, R, R), R);
return R;
}
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.getSizeInBits() == 256) {
+ int NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 4> Amt1Csts;
+ SmallVector<SDValue, 4> Amt2Csts;
+ for (int i = 0; i < NumElems/2; ++i)
+ Amt1Csts.push_back(Amt->getOperand(i));
+ for (int i = NumElems/2; i < NumElems; ++i)
+ Amt2Csts.push_back(Amt->getOperand(i));
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt1Csts[0], NumElems/2);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt2Csts[0], NumElems/2);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
+
return SDValue();
}
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
DebugLoc dl = Op.getDebugLoc();
- SDNode* Node = Op.getNode();
- EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
- EVT VT = Node->getValueType(0);
+ EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ EVT VT = Op.getValueType();
+
if (Subtarget->hasXMMInt() && VT.isVector()) {
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
switch (VT.getSimpleVT().SimpleTy) {
default:
return SDValue();
- case MVT::v4i32: {
+ case MVT::v4i32:
SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
break;
- }
- case MVT::v8i16: {
+ case MVT::v8i16:
SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
break;
- }
+ case MVT::v8i32:
+ case MVT::v16i16:
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ if (!Subtarget->hasAVX2()) {
+ // No AVX2: split into two 128-bit halves, extend each, and concatenate.
+ int NumElems = VT.getVectorNumElements();
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ int ExtraNumElems = ExtraVT.getVectorNumElements();
+ ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+ ExtraNumElems/2);
+ SDValue Extra = DAG.getValueType(ExtraVT);
+
+ LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+ LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
+ }
+ if (VT == MVT::v8i32) {
+ SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d;
+ SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_d;
+ } else {
+ SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w;
+ SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w;
+ }
}
SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(SHLIntrinsicsID, MVT::i32),
- Node->getOperand(0), ShAmt);
-
- // In case of 1 bit sext, no need to shr
- if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
+ Op.getOperand(0), ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(SRAIntrinsicsID, MVT::i32),
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) const {
- EVT T = Node->getValueType(0);
DebugLoc dl = Node->getDebugLoc();
- assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+ assert (Node->getValueType(0) == MVT::i64 &&
+ "Only know how to expand i64 atomics");
SDValue Chain = Node->getOperand(0);
SDValue In1 = Node->getOperand(1);
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
- MachinePointerInfo(), false, false, 0));
+ MachinePointerInfo(),
+ false, false, false, 0));
}
return;
}
case X86ISD::PINSRW: return "X86ISD::PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
- case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
- case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
- case X86ISD::PSIGND: return "X86ISD::PSIGND";
+ case X86ISD::PSIGN: return "X86ISD::PSIGN";
+ case X86ISD::BLENDV: return "X86ISD::BLENDV";
+ case X86ISD::HADD: return "X86ISD::HADD";
+ case X86ISD::HSUB: return "X86ISD::HSUB";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::FHADD: return "X86ISD::FHADD";
- case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::ANDN: return "X86ISD::ANDN";
+ case X86ISD::BLSI: return "X86ISD::BLSI";
+ case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
+ case X86ISD::BLSR: return "X86ISD::BLSR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD";
case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD";
case X86ISD::MOVSD: return "X86ISD::MOVSD";
case X86ISD::MOVSS: return "X86ISD::MOVSS";
- case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS";
- case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD";
- case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY";
- case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS";
- case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD";
- case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW";
- case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD";
- case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ";
- case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ";
- case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW";
- case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
- case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
- case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
+ case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
+ case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
- case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
- case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY";
- case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
- case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY";
- case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128";
+ case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
+ case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
EVT VT) const {
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
- return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX());
+ return false;
// FIXME: pshufb, blends, shifts.
return (VT.getVectorNumElements() == 2 ||
isPSHUFDMask(M, VT) ||
isPSHUFHWMask(M, VT) ||
isPSHUFLWMask(M, VT) ||
- isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) ||
- isUNPCKLMask(M, VT) ||
- isUNPCKHMask(M, VT) ||
- isUNPCKL_v_undef_Mask(M, VT) ||
- isUNPCKH_v_undef_Mask(M, VT));
+ isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) ||
+ isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
+ isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
+ isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
+ isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
}
bool
return (isMOVLMask(Mask, VT) ||
isCommutedMOVLMask(Mask, VT, true) ||
isSHUFPMask(Mask, VT) ||
- isCommutedSHUFPMask(Mask, VT));
+ isSHUFPMask(Mask, VT, /* Commuted */ true));
}
return false;
}
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
unsigned numArgs, bool memArg) const {
- assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
+ assert(Subtarget->hasSSE42orAVX() &&
"Target must have SSE4.2 or AVX features enabled");
DebugLoc dl = MI->getDebugLoc();
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- assert(EnableSegmentedStacks);
+ assert(getTargetMachine().Options.EnableSegmentedStacks);
unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI->getOperand(1).getReg(),
physSPReg = Is64Bit ? X86::RSP : X86::ESP;
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
- BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg)
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
- .addReg(tmpSPVReg);
+ .addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
- .addReg(tmpSPVReg);
+ .addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
- .addReg(tmpSPVReg);
+ .addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
// Load the scalar.
SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
ScalarAddr, MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
// Replace the exact with the load.
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
break;
std::swap(LHS, RHS);
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMIN;
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMAX;
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
break;
std::swap(LHS, RHS);
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
(!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
break;
Opcode = X86ISD::FMIN;
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!UnsafeFPMath &&
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
- if (N1C && N0.getOpcode() == ISD::AND &&
+ if (VT.isInteger() && !VT.isVector() &&
+ N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
}
}
+
+ // Hardware support for vector shifts is sparse which makes us scalarize the
+ // vector operations in many cases. Also, on sandybridge ADD is faster than
+ // shl.
+ // (shl V, 1) -> add V,V
+ if (isSplatVector(N1.getNode())) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1C && (1 == N1C->getZExtValue())) {
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+ }
+ }
+
return SDValue();
}
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
- if (!VT.isVector() && VT.isInteger() &&
- N->getOpcode() == ISD::SHL)
- return PerformSHLCombine(N, DAG);
+ if (N->getOpcode() == ISD::SHL) {
+ SDValue V = PerformSHLCombine(N, DAG);
+ if (V.getNode()) return V;
+ }
// On X86 with SSE2 support, we can transform this to a vector shift if
// all elements are shifted by the same amount. We can't do this in legalize
if (!Subtarget->hasXMMInt())
return SDValue();
- if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+ (!Subtarget->hasAVX2() ||
+ (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
SDValue ShAmtOp = N->getOperand(1);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
ValOp, BaseShAmt);
+ if (VT == MVT::v4i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v16i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+ ValOp, BaseShAmt);
break;
case ISD::SRA:
if (VT == MVT::v4i32)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
ValOp, BaseShAmt);
+ if (VT == MVT::v8i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v16i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
+ ValOp, BaseShAmt);
break;
case ISD::SRL:
if (VT == MVT::v2i64)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
ValOp, BaseShAmt);
+ if (VT == MVT::v4i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v16i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+ ValOp, BaseShAmt);
break;
}
return SDValue();
EVT VT = N->getValueType(0);
- // Create ANDN instructions
+ // Create ANDN, BLSI, and BLSR instructions
+ // BLSI is X & (-X)
+ // BLSR is X & (X-1)
if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
+ // Check LHS for neg
+ if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+ isZero(N0.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+ // Check RHS for neg
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+ isZero(N1.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+ // Check LHS for X-1
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+ isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+ // Check RHS for X-1
+ if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+ isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+
return SDValue();
}
return R;
EVT VT = N->getValueType(0);
- if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
- return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// look for psign/blend
- if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
- if (VT == MVT::v2i64) {
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
- std::swap(N0, N1);
- // or (and (m, x), (pandn m, y))
- if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
- SDValue Mask = N1.getOperand(0);
- SDValue X = N1.getOperand(1);
- SDValue Y;
- if (N0.getOperand(0) == Mask)
- Y = N0.getOperand(1);
- if (N0.getOperand(1) == Mask)
- Y = N0.getOperand(0);
-
- // Check to see if the mask appeared in both the AND and ANDNP and
- if (!Y.getNode())
- return SDValue();
+ if (VT == MVT::v2i64 || VT == MVT::v4i64) {
+ if (!Subtarget->hasSSSE3orAVX() ||
+ (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
+ return SDValue();
- // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
- if (Mask.getOpcode() != ISD::BITCAST ||
- X.getOpcode() != ISD::BITCAST ||
- Y.getOpcode() != ISD::BITCAST)
- return SDValue();
+ // Canonicalize pandn to RHS
+ if (N0.getOpcode() == X86ISD::ANDNP)
+ std::swap(N0, N1);
+ // or (and (m, x), (pandn m, y))
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
+ SDValue Mask = N1.getOperand(0);
+ SDValue X = N1.getOperand(1);
+ SDValue Y;
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+
+ // Check to see if the mask appeared in both the AND and ANDNP.
+ if (!Y.getNode())
+ return SDValue();
- // Look through mask bitcast.
- Mask = Mask.getOperand(0);
- EVT MaskVT = Mask.getValueType();
+ // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+ if (Mask.getOpcode() != ISD::BITCAST ||
+ X.getOpcode() != ISD::BITCAST ||
+ Y.getOpcode() != ISD::BITCAST)
+ return SDValue();
- // Validate that the Mask operand is a vector sra node. The sra node
- // will be an intrinsic.
- if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
- return SDValue();
+ // Look through mask bitcast.
+ Mask = Mask.getOperand(0);
+ EVT MaskVT = Mask.getValueType();
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
- switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_sse2_psrai_d:
- break;
- default: return SDValue();
- }
+ // Validate that the Mask operand is a vector sra node. The sra node
+ // will be an intrinsic.
+ if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+ return SDValue();
- // Check that the SRA is all signbits.
- SDValue SraC = Mask.getOperand(2);
- unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
- unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
- if ((SraAmt + 1) != EltBits)
- return SDValue();
+ // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+ // there is no psrai.b
+ switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ break;
+ default: return SDValue();
+ }
- DebugLoc DL = N->getDebugLoc();
-
- // Now we know we at least have a plendvb with the mask val. See if
- // we can form a psignb/w/d.
- // psign = x.type == y.type == mask.type && y = sub(0, x);
- X = X.getOperand(0);
- Y = Y.getOperand(0);
- if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
- ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
- X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
- unsigned Opc = 0;
- switch (EltBits) {
- case 8: Opc = X86ISD::PSIGNB; break;
- case 16: Opc = X86ISD::PSIGNW; break;
- case 32: Opc = X86ISD::PSIGND; break;
- default: break;
- }
- if (Opc) {
- SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
- return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
- }
- }
- // PBLENDVB only available on SSE 4.1
- if (!(Subtarget->hasSSE41() || Subtarget->hasAVX()))
- return SDValue();
+ // Check that the SRA is all signbits.
+ SDValue SraC = Mask.getOperand(2);
+ unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ if ((SraAmt + 1) != EltBits)
+ return SDValue();
- X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
- Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
- Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y);
- return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Now we know we at least have a pblendvb with the mask val. See if
+ // we can form a psignb/w/d.
+ // psign = x.type == y.type == mask.type && y = sub(0, x);
+ X = X.getOperand(0);
+ Y = Y.getOperand(0);
+ if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+ ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+ X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() &&
+ (EltBits == 8 || EltBits == 16 || EltBits == 32)) {
+ SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X,
+ Mask.getOperand(1));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
}
+ // PBLENDVB only available on SSE 4.1
+ if (!Subtarget->hasSSE41orAVX())
+ return SDValue();
+
+ EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+ X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
+ Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
+ Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
}
}
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
return SDValue();
}
+/// PerformXorCombine - Do target-specific dag combines on XOR nodes.
+///
+/// Recognizes X ^ (X-1), which reaches here as xor(X, add(X, -1)) with the
+/// operands in either order, and replaces it with an X86ISD::BLSMSK node.
+/// Only i32 and i64 values are handled, and only once the combiner is no
+/// longer running before operation legalization.
+///
+/// Returns the replacement node, or an empty SDValue when nothing matched.
+static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // BLSMSK belongs to the BMI extension. Gate on the subtarget feature the
+  // same way the BLSI/BLSR formation in the AND combine does, so the node is
+  // never created for a target that cannot select it.
+  if (!Subtarget->hasBMI())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // Create BLSMSK instructions by finding X ^ (X-1)
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check LHS for X-1, i.e. add(X, all-ones) where X is the RHS.
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+      isAllOnes(N0.getOperand(1)))
+    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
+
+  // Check RHS for X-1, i.e. add(X, all-ones) where X is the LHS.
+  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+      isAllOnes(N1.getOperand(1)))
+    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
+
+  return SDValue();
+}
+
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->getAlignment());
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
// Insert the word loaded into a vector.
SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we are saving a concatination of two XMM registers, perform two stores.
+ // If we are saving a concatenation of two XMM registers, perform two stores.
// This is better in Sandy Bridge cause one 256-bit mem op is done via two
// 128-bit ones. If in the future the cost becomes only one memory access the
// first version would be better.
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
- bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
+ bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasXMMInt();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->getAlignment());
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
SDValue NewChain = NewLd.getValue(1);
if (TokenFactorIndex != -1) {
Ops.push_back(NewChain);
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
Ld->getPointerInfo(),
Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->getAlignment());
+ Ld->isInvariant(), Ld->getAlignment());
SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
Ld->getPointerInfo().getWithOffset(4),
Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(),
MinAlign(Ld->getAlignment(), 4));
SDValue NewChain = LoLd.getValue(1);
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// Look for the following pattern: if
// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >
return false;
EVT VT = LHS.getValueType();
- unsigned N = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for horizontal add/sub");
+
+ // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
+ // operate independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ assert((NumLaneElts % 2 == 0) &&
+ "Vector type should have an even number of elements in each lane");
+ unsigned HalfLaneElts = NumLaneElts/2;
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// NOTE: in what follows a default initialized SDValue represents an UNDEF of
// type VT.
SDValue A, B;
- SmallVector<int, 8> LMask(N);
+ SmallVector<int, 16> LMask(NumElts);
if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
A = LHS.getOperand(0);
} else {
if (LHS.getOpcode() != ISD::UNDEF)
A = LHS;
- for (unsigned i = 0; i != N; ++i)
+ for (unsigned i = 0; i != NumElts; ++i)
LMask[i] = i;
}
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
- SmallVector<int, 8> RMask(N);
+ SmallVector<int, 16> RMask(NumElts);
if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
C = RHS.getOperand(0);
} else {
if (RHS.getOpcode() != ISD::UNDEF)
C = RHS;
- for (unsigned i = 0; i != N; ++i)
+ for (unsigned i = 0; i != NumElts; ++i)
RMask[i] = i;
}
// If A and B occur in reverse order in RHS, then "swap" them (which means
// rewriting the mask).
if (A != C)
- for (unsigned i = 0; i != N; ++i) {
- unsigned Idx = RMask[i];
- if (Idx < N)
- RMask[i] += N;
- else if (Idx < 2*N)
- RMask[i] -= N;
- }
+ CommuteVectorShuffleMask(RMask, NumElts);
// At this point LHS and RHS are equivalent to
// LHS = VECTOR_SHUFFLE A, B, LMask
// RHS = VECTOR_SHUFFLE A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
- for (unsigned i = 0; i != N; ++i) {
- unsigned LIdx = LMask[i], RIdx = RMask[i];
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int LIdx = LMask[i], RIdx = RMask[i];
// Ignore any UNDEF components.
- if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
- || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
continue;
// Check that successive elements are being operated on. If not, this is
// not a horizontal operation.
- if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
- !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+ unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
+ unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
+ int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
+ if (!(LIdx == Index && RIdx == Index + 1) &&
+ !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
return false;
}
SDValue RHS = N->getOperand(1);
// Try to synthesize horizontal adds from adds of shuffles.
- if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
- (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, true))
return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
return SDValue();
SDValue RHS = N->getOperand(1);
// Try to synthesize horizontal subs from subs of shuffles.
- if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
- (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, false))
return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
return SDValue();
DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}
-static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+/// PerformAddCombine - Do target-specific dag combines on integer adds.
+///
+/// First tries to turn an add of two shuffles into a single X86ISD::HADD:
+///   - v8i16 / v4i32 when the subtarget reports hasSSSE3orAVX()
+///   - v16i16 / v8i32 when the subtarget reports hasAVX2()
+/// If no horizontal add is formed, the node is handed to
+/// OptimizeConditionalInDecrement and its result is returned.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  // Every alternative must compare against VT: a bare MVT::v8i32 is a
+  // non-zero enumerator and would make the 256-bit type test always true.
+  // Op0/Op1 are passed by reference; on success isHorizontalBinOp rewrites
+  // them to the two source vectors of the horizontal operation. Add is
+  // commutative, hence the 'true'.
+  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
}
}
+ // Try to synthesize horizontal subs from subs of shuffles.
+ EVT VT = N->getValueType(0);
+ if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+
return OptimizeConditionalInDecrement(N, DAG);
}
case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
- case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG);
- case ISD::SUB: return PerformSubCombine(N, DAG);
+ case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
+ case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
case ISD::SHL:
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
case X86ISD::SHUFPS: // Handle all target specific shuffles
case X86ISD::SHUFPD:
case X86ISD::PALIGN:
- case X86ISD::PUNPCKHBW:
- case X86ISD::PUNPCKHWD:
- case X86ISD::PUNPCKHDQ:
- case X86ISD::PUNPCKHQDQ:
- case X86ISD::UNPCKHPS:
- case X86ISD::UNPCKHPD:
- case X86ISD::VUNPCKHPSY:
- case X86ISD::VUNPCKHPDY:
- case X86ISD::PUNPCKLBW:
- case X86ISD::PUNPCKLWD:
- case X86ISD::PUNPCKLDQ:
- case X86ISD::PUNPCKLQDQ:
- case X86ISD::UNPCKLPS:
- case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPSY:
- case X86ISD::VUNPCKLPDY:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case X86ISD::VPERMILPS:
- case X86ISD::VPERMILPSY:
- case X86ISD::VPERMILPD:
- case X86ISD::VPERMILPDY:
- case X86ISD::VPERM2F128:
+ case X86ISD::VPERMILP:
+ case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
}
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
+// Helper that checks whether an asm statement consists of exactly the given
+// pieces, in order, with whitespace (spaces or tabs) between them. The
+// variadic arguments are C strings and the list must end with a null pointer.
+// Leading whitespace in the statement is ignored; anything left over after
+// the last piece makes the match fail.
+static bool END_WITH_NULL matchAsm(StringRef s, ...) {
+  va_list Pieces;
+  va_start(Pieces, s);
+
+  // Skip leading whitespace.
+  StringRef Rest = s.substr(s.find_first_not_of(" \t"));
+  bool AllMatched = true;
+
+  for (const char *Raw = va_arg(Pieces, const char *); Raw;
+       Raw = va_arg(Pieces, const char *)) {
+    StringRef Piece(Raw);
+
+    // The remaining text has to begin with the expected piece.
+    if (!Rest.startswith(Piece)) {
+      AllMatched = false;
+      break;
+    }
+    Rest = Rest.substr(Piece.size());
+
+    // A non-blank character directly after the piece means we only matched
+    // a prefix of a longer token.
+    StringRef::size_type NextTok = Rest.find_first_not_of(" \t");
+    if (NextTok == 0) {
+      AllMatched = false;
+      break;
+    }
+    Rest = Rest.substr(NextTok);
+  }
+
+  va_end(Pieces);
+  return AllMatched && Rest.empty();
+}
+
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
std::string AsmStr = IA->getAsmString();
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
switch (AsmPieces.size()) {
default: return false;
case 1:
- AsmStr = AsmPieces[0];
- AsmPieces.clear();
- SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
-
// FIXME: this should verify that we are targeting a 486 or better. If not,
- // we will turn this bswap into something that will be lowered to logical ops
- // instead of emitting the bswap asm. For now, we don't support 486 or lower
- // so don't worry about this.
+ // we will turn this bswap into something that will be lowered to logical
+ // ops instead of emitting the bswap asm. For now, we don't support 486 or
+ // lower so don't worry about this.
// bswap $0
- if (AsmPieces.size() == 2 &&
- (AsmPieces[0] == "bswap" ||
- AsmPieces[0] == "bswapq" ||
- AsmPieces[0] == "bswapl") &&
- (AsmPieces[1] == "$0" ||
- AsmPieces[1] == "${0:q}")) {
+ if (matchAsm(AsmPieces[0], "bswap", "$0", NULL) ||
+ matchAsm(AsmPieces[0], "bswapl", "$0", NULL) ||
+ matchAsm(AsmPieces[0], "bswapq", "$0", NULL) ||
+ matchAsm(AsmPieces[0], "bswap", "${0:q}", NULL) ||
+ matchAsm(AsmPieces[0], "bswapl", "${0:q}", NULL) ||
+ matchAsm(AsmPieces[0], "bswapq", "${0:q}", NULL)) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
return IntrinsicLowering::LowerToByteSwap(CI);
}
+
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
- AsmPieces.size() == 3 &&
- (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
- AsmPieces[1] == "$$8," &&
- AsmPieces[2] == "${0:w}" &&
- IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}", NULL) ||
+ matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}", NULL))) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
AsmPieces[0] == "~{cc}" &&
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}") {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
+ AsmPieces[3] == "~{fpsr}")
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
if (CI->getType()->isIntegerTy(32) &&
- IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
- SmallVector<StringRef, 4> Words;
- SplitString(AsmPieces[0], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
- Words[2] == "${0:w}") {
- Words.clear();
- SplitString(AsmPieces[1], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
- Words[2] == "$0") {
- Words.clear();
- SplitString(AsmPieces[2], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
- Words[2] == "${0:w}") {
- AsmPieces.clear();
- const std::string &ConstraintsStr = IA->getConstraintString();
- SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- std::sort(AsmPieces.begin(), AsmPieces.end());
- if (AsmPieces.size() == 4 &&
- AsmPieces[0] == "~{cc}" &&
- AsmPieces[1] == "~{dirflag}" &&
- AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}") {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- }
- }
- }
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}", NULL) &&
+ matchAsm(AsmPieces[1], "rorl", "$$16,", "$0", NULL) &&
+ matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}", NULL)) {
+ AsmPieces.clear();
+ const std::string &ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ std::sort(AsmPieces.begin(), AsmPieces.end());
+ if (AsmPieces.size() == 4 &&
+ AsmPieces[0] == "~{cc}" &&
+ AsmPieces[1] == "~{dirflag}" &&
+ AsmPieces[2] == "~{flags}" &&
+ AsmPieces[3] == "~{fpsr}")
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
if (CI->getType()->isIntegerTy(64)) {
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- SmallVector<StringRef, 4> Words;
- SplitString(AsmPieces[0], Words, " \t");
- if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
- Words.clear();
- SplitString(AsmPieces[1], Words, " \t");
- if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
- Words.clear();
- SplitString(AsmPieces[2], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
- Words[2] == "%edx") {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- }
- }
+ if (matchAsm(AsmPieces[0], "bswap", "%eax", NULL) &&
+ matchAsm(AsmPieces[1], "bswap", "%edx", NULL) &&
+ matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx", NULL))
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;