#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
-static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl,
- unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
- EVT VT = Vec.getValueType();
- EVT ElVT = VT.getVectorElementType();
- unsigned Factor = VT.getSizeInBits()/vectorWidth;
- EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
- VT.getVectorNumElements()/Factor);
-
- // Extract from UNDEF is UNDEF.
- if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getUNDEF(ResultVT);
-
- // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
- unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
-
- // This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
- * ElemsPerChunk);
-
- // If the input is a buildvector just emit a smaller one.
- if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
- ElemsPerChunk));
-
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
-}
-
-/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
-/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
-/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
-/// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
-/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert((Vec.getValueType().is256BitVector() ||
- Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
-}
-
-/// Generate a DAG to grab 256-bits from a 512-bit vector.
-static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
-}
-
-static SDValue InsertSubVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl, unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
- // Inserting UNDEF is Result
- if (Vec.getOpcode() == ISD::UNDEF)
- return Result;
- EVT VT = Vec.getValueType();
- EVT ElVT = VT.getVectorElementType();
- EVT ResultVT = Result.getValueType();
-
- // Insert the relevant vectorWidth bits.
- unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
-
- // This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
- * ElemsPerChunk);
-
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
-}
-
-/// Generate a DAG to put 128-bits into a vector > 128 bits. This
-/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
-/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
-/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit boundary. That makes
-/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG,SDLoc dl) {
- assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
-}
-
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
-}
-
-/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
-/// instructions. This is used because creating CONCAT_VECTOR nodes of
-/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
-/// large BUILD_VECTORS.
-static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
-}
-
-static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
-}
-
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// MMX-sized vectors (other than x86mmx) are expected to be expanded
// into smaller operations.
- setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
- setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
- setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
- setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
- setOperationAction(ISD::AND, MVT::v8i8, Expand);
- setOperationAction(ISD::AND, MVT::v4i16, Expand);
- setOperationAction(ISD::AND, MVT::v2i32, Expand);
- setOperationAction(ISD::AND, MVT::v1i64, Expand);
- setOperationAction(ISD::OR, MVT::v8i8, Expand);
- setOperationAction(ISD::OR, MVT::v4i16, Expand);
- setOperationAction(ISD::OR, MVT::v2i32, Expand);
- setOperationAction(ISD::OR, MVT::v1i64, Expand);
- setOperationAction(ISD::XOR, MVT::v8i8, Expand);
- setOperationAction(ISD::XOR, MVT::v4i16, Expand);
- setOperationAction(ISD::XOR, MVT::v2i32, Expand);
- setOperationAction(ISD::XOR, MVT::v1i64, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
+ for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
+ setOperationAction(ISD::MULHS, MMXTy, Expand);
+ setOperationAction(ISD::AND, MMXTy, Expand);
+ setOperationAction(ISD::OR, MMXTy, Expand);
+ setOperationAction(ISD::XOR, MMXTy, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
+ setOperationAction(ISD::SELECT, MMXTy, Expand);
+ setOperationAction(ISD::BITCAST, MMXTy, Expand);
+ }
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
- setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
- setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
- setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
- setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
- setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
- setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
- setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-
- setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ }
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
setOperationAction(ISD::SUB, MVT::v32i16, Legal);
setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::AND, MVT::v8i32, Legal);
setOperationAction(ISD::OR, MVT::v8i32, Legal);
setOperationAction(ISD::XOR, MVT::v4i32, Legal);
}
- // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
- // of this type with custom code.
- for (MVT VT : MVT::vector_valuetypes())
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
-
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
- computeRegisterProperties();
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// On Darwin, -Os means optimize for size without hurting performance,
// do not reduce the limit.
return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}
-// FIXME: Why this routine is here? Move to RegInfo!
-std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
- return TargetLowering::findRepresentativeClass(VT);
+ return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
const Function* Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
MFI->CreateFixedObject(1, StackSize, true));
}
+ MachineModuleInfo &MMI = MF.getMMI();
+ const Function *WinEHParent = nullptr;
+ if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
+ WinEHParent = MMI.getWinEHParent(Fn);
+ bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
+ bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
+
// Figure out if XMM registers are in use.
assert(!(MF.getTarget().Options.UseSoftFloat &&
Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs =
- CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
- unsigned NumXMMRegs =
- CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
}
if (IsWin64) {
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ } else if (IsWinEHOutlined) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject(
+ /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false));
+
+ MMI.getWinEHFuncInfo(Fn)
+ .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] =
+ FuncInfo->getRegSaveFrameIndex();
+
+ // Store the second integer parameter (rdx) into rsp+16 relative to the
+ // stack pointer at the entry of the function.
+ SDValue RSFIN =
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
+ unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
+ Chain = DAG.getStore(
+ Val.getValue(1), dl, Val, RSFIN,
+ MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()),
+ /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0);
}
if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
FuncInfo->setArgumentStackSize(StackSize);
+ if (IsWinEHParent) {
+ int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
+ MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
+ SDValue Neg2 = DAG.getConstant(-2, MVT::i64);
+ Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
+ MachinePointerInfo::getFixedStack(UnwindHelpFI),
+ /*isVolatile=*/true,
+ /*isNonTemporal=*/false, /*Alignment=*/0);
+ }
+
return Chain;
}
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return true;
}
-/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
-/// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
- unsigned NumElems) {
- for (unsigned i = 0; i != NumElems; ++i) {
- int idx = Mask[i];
- if (idx < 0)
- continue;
- else if (idx < (int)NumElems)
- Mask[i] = idx + NumElems;
- else
- Mask[i] = idx - NumElems;
- }
-}
-
/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for instruction that extract 128 or 256 bit vectors
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
- assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+
+ assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
+ && "Unexpected vector type");
+ assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
+ && "Unexpected vector type");
SDValue Cst = DAG.getConstant(0, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
llvm_unreachable("Unexpected vector type");
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
+
+ // Extract from UNDEF is UNDEF.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(ResultVT);
+
+ // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
+ * ElemsPerChunk);
+
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
+ ElemsPerChunk));
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+}
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl, unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ // Inserting UNDEF is Result
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
+
+ // Insert the relevant vectorWidth bits.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
+ * ElemsPerChunk);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+ // For insertion into the zero index (low half) of a 256-bit vector, it is
+ // more efficient to generate a blend with immediate instead of an insert*128.
+ // We are still creating an INSERT_SUBVECTOR below with an undef node to
+ // extend the subvector to the size of the result vector. Make sure that
+ // we are not recursing on that node by checking for undef here.
+ if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+ Result.getOpcode() != ISD::UNDEF) {
+ EVT ResultVT = Result.getValueType();
+ SDValue ZeroIndex = DAG.getIntPtrConstant(0);
+ SDValue Undef = DAG.getUNDEF(ResultVT);
+ SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+ Vec, ZeroIndex);
+
+ // The blend instruction, and therefore its mask, depend on the data type.
+ MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+ if (ScalarType.isFloatingPoint()) {
+ // Choose either vblendps (float) or vblendpd (double).
+ unsigned ScalarSize = ScalarType.getSizeInBits();
+ assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+ unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+ SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+ }
+
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+ // AVX2 is needed for 256-bit integer blend support.
+ // Integers must be cast to 32-bit because there is only vpblendd;
+ // vpblendw can't be used for this because it has a handicapped mask.
+
+ // If we don't have AVX2, then cast to float. Using a wrong domain blend
+ // is still more efficient than using the wrong domain vinsertf128 that
+ // will be created by InsertSubVector().
+ MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+ SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
+ Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+ Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+ return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+ }
+
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
/// getOnesVector - Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
SDLoc dl(Op);
SDValue V;
bool First = true;
+
+ // SSE4.1 - use PINSRB to insert each byte directly.
+ if (Subtarget->hasSSE41()) {
+ for (unsigned i = 0; i < 16; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v16i8);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v16i8, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i));
+ }
+ }
+
+ return V;
+ }
+
+ // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
}
- SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
- if (Broadcast.getNode())
+ if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
return Broadcast;
unsigned EVTBits = ExtVT.getSizeInBits();
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.is256BitVector() || VT.is512BitVector()) {
+ if (VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));
}
- assert(VT.is128BitVector() && "Expected an SSE value type!");
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
if (VT.is256BitVector()) {
- SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ if (Subtarget->hasAVX()) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ } else {
+ // Without AVX, we need to extend to a 128-bit vector and then
+ // insert into the 256-bit vector.
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ }
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
- if (EVTBits == 8 && NumElems == 16) {
- SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this);
- if (V.getNode()) return V;
- }
+ if (EVTBits == 8 && NumElems == 16)
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this))
+ return V;
- if (EVTBits == 16 && NumElems == 8) {
- SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this);
- if (V.getNode()) return V;
- }
+ if (EVTBits == 16 && NumElems == 8)
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this))
+ return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
- if (EVTBits == 32 && NumElems == 4) {
- SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
- if (V.getNode())
+ if (EVTBits == 32 && NumElems == 4)
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
return V;
- }
// If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector<SDValue, 8> V(NumElems);
V[i] = Op.getOperand(i);
// Check for elements which are consecutive loads.
- SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
- if (LD.getNode())
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
return LD;
// Check for a build vector from mostly shuffle plus few inserting.
- SDValue Sh = buildFromShuffleMostly(Op, DAG);
- if (Sh.getNode())
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
- if(ResVT.is256BitVector())
+ if (ResVT.is256BitVector())
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
if (Op.getNumOperands() == 4) {
return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG & DAG) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+ unsigned NumOfOperands = Op.getNumOperands();
+
+ assert(isPowerOf2_32(NumOfOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ if (NumOfOperands > 2) {
+ MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+ ResVT.getVectorNumElements()/2);
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0; i < NumOfOperands/2; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ Ops.clear();
+ for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ if (IsZeroV1 && IsZeroV2)
+ return getZeroVector(ResVT, Subtarget, DAG, dl);
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0);
+ SDValue Undef = DAG.getUNDEF(ResVT);
+ unsigned NumElems = ResVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
+
+ V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
+ V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
+ if (IsZeroV1)
+ return V2;
+
+ V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+ // Zero the upper bits of V1
+ V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
+ V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
+ if (IsZeroV2)
+ return V1;
+ return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
+ assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
"a sorted mask where the broadcast "
"comes from V1.");
- // Go up the chain of (vector) values to try and find a scalar load that
- // we can combine with the broadcast.
+ // Go up the chain of (vector) values to find a scalar load that we can
+ // combine with the broadcast.
for (;;) {
switch (V.getOpcode()) {
case ISD::CONCAT_VECTORS: {
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
- // If the scalar isn't a load we can't broadcast from it in AVX1, only with
- // AVX2.
+ // If the scalar isn't a load, we can't broadcast from it in AVX1.
+ // Only AVX2 has register broadcasts.
if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
return SDValue();
} else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
- // We can't broadcast from a vector register w/o AVX2, and we can only
+ // We can't broadcast from a vector register without AVX2, and we can only
// broadcast from the zero-element of a vector register.
return SDValue();
}
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
+static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
}
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return V;
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return V;
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
return Unpack;
// We implement this with SHUFPS because it can blend from two vectors.
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
-static SDValue lowerV8I16SingleInputVectorShuffle(
- SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+ SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+ assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
- Mask, Subtarget, DAG))
- return Broadcast;
-
- // Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V, V, Mask, DAG))
- return Shift;
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V, V, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
- if (isShuffleEquivalent(V, V, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
-
- // Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
- return Rotate;
-
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// Adjust the mask to match the new locations of A and B.
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- Mask);
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// At this point, each half should contain all its inputs, and we can then
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DAG));
// Do a half shuffle with the high mask after shifting its values down.
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DAG));
return V;
int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
- if (NumV2Inputs == 0)
- return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+ if (NumV2Inputs == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
+ if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG);
+ }
assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
"All single-input shuffles should be canonicalized to be V1-input "
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return V;
return BitBlend;
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return V;
int LaneSize = Mask.size() / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
- // less expensive. The flags track wether the given lane contains an element
+ // less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
bool LaneCrossing[2] = {false, false};
for (int i = 0, Size = Mask.size(); i < Size; ++i)
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ // TODO: If minimizing size and one of the inputs is a zero vector and the
+ // the zero vector has only one use, we could use a VPERM2X128 to save the
+ // instruction bytes needed to explicitly generate the zero vector.
+
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- // Check for patterns which can be matched with a single insert of a 128-bit
- // subvector.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
- }
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
- DAG.getIntPtrConstant(2));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // If either input operand is a zero vector, use VPERM2X128 because its mask
+ // allows us to replace the zero input with an implicit zero.
+ if (!IsV1Zero && !IsV2Zero) {
+ // Check for patterns which can be matched with a single insert of a 128-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+ }
+
+ // Otherwise form a 128-bit permutation. After accounting for undefs,
+ // convert the 64-bit shuffle mask selection values into 128-bit
+ // selection bits by dividing the indexes by 2 and shifting into positions
+ // defined by a vperm2*128 instruction's immediate control byte.
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
+ int MaskLO = Mask[0];
+ if (MaskLO == SM_SentinelUndef)
+ MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
+
+ int MaskHI = Mask[2];
+ if (MaskHI == SM_SentinelUndef)
+ MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+
+ unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+
+ // If either input is a zero vector, replace it with an undef input.
+ // Shuffle mask values < 4 are selecting elements of V1.
+ // Shuffle mask values >= 4 are selecting elements of V2.
+ // Adjust each half of the permute mask by clearing the half that was
+ // selecting the zero vector and setting the zero mask bit.
+ if (IsV1Zero) {
+ V1 = DAG.getUNDEF(VT);
+ if (MaskLO < 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI < 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+ if (IsV2Zero) {
+ V2 = DAG.getUNDEF(VT);
+ if (MaskLO >= 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI >= 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
}
- // Otherwise form a 128-bit permutation.
- // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
- unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, MVT::i8));
}
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
Mask, Subtarget, DAG))
return Broadcast;
if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
- // If we have a single input to the zero element, insert that into V1 if we
- // can do so cheaply.
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
- if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
- return Insertion;
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
Mask, Subtarget, DAG))
return Broadcast;
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = VT.getVectorNumElements();
+ int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+ return M >= NumElts;
+ });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
// There is a really nice hard cut-over between AVX1 and AVX2 that means we can
// check for those subtargets here and avoid much of the subtarget querying in
// the per-vector-type lowering routines. With AVX1 we have essentially *zero*
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have supprot for
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
- SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
- if (BlendOp.getNode())
+ if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget->hasSSE41())
return SDValue();
- // Some types for vselect were previously set to Expand, not Legal or
- // Custom. Return an empty SDValue so we fall-through to Expand, after
- // the Custom lowering phase.
- MVT VT = Op.getSimpleValueType();
- switch (VT.SimpleTy) {
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op, and but if we need to expand, return
+ // a null value.
+ switch (Op.getSimpleValueType().SimpleTy) {
default:
- break;
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget->hasAVX2())
+ return Op;
+
+ return SDValue();
+
case MVT::v8i16:
case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget->hasBWI() && Subtarget->hasVLX())
- break;
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
return SDValue();
}
-
- // We couldn't create a "Blend with immediate" node.
- // This node should still be legal, but we'll have to emit a blendv*
- // instruction.
- return Op;
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
- // Get the desired 128-bit vector half.
+ // With a 256-bit vector, we can insert into the zero element efficiently
+ // using a blend if we have AVX or AVX2 and the right data type.
+ if (VT.is256BitVector() && IdxVal == 0) {
+ // TODO: It is worthwhile to cast integer to floating point and back
+ // and incur a domain crossing penalty if that's what we'll end up
+ // doing anyway after extracting to a 128-bit vector.
+ if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
+ SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ N2 = DAG.getIntPtrConstant(1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ }
+ }
+
+ // Get the desired 128-bit vector chunk.
SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
- // Insert the element into the desired half.
+ // Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getConstant(IdxIn128, MVT::i32));
- // Insert the changed part back to the 256-bit vector
+ // Insert the changed part back into the bigger vector
return Insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
}
if (EltVT == MVT::f32) {
- // Bits [7:6] of the constant are the source select. This will always be
- // zero here. The DAG Combiner may combine an extract_elt index into
- // these
- // bits. For example (insert (extract, 3), 2) could be matched by
- // putting
- // the '3' into bits [7:6] of X86ISD::INSERTPS.
- // Bits [5:4] of the constant are the destination select. This is the
- // value of the incoming immediate.
- // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
N2 = DAG.getIntPtrConstant(IdxVal << 4);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ if (OpVT.getVectorElementType() == MVT::i1) {
+ if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
+ return Op;
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0);
+ SDValue Undef = DAG.getUNDEF(OpVT);
+ unsigned NumElems = OpVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
+
+ if (IdxVal == OpVT.getVectorNumElements() / 2) {
+ // Zero upper bits of the Vec
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+
+ SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
+ Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+ }
+ if (IdxVal == 0) {
+ SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
+ // Zero upper bits of the Vec2
+ Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+ Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
+ // Zero lower bits of the Vec
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ // Merge them together
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+ }
+ }
return SDValue();
}
// Now we have only mask extension
assert(InVT.getVectorElementType() == MVT::i1);
SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
- const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+ const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
}
SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
- const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+ const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
SDValue CP = DAG.getConstantPool(C, getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
EVT VT = Op1.getValueType();
SDValue CC;
- // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
- // are available. Otherwise fp cmovs get lowered into a less efficient branch
- // sequence later on.
+ // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available or VBLENDV if AVX is available.
+ // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
(Subtarget->hasSSE1() && VT == MVT::f32)) &&
DAG.getConstant(SSECC, MVT::i8));
return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
}
+
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, MVT::i8));
+
+ // If we have AVX, we can use a variable vector select (VBLENDV) instead
+ // of 3 logic instructions for size savings and potentially speed.
+ // Unfortunately, there is no scalar form of VBLENDV.
+
+ // If either operand is a constant, don't try this. We can expect to
+ // optimize away at least one of the logic instructions later in that
+ // case, so that sequence would be faster than a variable blend.
+
+ // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+
+ if (Subtarget->hasAVX() &&
+ !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+ // Convert to vectors, do a VSELECT, and convert back to scalar.
+ // All of the conversions should be optimized away.
+
+ EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+ SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+ SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+ EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
+
+ SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ VSel, DAG.getIntPtrConstant(0));
+ }
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
SDValue Src2 = Op.getOperand(2);
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- SDValue RoundingMode = Op.getOperand(5);
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With supress-all-exceptions (sae) - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode),
+ RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ // Operands intentionally swapped. Mask is last operand to intrinsic,
+ // but second operand for node/instruction.
+ return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(1));
+
case Intrinsic::x86_avx512_mask_valign_q_512:
case Intrinsic::x86_avx512_mask_valign_d_512:
// Vector source operands are swapped.
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
- unsigned HintVal;
- if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
- (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
- llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+ unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+ assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
DAG);
}
- if (VT == MVT::v16i8) {
- if (Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
-
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
- }
- llvm_unreachable("Unknown shift opcode.");
- }
+ if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
// R s>> a === ((R u>> a) ^ m) - m
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(NumElts,
+ DAG.getConstant(128 >> ShiftAmt, MVT::i8));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
- SDValue V;
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
- V = LowerScalarImmediateShift(Op, DAG, Subtarget);
- if (V.getNode())
+ if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
- V = LowerScalarVariableShift(Op, DAG, Subtarget);
- if (V.getNode())
+ if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
return Op;
+
// AVX2 has VPSLLV/VPSRAV/VPSRLV.
if (Subtarget->hasInt256()) {
if (Op.getOpcode() == ISD::SRL &&
return Op;
}
+ // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+ // shifts per-lane and then shuffle the partial results back together.
+ if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+ // Splat the shift amounts so the scalar shifts above will catch it.
+ SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+ SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+ SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+ }
+
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
- }
+ }
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (unsigned i = 0; i != NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (unsigned i = NumElems/2; i != NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
+ SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
+ ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
+ ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-// Sign extension of the low part of vector elements. This may be used either
-// when sign extend instructions are not available or if the vector element
-// sizes already match the sign-extended size. If the vector elements are in
-// their pre-extended size and sign extend instructions are available, that will
-// be handled by LowerSIGN_EXTEND.
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
- EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- MVT VT = Op.getSimpleValueType();
-
- if (!Subtarget->hasSSE2() || !VT.isVector())
- return SDValue();
-
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
-
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasFp256())
- return SDValue();
- if (!Subtarget->hasInt256()) {
- // needs to be split
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
- }
- // fall through
- case MVT::v4i32:
- case MVT::v8i16: {
- SDValue Op0 = Op.getOperand(0);
-
- // This is a sign extension of some low part of vector elements without
- // changing the size of the vector elements themselves:
- // Shift-Left + Shift-Right-Algebraic.
- SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
- BitsDiff, DAG);
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
- DAG);
- }
- }
-}
-
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
return needsCmpXchgNb(PTy->getElementType());
}
-bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+TargetLoweringBase::AtomicRMWExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
- if (MemType->getPrimitiveSizeInBits() > NativeWidth)
- return needsCmpXchgNb(MemType);
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+ return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg
+ : AtomicRMWExpansionKind::None;
+ }
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
- return false;
+ return AtomicRMWExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
- return !AI->use_empty();
+ return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg
+ : AtomicRMWExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMin:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
- return true;
+ return AtomicRMWExpansionKind::CmpXChg;
}
}
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
// 9 ) EFLAGS (implicit-def)
assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
- assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+ static_assert(X86::AddrNumOperands == 5,
+ "VAARG_64 assumes 5 address operands");
unsigned DestReg = MI->getOperand(0).getReg();
MachineOperand &Base = MI->getOperand(1);
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
+
+ // We also lower double CMOVs:
+ // (CMOV (CMOV F, T, cc1), T, cc2)
+ // to two successives branches. For that, we look for another CMOV as the
+ // following instruction.
+ //
+ // Without this, we would add a PHI between the two jumps, which ends up
+ // creating a few copies all around. For instance, for
+ //
+ // (sitofp (zext (fcmp une)))
+ //
+ // we would generate:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // movaps %xmm0, %xmm1
+ // jne .LBB5_2
+ // xorps %xmm1, %xmm1
+ // .LBB5_2:
+ // jp .LBB5_4
+ // movaps %xmm1, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ // because this custom-inserter would have generated:
+ //
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ // | \
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // B: empty
+ // C: Z = PHI [X, A], [Y, B]
+ // D: empty
+ // E: PHI [X, C], [Z, D]
+ //
+ // If we lower both CMOVs in a single step, we can instead generate:
+ //
+ // A
+ // | \
+ // | C
+ // | /|
+ // |/ |
+ // | |
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // D: empty
+ // E: PHI [X, A], [X, C], [Y, D]
+ //
+ // Which, in our sitofp/fcmp example, gives us something like:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // jne .LBB5_4
+ // jp .LBB5_4
+ // xorps %xmm0, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ MachineInstr *NextCMOV = nullptr;
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+ if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
+ NextCMOV = &*NextMIIt;
+
+ MachineBasicBlock *jcc1MBB = nullptr;
+
+ // If we have a double CMOV, we lower it to two successive branches to
+ // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (NextCMOV) {
+ jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, jcc1MBB);
+ jcc1MBB->addLiveIn(X86::EFLAGS);
+ }
+
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, copy0MBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- if (!MI->killsRegister(X86::EFLAGS) &&
- !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+
+ MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
sinkMBB->addLiveIn(X86::EFLAGS);
}
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
- BB->addSuccessor(copy0MBB);
+ if (NextCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ BB->addSuccessor(jcc1MBB);
+
+ // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
+ // jump to the sinkMBB.
+ jcc1MBB->addSuccessor(copy0MBB);
+ jcc1MBB->addSuccessor(sinkMBB);
+ } else {
+ BB->addSuccessor(copy0MBB);
+ }
+
+ // The true block target of the first (or only) branch is always sinkMBB.
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+ if (NextCMOV) {
+ unsigned Opc2 = X86::GetCondBranchFromCond(
+ (X86::CondCode)NextCMOV->getOperand(3).getImm());
+ BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
+ }
+
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(X86::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineInstrBuilder MIB =
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ // If we have a double CMOV, the second Jcc provides the same incoming
+ // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+ if (NextCMOV) {
+ MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+ DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg());
+ NextCMOV->eraseFromParent();
+ }
MI->eraseFromParent(); // The pseudo instruction is gone now.
return sinkMBB;
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
- Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
// Note that even with AVX we prefer the PSHUFD form of shuffle for integer
// vectors because it can have a load folded into it that UNPCK cannot. This
// doesn't preclude something switching to the shorter encoding post-RA.
- if (FloatDomain) {
- if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
- bool Lo = Mask.equals(0, 0);
+ //
+ // FIXME: Should teach these routines about AVX vector widths.
+ if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
+ bool Lo = Mask.equals({0, 0});
unsigned Shuffle;
MVT ShuffleVT;
// Check if we have SSE3 which will let us use MOVDDUP. That instruction
return true;
}
if (Subtarget->hasSSE3() &&
- (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
- bool Lo = Mask.equals(0, 0, 2, 2);
+ (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
+ bool Lo = Mask.equals({0, 0, 2, 2});
unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root->getOpcode() == Shuffle)
/*AddTo*/ true);
return true;
}
- if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
- bool Lo = Mask.equals(0, 0, 1, 1);
+ if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
+ bool Lo = Mask.equals({0, 0, 1, 1});
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root->getOpcode() == Shuffle)
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain &&
- (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
- Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
- Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
- Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
- 15))) {
+ if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals(
+ {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
bool Lo = Mask[0] == 0;
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
if (Depth == 1 && Root->getOpcode() == Shuffle)
// in practice PSHUFB tends to be *very* fast so we're more aggressive.
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
SmallVector<SDValue, 16> PSHUFBMask;
- assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
- int Ratio = 16 / Mask.size();
- for (unsigned i = 0; i < 16; ++i) {
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Ratio = NumBytes / Mask.size();
+ for (int i = 0; i < NumBytes; ++i) {
if (Mask[i / Ratio] == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
: 255;
PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
}
- Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
DCI.AddToWorklist(Op.getNode());
SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+ Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
DCI.AddToWorklist(Op.getNode());
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
/*AddTo*/ true);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return false; // Bail if we hit a non-vector.
- // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
- // version should be added.
- if (VT.getSizeInBits() != 128)
- return false;
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
+ // If we have more than 128-bits, only the low 128-bits of shuffle mask
+ // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
+
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
- if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+ V.getSimpleValueType().getScalarType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
- assert(VT == MVT::v8i16);
- (void)VT;
+ assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
- if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+ if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
DCI.AddToWorklist(V.getNode());
- V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DAG));
DCI.AddToWorklist(V.getNode());
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
- const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
- const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
- if (std::equal(std::begin(MappedMask), std::end(MappedMask),
- std::begin(UnpackLoMask)) ||
- std::equal(std::begin(MappedMask), std::end(MappedMask),
- std::begin(UnpackHiMask))) {
+ if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
- DL, MVT::v8i16, V, V);
+ DL, VT, V, V);
}
}
}
}
}
- // Only handle 128 wide vector from here on.
- if (!VT.is128BitVector())
- return SDValue();
-
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
}
}
- // If we know that this node is legal then we know that it is going to be
- // matched by one of the SSE/AVX BLEND instructions. These instructions only
- // depend on the highest bit in each word. Try to use SimplifyDemandedBits
- // to simplify previous instructions.
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ // If this is a *dynamic* select (non-constant condition) and we can match
+ // this node with one of the variable blend instructions, restructure the
+ // condition so that the blends can use the high bit of each element and use
+ // SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
- // We explicitly check against SSE4.1, v8i16 and v16i16 because, although
- // vselect nodes may be marked as Custom, they might only be legal when
- // Cond is a build_vector of constants. This will be taken care in
- // a later condition.
- (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) &&
- Subtarget->hasSSE41() && VT != MVT::v16i16 && VT != MVT::v8i16) &&
- // Don't optimize vector of constants. Those are handled by
- // the generic code and all the bits must be properly set for
- // the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
if (BitWidth == 1)
return SDValue();
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getScalarType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2
+ if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
+ !Subtarget->hasAVX2())
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize()) {
- SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
return SDValue();
}
return SDValue();
}
+/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
+/// Match:
+/// (X86or (X86setcc) (X86setcc))
+/// (X86cmp (and (X86setcc) (X86setcc)), 0)
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+ X86::CondCode &CC1, SDValue &Flags,
+ bool &isAnd) {
+ if (Cond->getOpcode() == X86ISD::CMP) {
+ ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+ if (!CondOp1C || !CondOp1C->isNullValue())
+ return false;
+
+ Cond = Cond->getOperand(0);
+ }
+
+ isAnd = false;
+
+ SDValue SetCC0, SetCC1;
+ switch (Cond->getOpcode()) {
+ default: return false;
+ case ISD::AND:
+ case X86ISD::AND:
+ isAnd = true;
+ // fallthru
+ case ISD::OR:
+ case X86ISD::OR:
+ SetCC0 = Cond->getOperand(0);
+ SetCC1 = Cond->getOperand(1);
+ break;
+ };
+
+ // Make sure we have SETCC nodes, using the same flags value.
+ if (SetCC0.getOpcode() != X86ISD::SETCC ||
+ SetCC1.getOpcode() != X86ISD::SETCC ||
+ SetCC0->getOperand(1) != SetCC1->getOperand(1))
+ return false;
+
+ CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
+ CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+ Flags = SetCC0->getOperand(1);
+ return true;
+}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
}
}
+ // Fold and/or of setcc's to double CMOV:
+ // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+ // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+ //
+ // This combine lets us generate:
+ // cmovcc1 (jcc1 if we don't have CMOV)
+ // cmovcc2 (same)
+ // instead of:
+ // setcc1
+ // setcc2
+ // and/or
+ // cmovne (jne if we don't have CMOV)
+ // When we can't use the CMOV instruction, it might increase branch
+ // mispredicts.
+ // When we can use CMOV, or when there is no mispredict, this improves
+ // throughput and reduces register pressure.
+ //
+ if (CC == X86::COND_NE) {
+ SDValue Flags;
+ X86::CondCode CC0, CC1;
+ bool isAndSetCC;
+ if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+ if (isAndSetCC) {
+ std::swap(FalseOp, TrueOp);
+ CC0 = X86::GetOppositeBranchCondition(CC0);
+ CC1 = X86::GetOppositeBranchCondition(CC1);
+ }
+
+ SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8),
+ Flags};
+ SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags};
+ SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+ return CMOV;
+ }
+ }
+
return SDValue();
}
default: return SDValue();
// SSE/AVX/AVX2 blend intrinsics.
case Intrinsic::x86_avx2_pblendvb:
- case Intrinsic::x86_avx2_pblendw:
- case Intrinsic::x86_avx2_pblendd_128:
- case Intrinsic::x86_avx2_pblendd_256:
// Don't try to simplify this intrinsic if we don't have AVX2.
if (!Subtarget->hasAVX2())
return SDValue();
// FALL-THROUGH
- case Intrinsic::x86_avx_blend_pd_256:
- case Intrinsic::x86_avx_blend_ps_256:
case Intrinsic::x86_avx_blendv_pd_256:
case Intrinsic::x86_avx_blendv_ps_256:
// Don't try to simplify this intrinsic if we don't have AVX.
if (!Subtarget->hasAVX())
return SDValue();
// FALL-THROUGH
- case Intrinsic::x86_sse41_pblendw:
- case Intrinsic::x86_sse41_blendpd:
- case Intrinsic::x86_sse41_blendps:
case Intrinsic::x86_sse41_blendvps:
case Intrinsic::x86_sse41_blendvpd:
case Intrinsic::x86_sse41_pblendvb: {
// an and with a mask.
// We'd like to try to combine that into a shuffle with zero
// plus a bitcast, removing the and.
- if (N0.getOpcode() != ISD::BITCAST ||
+ if (N0.getOpcode() != ISD::BITCAST ||
N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
unsigned ResSize = N1.getValueType().getScalarSizeInBits();
// Make sure the splat matches the mask we expect
- if (SplatBitSize > ResSize ||
+ if (SplatBitSize > ResSize ||
(SplatValue + 1).exactLogBase2() != (int)SrcSize)
return SDValue();
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
- if (Zext.getNode())
+ if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
return Zext;
- SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
- if (R.getNode())
+ if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
EVT VT = N->getValueType(0);
// If A and B occur in reverse order in RHS, then "swap" them (which means
// rewriting the mask).
if (A != C)
- CommuteVectorShuffleMask(RMask, NumElts);
+ ShuffleVectorSDNode::commuteMask(RMask);
// At this point LHS and RHS are equivalent to
// LHS = VECTOR_SHUFFLE A, B, LMask
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
-
+
return SDValue();
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
- LHS.getValueType(), RHS, LHS.getOperand(1));
- return DAG.getSetCC(SDLoc(N), N->getValueType(0),
- addV, DAG.getConstant(0, addV.getValueType()), CC);
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS,
+ LHS.getOperand(1));
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
+ DAG.getConstant(0, addV.getValueType()), CC);
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
- RHS.getValueType(), LHS, RHS.getOperand(1));
- return DAG.getSetCC(SDLoc(N), N->getValueType(0),
- addV, DAG.getConstant(0, addV.getValueType()), CC);
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS,
+ RHS.getOperand(1));
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
+ DAG.getConstant(0, addV.getValueType()), CC);
}
- if (VT.getScalarType() == MVT::i1) {
- bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
- bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
- if (!IsSEXT0 && !IsVZero0)
- return SDValue();
- bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ if (VT.getScalarType() == MVT::i1 &&
+ (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
+ bool IsSEXT0 =
+ (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
- if (!IsSEXT1 && !IsVZero1)
- return SDValue();
+ if (!IsSEXT0 || !IsVZero1) {
+ // Swap the operands and update the condition code.
+ std::swap(LHS, RHS);
+ CC = ISD::getSetCCSwappedOperands(CC);
+
+ IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+ }
if (IsSEXT0 && IsVZero1) {
- assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
- if (CC == ISD::SETEQ)
+ assert(VT == LHS.getOperand(0).getValueType() &&
+ "Uexpected operand type");
+ if (CC == ISD::SETGT)
+ return DAG.getConstant(0, VT);
+ if (CC == ISD::SETLE)
+ return DAG.getConstant(1, VT);
+ if (CC == ISD::SETEQ || CC == ISD::SETGE)
return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+ assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+ "Unexpected condition code!");
return LHS.getOperand(0);
}
- if (IsSEXT1 && IsVZero0) {
- assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
- if (CC == ISD::SETEQ)
- return DAG.getNOT(DL, RHS.getOperand(0), VT);
- return RHS.getOperand(0);
- }
}
return SDValue();
if (MayFoldLoad(Ld)) {
// Extract the countS bits from the immediate so we can get the proper
// address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, interps doesn't use
+ // When the second source op is a memory address, insertps doesn't use
// countS and just gets an f32 from that address.
unsigned DestIndex =
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
- } else
- return SDValue();
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+ }
+ return SDValue();
}
static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
// pattern-matching possibilities related to scalar math ops in SSE/AVX.
// x86InstrInfo knows how to commute this back after instruction selection
// if it would help register allocation.
-
+
// TODO: If optimizing for size or a processor that doesn't suffer from
// partial register update stalls, this should be transformed into a MOVSD
// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
-namespace {
- // Helper to match a string separated by whitespace.
- bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
- s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
-
- for (unsigned i = 0, e = args.size(); i != e; ++i) {
- StringRef piece(*args[i]);
- if (!s.startswith(piece)) // Check if the piece matches.
- return false;
+// Helper to match a string separated by whitespace.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+ S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
- s = s.substr(piece.size());
- StringRef::size_type pos = s.find_first_not_of(" \t");
- if (pos == 0) // We matched a prefix.
- return false;
+ for (StringRef Piece : Pieces) {
+ if (!S.startswith(Piece)) // Check if the piece matches.
+ return false;
- s = s.substr(pos);
- }
+ S = S.substr(Piece.size());
+ StringRef::size_type Pos = S.find_first_not_of(" \t");
+ if (Pos == 0) // We matched a prefix.
+ return false;
- return s.empty();
+ S = S.substr(Pos);
}
- const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+
+ return S.empty();
}
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
- if (matchAsm(AsmPieces[0], "bswap", "$0") ||
- matchAsm(AsmPieces[0], "bswapl", "$0") ||
- matchAsm(AsmPieces[0], "bswapq", "$0") ||
- matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
- matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
- matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
+ if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
- (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
- matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
+ (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+ matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
- matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
- matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
- matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+ matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+ matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+ matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
- matchAsm(AsmPieces[1], "bswap", "%edx") &&
- matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+ if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+ matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+ matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;
case 'G':
case 'C':
- if (dyn_cast<ConstantFP>(CallOperandVal)) {
+ if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-std::pair<unsigned, const TargetRegisterClass*>
-X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ const std::string &Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
- Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {