#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+ // [SU][MIN|MAX] are available for all NEON types apart from i64.
+ if (!VT.isFloatingPoint() &&
+ VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+ for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+ setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
- // The the values aren't constants, this isn't the pattern we're looking for.
+ // The values aren't constants, this isn't the pattern we're looking for.
if (!CFVal || !CTVal)
return Op;
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- SDNode *Elt = N->getOperand(i).getNode();
+ for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
unsigned HalfSize = EltSize / 2;
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
- const Triple TT(getTargetMachine().getTargetTriple());
+ const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
// If we're doing a tall call, use a TC_RETURN here rather than an
// actual call instruction.
- if (IsTailCall)
+ if (IsTailCall) {
+ MF.getFrameInfo()->setHasTailCall();
return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ }
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
EltVT = MVT::i64;
VecVT = MVT::v2i64;
- // We want to materialize a mask with the the high bit set, but the AdvSIMD
+ // We want to materialize a mask with the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
EltMask = 0;
/// operations would *not* be semantically equivalent.
static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
if (Cmp == Result)
- return true;
+ return (Cmp.getValueType() == MVT::f32 ||
+ Cmp.getValueType() == MVT::f64);
ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
EVT VT = TVal.getValueType();
-
- // Try to match this select into a max/min operation, which have dedicated
- // opcode in the instruction set.
- // FIXME: This is not correct in the presence of NaNs, so we only enable this
- // in no-NaNs mode.
- if (getTargetMachine().Options.NoNaNsFPMath) {
- SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
- if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
- selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
- CC = ISD::getSetCCSwappedOperands(CC);
- std::swap(MinMaxLHS, MinMaxRHS);
- }
-
- if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
- selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
- switch (CC) {
- default:
- break;
- case ISD::SETGT:
- case ISD::SETGE:
- case ISD::SETUGT:
- case ISD::SETUGE:
- case ISD::SETOGT:
- case ISD::SETOGE:
- return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
- break;
- case ISD::SETLT:
- case ISD::SETLE:
- case ISD::SETULT:
- case ISD::SETULE:
- case ISD::SETOLT:
- case ISD::SETOLE:
- return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
- break;
- }
- }
- }
-
- // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
- // and do the comparison.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
.Default(0);
if (Reg)
return Reg;
- report_fatal_error("Invalid register name global variable");
+ report_fatal_error(Twine("Invalid register name \""
+ + StringRef(RegName) + "\"."));
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
-AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI, const std::string &Constraint,
- MVT VT) const {
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
- const std::string Reg =
- std::string(&Constraint[2], &Constraint[Size - 1]);
- int RegNo = atoi(Reg.c_str());
- if (RegNo >= 0 && RegNo <= 31) {
+ int RegNo;
+ bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
+ if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
return NumBits == 32 || NumBits == 64;
}
+/// \brief Lower an interleaved load into a ldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
+/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
+///
+/// Into:
+/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
+/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool AArch64TargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ const DataLayout *DL = getDataLayout();
+
+ VectorType *VecTy = Shuffles[0]->getType();
+ unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
+
+ // Skip illegal vector types.
+ if (VecSize != 64 && VecSize != 128)
+ return false;
+
+ // A pointer vector can not be the return type of the ldN intrinsics. Need to
+ // load integer vectors first and then convert to pointer vectors.
+ Type *EltTy = VecTy->getVectorElementType();
+ if (EltTy->isPointerTy())
+ VecTy = VectorType::get(DL->getIntPtrType(EltTy),
+ VecTy->getVectorNumElements());
+
+ Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *Tys[2] = {VecTy, PtrTy};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
+ Intrinsic::aarch64_neon_ld3,
+ Intrinsic::aarch64_neon_ld4};
+ Function *LdNFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+ IRBuilder<> Builder(LI);
+ Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+
+ CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SVI = Shuffles[i];
+ unsigned Index = Indices[i];
+
+ Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+ SVI->replaceAllUsesWith(SubVec);
+ }
+
+ return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned NumElts) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumElts; i++)
+ Mask.push_back(Builder.getInt32(Start + i));
+
+ return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a stN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// st3 instruction in CodeGen.
+bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ VectorType *VecTy = SVI->getType();
+ assert(VecTy->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+ Type *EltTy = VecTy->getVectorElementType();
+ VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+ const DataLayout *DL = getDataLayout();
+ unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
+
+ // Skip illegal vector types.
+ if (SubVecSize != 64 && SubVecSize != 128)
+ return false;
+
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+ IRBuilder<> Builder(SI);
+
+ // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+ // vectors to integer vectors.
+ if (EltTy->isPointerTy()) {
+ Type *IntTy = DL->getIntPtrType(EltTy);
+ unsigned NumOpElts =
+ dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+
+ // Convert to the corresponding integer vector.
+ Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+ Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+ Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+ SubVecTy = VectorType::get(IntTy, NumSubElts);
+ }
+
+ Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
+ Type *Tys[2] = {SubVecTy, PtrTy};
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
+ Intrinsic::aarch64_neon_st3,
+ Intrinsic::aarch64_neon_st4};
+ Function *StNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 5> Ops;
+
+ // Split the shufflevector operands into sub vectors for the new stN call.
+ for (unsigned i = 0; i < Factor; i++)
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+ Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ return true;
+}
+
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
unsigned AlignCheck) {
return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty) const {
+ Type *Ty,
+ unsigned AS) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
}
int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
- Type *Ty) const {
+ Type *Ty,
+ unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
- if (isLegalAddressingMode(AM, Ty))
+ if (isLegalAddressingMode(AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
+// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
+// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
- // We can handle most types of duplicate, but the lane ones have an extra
- // operand saying *which* lane, so we need to know.
- bool IsDUPLANE;
switch (N.getOpcode()) {
case AArch64ISD::DUP:
- IsDUPLANE = false;
- break;
case AArch64ISD::DUPLANE8:
case AArch64ISD::DUPLANE16:
case AArch64ISD::DUPLANE32:
case AArch64ISD::DUPLANE64:
- IsDUPLANE = true;
+ case AArch64ISD::MOVI:
+ case AArch64ISD::MOVIshift:
+ case AArch64ISD::MOVIedit:
+ case AArch64ISD::MOVImsl:
+ case AArch64ISD::MVNIshift:
+ case AArch64ISD::MVNImsl:
break;
default:
+ // FMOV could be supported, but isn't very useful, as it would only occur
+ // if you passed a bitcast' floating point immediate to an eligible long
+ // integer op (addl, smull, ...).
return SDValue();
}
MVT ElementTy = NarrowTy.getVectorElementType();
unsigned NumElems = NarrowTy.getVectorNumElements();
- MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+ MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
SDLoc dl(N);
- SDValue NewDUP;
- if (IsDUPLANE)
- NewDUP = DAG.getNode(N.getOpcode(), dl, NewDUPVT, N.getOperand(0),
- N.getOperand(1));
- else
- NewDUP = DAG.getNode(AArch64ISD::DUP, dl, NewDUPVT, N.getOperand(0));
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, NewDUP,
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
+ DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
DAG.getConstant(NumElems, dl, MVT::i64));
}
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
+/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC
+/// to match FMIN/FMAX patterns.
+static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) {
+ // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y".
+ // Unless the NoNaNsFPMath option is set, be careful about NaNs:
+ // vmax/vmin return NaN if either operand is a NaN;
+ // only do the transformation when it matches that behavior.
+
+ SDValue CondLHS = N->getOperand(0);
+ SDValue CondRHS = N->getOperand(1);
+ SDValue LHS = N->getOperand(2);
+ SDValue RHS = N->getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+ unsigned Opcode;
+ bool IsReversed;
+ if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) &&
+ selectCCOpsAreFMaxCompatible(CondRHS, RHS)) {
+ IsReversed = false; // x CC y ? x : y
+ } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) &&
+ selectCCOpsAreFMaxCompatible(CondLHS, RHS)) {
+ IsReversed = true ; // x CC y ? y : x
+ } else {
+ return SDValue();
+ }
+
+ bool IsUnordered = false, IsOrEqual;
+ switch (CC) {
+ default:
+ return SDValue();
+ case ISD::SETULT:
+ case ISD::SETULE:
+ IsUnordered = true;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE);
+ Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN;
+ break;
+
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ IsUnordered = true;
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE);
+ Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX;
+ break;
+ }
+
+ // If LHS is NaN, an ordered comparison will be false and the result will be
+ // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking
+ // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN.
+ if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
+ return SDValue();
+
+ // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true,
+ // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be
+ // used for unsafe math or if one of the operands is known to be nonzero.
+ if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ return SDValue();
+
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
+}
+
+/// Get rid of unnecessary NVCASTs (that don't change the type).
+static SDValue performNVCASTCombine(SDNode *N) {
+ if (N->getValueType(0) == N->getOperand(0).getValueType())
+ return N->getOperand(0);
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
+ case ISD::SELECT_CC:
+ return performSelectCCCombine(N, DCI.DAG);
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
+ case AArch64ISD::NVCAST:
+ return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::INTRINSIC_VOID:
Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
- return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
+ return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
Intrinsic::ID Int =
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
- return Builder.CreateCall2(
- Stxr, Builder.CreateZExtOrBitCast(
- Val, Stxr->getFunctionType()->getParamType(0)),
- Addr);
+ return Builder.CreateCall(Stxr,
+ {Builder.CreateZExtOrBitCast(
+ Val, Stxr->getFunctionType()->getParamType(0)),
+ Addr});
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(