#include "llvm/Intrinsics.h"
#include "llvm/Type.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
// This option should go away when tail calls fully work.
static cl::opt<bool>
setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal);
setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
if (VT.isInteger()) {
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ // Custom handling for some vector types to avoid expensive expansions
+ setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+ setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+ setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
- // FIXME: Shouldn't need this, since no register is used, but the legalizer
- // doesn't yet know how to not do that for SjLj.
- setExceptionSelectorRegister(ARM::R0);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setExceptionPointerRegister(ARM::R0);
+ setExceptionSelectorRegister(ARM::R1);
+
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
else
setSchedulingPreference(Sched::Hybrid);
- maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type
+ //// temporary - rewrite interface to use type
+ maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
benefitFromCodePlacementOpt = true;
}
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of it's super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurence of cross class copies and subregister insertions
+// and extractions.
std::pair<const TargetRegisterClass*, uint8_t>
ARMTargetLowering::findRepresentativeClass(EVT VT) const{
const TargetRegisterClass *RRC = 0;
case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
RRC = ARM::DPRRegisterClass;
+ // When NEON is used for SP, only half of the register file is available
+ // because operations that define both SP and DP results will be constrained
+ // to the VFP2 class (D0-D15). We currently model this constraint prior to
+ // coalescing by double-counting the SP regs. See the FIXME above.
+ if (Subtarget->useNEONForSinglePrecisionFP())
+ Cost = 2;
break;
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
switch (Opcode) {
default: return 0;
case ARMISD::Wrapper: return "ARMISD::Wrapper";
+ case ARMISD::WrapperDYN: return "ARMISD::WrapperDYN";
+ case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
case ARMISD::CALL: return "ARMISD::CALL";
case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
+ case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
+ case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
+ case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
+ case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
+ case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
+ case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
+ case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
+ case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
+ case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
+ case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
+ case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
+ case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
+ case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
+ case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
+ case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
+ case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
+ case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
}
}
return Sched::RegPressure;
}
+// FIXME: Move to RegInfo
unsigned
ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const TargetFrameInfo *TFI = MF.getTarget().getFrameInfo();
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
switch (RC->getID()) {
default:
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
// Create a constant pool entry for the callee address
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
ARMPCLabelIndex,
ARMCP::CPValue, 0);
const char *Sym = S->getSymbol();
// Create a constant pool entry for the callee address
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
Sym, ARMPCLabelIndex, 0);
// Get the address of the callee into a register
isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
// tBX takes a register source operand.
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
ARMPCLabelIndex,
ARMCP::CPValue, 4);
// tBX takes a register source operand.
const char *Sym = S->getSymbol();
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
Sym, ARMPCLabelIndex, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+ if (!TargetRegisterInfo::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
} else {
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
- ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex,
ARMCP::CPBlockAddress,
PCAdj);
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
if (GV->isDeclaration()) {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
// Initial exec model.
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
Result = DAG.getLoad(PtrVT, dl, Chain, Result,
MachinePointerInfo::getGOT(), false, false, 0);
return Result;
+ }
+
+ // If we have T2 ops, we can materialize the address directly via movt/movw
+ // pair. This is always cheaper.
+ if (Subtarget->useMovt()) {
+ ++NumMovwMovt;
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes.
+ return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT));
} else {
- // If we have T2 ops, we can materialize the address directly via movt/movw
- // pair. This is always cheaper.
- if (Subtarget->useMovt()) {
- return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
- DAG.getTargetGlobalAddress(GV, dl, PtrVT));
- } else {
- SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, 0);
- }
+ SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(),
+ false, false, 0);
}
}
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned ARMPCLabelIndex = 0;
EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ if (Subtarget->useMovt()) {
+ ++NumMovwMovt;
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes.
+ if (RelocM == Reloc::Static)
+ return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+
+ unsigned Wrapper = (RelocM == Reloc::PIC_)
+ ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
+ SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+ if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, 0);
+ return Result;
+ }
+
+ unsigned ARMPCLabelIndex = 0;
SDValue CPAddr;
- if (RelocM == Reloc::Static)
+ if (RelocM == Reloc::Static) {
CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
- else {
- ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ } else {
+ ARMPCLabelIndex = AFI->createPICLabelUId();
unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
ARMConstantPoolValue *CPV =
new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
"GLOBAL OFFSET TABLE not implemented for non-ELF targets");
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
unsigned NumGPRs = CCInfo.getFirstUnallocated
(GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
- unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
unsigned VARegSize = (4 - NumGPRs) * 4;
unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
unsigned ArgOffset = CCInfo.getNextStackOffset();
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
EVT SrcVT = Tmp1.getValueType();
- SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
- SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32);
- SDValue FP0 = DAG.getConstantFP(0.0, SrcVT);
- SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl);
+ bool F2IisFast = Subtarget->isCortexA9() ||
+ Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR;
+
+ // Bitcast operand 1 to i32.
+ if (SrcVT == MVT::f64)
+ Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ &Tmp1, 1).getValue(1);
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+ // If float to int conversion isn't going to be super expensive, then simply
+ // or in the signbit.
+ if (F2IisFast) {
+ SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+ SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+ Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+ if (VT == MVT::f32) {
+ Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+ DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+ }
+
+ // f64: Or the high part with signbit and then combine two parts.
+ Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ &Tmp0, 1);
+ SDValue Lo = Tmp0.getValue(0);
+ SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+ Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+ return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+ }
+
+ // Remove the signbit of operand 0.
+ Tmp0 = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
+
+ // If operand 1 signbit is one, then negate operand 0.
+ SDValue ARMcc;
+ SDValue Cmp = getARMCmp(Tmp1, DAG.getConstant(0, MVT::i32),
+ ISD::SETLT, ARMcc, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
- return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp);
+ return DAG.getNode(ARMISD::CNEG, dl, VT, Tmp0, Tmp0, ARMcc, CCR, Cmp);
}
SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
-static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const {
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
if (isConstant)
return SDValue();
+ // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+ if (NumElts >= 4) {
+ SDValue shuffle = ReconstructShuffle(Op, DAG);
+ if (shuffle != SDValue())
+ return shuffle;
+ }
+
// Vectors with 32- or 64-bit elements can be built by directly assigning
// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
// will be legalized.
return SDValue();
}
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 2> SourceVecs;
+ SmallVector<unsigned, 2> MinElts;
+ SmallVector<unsigned, 2> MaxElts;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ // A shuffle can only come from building a vector from various
+ // elements of other vectors.
+ return SDValue();
+ }
+
+ // Record this extraction against the appropriate vector if possible...
+ SDValue SourceVec = V.getOperand(0);
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ bool FoundSource = false;
+ for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+ if (SourceVecs[j] == SourceVec) {
+ if (MinElts[j] > EltNo)
+ MinElts[j] = EltNo;
+ if (MaxElts[j] < EltNo)
+ MaxElts[j] = EltNo;
+ FoundSource = true;
+ break;
+ }
+ }
+
+ // Or record a new source if not...
+ if (!FoundSource) {
+ SourceVecs.push_back(SourceVec);
+ MinElts.push_back(EltNo);
+ MaxElts.push_back(EltNo);
+ }
+ }
+
+ // Currently only do something sane when at most two source vectors
+ // involved.
+ if (SourceVecs.size() > 2)
+ return SDValue();
+
+ SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+ int VEXTOffsets[2] = {0, 0};
+
+ // This loop extracts the usage patterns of the source vectors
+ // and prepares appropriate SDValues for a shuffle if possible.
+ for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+ if (SourceVecs[i].getValueType() == VT) {
+ // No VEXT necessary
+ ShuffleSrcs[i] = SourceVecs[i];
+ VEXTOffsets[i] = 0;
+ continue;
+ } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+ // It probably isn't worth padding out a smaller vector just to
+ // break it down again in a shuffle.
+ return SDValue();
+ }
+
+ // Since only 64-bit and 128-bit vectors are legal on ARM and
+ // we've eliminated the other cases...
+ assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
+ "unexpected vector sizes in ReconstructShuffle");
+
+ if (MaxElts[i] - MinElts[i] >= NumElts) {
+ // Span too large for a VEXT to cope
+ return SDValue();
+ }
+
+ if (MinElts[i] >= NumElts) {
+ // The extraction can just take the second half
+ VEXTOffsets[i] = NumElts;
+ ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i],
+ DAG.getIntPtrConstant(NumElts));
+ } else if (MaxElts[i] < NumElts) {
+ // The extraction can just take the first half
+ VEXTOffsets[i] = 0;
+ ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i],
+ DAG.getIntPtrConstant(0));
+ } else {
+ // An actual VEXT is needed
+ VEXTOffsets[i] = MinElts[i];
+ SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i],
+ DAG.getIntPtrConstant(0));
+ SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i],
+ DAG.getIntPtrConstant(NumElts));
+ ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
+ DAG.getConstant(VEXTOffsets[i], MVT::i32));
+ }
+ }
+
+ SmallVector<int, 8> Mask;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Entry = Op.getOperand(i);
+ if (Entry.getOpcode() == ISD::UNDEF) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ SDValue ExtractVec = Entry.getOperand(0);
+ int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
+ .getOperand(1))->getSExtValue();
+ if (ExtractVec == SourceVecs[0]) {
+ Mask.push_back(ExtractElt - VEXTOffsets[0]);
+ } else {
+ Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+ }
+ }
+
+ // Final check before we try to produce nonsense...
+ if (isShuffleMaskLegal(Mask, VT))
+ return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+ &Mask[0]);
+
+ return SDValue();
+}
+
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
}
+static SDValue
+LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+ // Convert to float
+ // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+ // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+ X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+ Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+ X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
+ Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
+ // Get reciprocal estimate.
+ // float4 recip = vrecpeq_f32(yf);
+ Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
+ // Because char has a smaller range than uchar, we can actually get away
+ // without any newton steps. This requires that we use a weird bias
+ // of 0xb000, however (again, this has been exhaustively tested).
+ // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+ X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
+ X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
+ Y = DAG.getConstant(0xb000, MVT::i32);
+ Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+ X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
+ X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
+ // Convert back to short.
+ X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
+ X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
+ return X;
+}
+
+static SDValue
+LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+ SDValue N2;
+ // Convert to float.
+ // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+ // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+ N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+ N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+ // Use reciprocal estimate and one refinement step.
+ // float4 recip = vrecpeq_f32(yf);
+ // recip *= vrecpsq_f32(yf, recip);
+ N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+ N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+ N1, N2);
+ N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+ // Because short has a smaller range than ushort, we can actually get away
+ // with only a single newton step. This requires that we use a weird bias
+ // of 89, however (again, this has been exhaustively tested).
+ // float4 result = as_float4(as_int4(xf*recip) + 89);
+ N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+ N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+ N1 = DAG.getConstant(89, MVT::i32);
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+ N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+ N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+ // Convert back to integer and return.
+ // return vmovn_s32(vcvt_s32_f32(result));
+ N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+ N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+ return N0;
+}
+
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+ "unexpected type for custom-lowering ISD::SDIV");
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2, N3;
+
+ if (VT == MVT::v8i8) {
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+
+ N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+ DAG.getIntPtrConstant(4));
+ N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+ DAG.getIntPtrConstant(4));
+ N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+ DAG.getIntPtrConstant(0));
+ N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+ DAG.getIntPtrConstant(0));
+
+ N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+ N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+ N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+ N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+ return N0;
+ }
+ return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+ "unexpected type for custom-lowering ISD::UDIV");
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2, N3;
+
+ if (VT == MVT::v8i8) {
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+
+ N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+ DAG.getIntPtrConstant(4));
+ N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+ DAG.getIntPtrConstant(4));
+ N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+ DAG.getIntPtrConstant(0));
+ N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+ DAG.getIntPtrConstant(0));
+
+ N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+ N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+ N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+ N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+ DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
+ N0);
+ return N0;
+ }
+
+ // v4i16 sdiv ... Convert to float.
+ // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+ // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+ N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+ N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+ // Use reciprocal estimate and two refinement steps.
+ // float4 recip = vrecpeq_f32(yf);
+ // recip *= vrecpsq_f32(yf, recip);
+ // recip *= vrecpsq_f32(yf, recip);
+ N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+ N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+ N1, N2);
+ N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+ N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+ N1, N2);
+ N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+ // Simply multiplying by the reciprocal estimate can leave us a few ulps
+ // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+ // and that it will never cause us to return an answer too large).
+ // float4 result = as_float4(as_int4(xf*recip) + 89);
+ N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+ N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+ N1 = DAG.getConstant(2, MVT::i32);
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+ N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+ N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+ // Convert back to integer and return.
+ // return vmovn_u32(vcvt_s32_f32(result));
+ N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+ N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+ return N0;
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
+ case ISD::SDIV: return LowerSDIV(Op, DAG);
+ case ISD::UDIV: return LowerUDIV(Op, DAG);
}
return SDValue();
}
default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
case 1:
ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
- strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB;
+ strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
break;
case 2:
ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
case ARM::BCCi64:
case ARM::BCCZi64: {
+ // If there is an unconditional branch to the other successor, remove it.
+ BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+
// Compare both parts that make up the double comparison separately for
// equality.
bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
DAG.getUNDEF(VT), NewMask.data());
}
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
+/// NEON load/store intrinsics to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+ N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+ SDValue Addr = N->getOperand(AddrOpIdx);
+
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load/store. Otherwise, folding
+ // it would create a cycle.
+ if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+ continue;
+
+ // Find the new opcode for the updating load/store.
+ bool isLoad = true;
+ bool isLaneOp = false;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ if (isIntrinsic) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: assert(0 && "unexpected intrinsic for Neon base update");
+ case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1; break;
+ case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
+ NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+ NumVecs = 2; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+ NumVecs = 3; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+ NumVecs = 4; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1; isLoad = false; break;
+ case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
+ NumVecs = 2; isLoad = false; break;
+ case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
+ NumVecs = 3; isLoad = false; break;
+ case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
+ NumVecs = 4; isLoad = false; break;
+ case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+ NumVecs = 2; isLoad = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+ NumVecs = 3; isLoad = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+ NumVecs = 4; isLoad = false; isLaneOp = true; break;
+ }
+ } else {
+ isLaneOp = true;
+ switch (N->getOpcode()) {
+ default: assert(0 && "unexpected opcode for Neon base update");
+ case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+ case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+ case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+ }
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoad)
+ VecTy = N->getValueType(0);
+ else
+ VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (isLaneOp)
+ NumBytes /= VecTy.getVectorNumElements();
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+ uint64_t IncVal = CInc->getZExtValue();
+ if (IncVal != NumBytes)
+ continue;
+ } else if (NumBytes >= 3 * 16) {
+ // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+ // separate instructions that make it harder to use a non-constant update.
+ continue;
+ }
+
+ // Create the new updating load/store node.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = VecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(AddrOpIdx));
+ Ops.push_back(Inc);
+ for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+ MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+ Ops.data(), Ops.size(),
+ MemInt->getMemoryVT(),
+ MemInt->getMemOperand());
+
+ // Update the uses.
+ std::vector<SDValue> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i) {
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+ }
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+ break;
+ }
+ return SDValue();
+}
+
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::VLD2DUP:
+ case ARMISD::VLD3DUP:
+ case ARMISD::VLD4DUP:
+ return CombineBaseUpdate(N, DCI);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane:
+ return CombineBaseUpdate(N, DCI);
+ default: break;
+ }
+ break;
}
return SDValue();
}
+bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+ EVT VT) const {
+ return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
+
bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
if (!Subtarget->allowsUnalignedMem())
return false;
// ARM Inline Assembly Support
//===----------------------------------------------------------------------===//
+bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ // Looking for "rev" which is V6+.
+ if (!Subtarget->hasV6Ops())
+ return false;
+
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ std::string AsmStr = IA->getAsmString();
+ SmallVector<StringRef, 4> AsmPieces;
+ SplitString(AsmStr, AsmPieces, ";\n");
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ AsmStr = AsmPieces[0];
+ AsmPieces.clear();
+ SplitString(AsmStr, AsmPieces, " \t,");
+
+ // rev $0, $1
+ if (AsmPieces.size() == 3 &&
+ AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
+ IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
+ const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (Ty && Ty->getBitWidth() == 32)
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ break;
+ }
+
+ return false;
+}
+
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType