lib/Target/ARM/ARMISelLowering.cpp

   1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that ARM uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #define DEBUG_TYPE "arm-isel"
  16 #include "ARMISelLowering.h"
  17 #include "ARM.h"
  18 #include "ARMCallingConv.h"
  19 #include "ARMConstantPoolValue.h"
  20 #include "ARMMachineFunctionInfo.h"
  21 #include "ARMPerfectShuffle.h"
  22 #include "ARMSubtarget.h"
  23 #include "ARMTargetMachine.h"
  24 #include "ARMTargetObjectFile.h"
  25 #include "MCTargetDesc/ARMAddressingModes.h"
  26 #include "llvm/ADT/Statistic.h"
  27 #include "llvm/ADT/StringExtras.h"
  28 #include "llvm/CodeGen/CallingConvLower.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineBasicBlock.h"
  31 #include "llvm/CodeGen/MachineFrameInfo.h"
  32 #include "llvm/CodeGen/MachineFunction.h"
  33 #include "llvm/CodeGen/MachineInstrBuilder.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/CodeGen/SelectionDAG.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/Function.h"
  40 #include "llvm/IR/GlobalValue.h"
  41 #include "llvm/IR/Instruction.h"
  42 #include "llvm/IR/Instructions.h"
  43 #include "llvm/IR/Intrinsics.h"
  44 #include "llvm/IR/Type.h"
  45 #include "llvm/MC/MCSectionMachO.h"
  46 #include "llvm/Support/CommandLine.h"
  47 #include "llvm/Support/ErrorHandling.h"
  48 #include "llvm/Support/MathExtras.h"
  49 #include "llvm/Support/raw_ostream.h"
  50 #include "llvm/Target/TargetOptions.h"
  51 using namespace llvm;
  52
  53 STATISTIC(NumTailCalls, "Number of tail calls");
  54 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
  55 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
  56
  57 // This option should go away when tail calls fully work.
  58 static cl::opt<bool>
  59 EnableARMTailCalls("arm-tail-calls", cl::Hidden,
  60   cl::desc("Generate tail calls (TEMPORARY OPTION)."),
  61   cl::init(false));
  62
  63 cl::opt<bool>
  64 EnableARMLongCalls("arm-long-calls", cl::Hidden,
  65   cl::desc("Generate calls via indirect call instructions"),
  66   cl::init(false));
  67
  68 static cl::opt<bool>
  69 ARMInterworking("arm-interworking", cl::Hidden,
  70   cl::desc("Enable / disable ARM interworking (for debugging only)"),
  71   cl::init(true));
  72
  73 namespace {
  74   class ARMCCState : public CCState {
  75   public:
  76     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
  77                const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
  78                LLVMContext &C, ParmContext PC)
  79         : CCState(CC, isVarArg, MF, TM, locs, C) {
  80       assert(((PC == Call) || (PC == Prologue)) &&
  81              "ARMCCState users must specify whether their context is call"
  82              "or prologue generation.");
  83       CallOrPrologue = PC;
  84     }
  85   };
  86 }
  87
  88 // The APCS parameter registers.
  89 static const uint16_t GPRArgRegs[] = {
  90   ARM::R0, ARM::R1, ARM::R2, ARM::R3
  91 };
  92
  93 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
  94                                        MVT PromotedBitwiseVT) {
  95   if (VT != PromotedLdStVT) {
  96     setOperationAction(ISD::LOAD, VT, Promote);
  97     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
  98
  99     setOperationAction(ISD::STORE, VT, Promote);
 100     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
 101   }
 102
 103   MVT ElemTy = VT.getVectorElementType();
 104   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
 105     setOperationAction(ISD::SETCC, VT, Custom);
 106   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 107   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 108   if (ElemTy == MVT::i32) {
 109     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
 110     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
 111     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
 112     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
 113   } else {
 114     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 115     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 116     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 117     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 118   }
 119   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
 120   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
 121   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
 122   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 123   setOperationAction(ISD::SELECT,            VT, Expand);
 124   setOperationAction(ISD::SELECT_CC,         VT, Expand);
 125   setOperationAction(ISD::VSELECT,           VT, Expand);
 126   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 127   if (VT.isInteger()) {
 128     setOperationAction(ISD::SHL, VT, Custom);
 129     setOperationAction(ISD::SRA, VT, Custom);
 130     setOperationAction(ISD::SRL, VT, Custom);
 131   }
 132
 133   // Promote all bit-wise operations.
 134   if (VT.isInteger() && VT != PromotedBitwiseVT) {
 135     setOperationAction(ISD::AND, VT, Promote);
 136     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
 137     setOperationAction(ISD::OR,  VT, Promote);
 138     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
 139     setOperationAction(ISD::XOR, VT, Promote);
 140     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
 141   }
 142
 143   // Neon does not support vector divide/remainder operations.
 144   setOperationAction(ISD::SDIV, VT, Expand);
 145   setOperationAction(ISD::UDIV, VT, Expand);
 146   setOperationAction(ISD::FDIV, VT, Expand);
 147   setOperationAction(ISD::SREM, VT, Expand);
 148   setOperationAction(ISD::UREM, VT, Expand);
 149   setOperationAction(ISD::FREM, VT, Expand);
 150 }
 151
 152 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
 153   addRegisterClass(VT, &ARM::DPRRegClass);
 154   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
 155 }
 156
 157 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
 158   addRegisterClass(VT, &ARM::QPRRegClass);
 159   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 160 }
 161
 162 static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
 163   if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
 164     return new TargetLoweringObjectFileMachO();
 165
 166   return new ARMElfTargetObjectFile();
 167 }
 168
 169 ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
 170     : TargetLowering(TM, createTLOF(TM)) {
 171   Subtarget = &TM.getSubtarget<ARMSubtarget>();
 172   RegInfo = TM.getRegisterInfo();
 173   Itins = TM.getInstrItineraryData();
 174
 175   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 176
 177   if (Subtarget->isTargetDarwin()) {
 178     // Uses VFP for Thumb libfuncs if available.
 179     if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
 180       // Single-precision floating-point arithmetic.
 181       setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
 182       setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
 183       setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
 184       setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
 185
 186       // Double-precision floating-point arithmetic.
 187       setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
 188       setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
 189       setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
 190       setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
 191
 192       // Single-precision comparisons.
 193       setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
 194       setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
 195       setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
 196       setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
 197       setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
 198       setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
 199       setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
 200       setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
 201
 202       setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
 203       setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
 204       setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
 205       setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
 206       setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
 207       setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
 208       setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
 209       setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
 210
 211       // Double-precision comparisons.
 212       setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
 213       setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
 214       setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
 215       setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
 216       setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
 217       setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
 218       setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
 219       setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
 220
 221       setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
 222       setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
 223       setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
 224       setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
 225       setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
 226       setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
 227       setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
 228       setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
 229
 230       // Floating-point to integer conversions.
 231       // i64 conversions are done via library routines even when generating VFP
 232       // instructions, so use the same ones.
 233       setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
 234       setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
 235       setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
 236       setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
 237
 238       // Conversions between floating types.
 239       setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
 240       setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
 241
 242       // Integer to floating-point conversions.
 243       // i64 conversions are done via library routines even when generating VFP
 244       // instructions, so use the same ones.
 245       // FIXME: There appears to be some naming inconsistency in ARM libgcc:
 246       // e.g., __floatunsidf vs. __floatunssidfvfp.
 247       setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
 248       setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
 249       setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
 250       setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
 251     }
 252   }
 253
 254   // These libcalls are not available in 32-bit.
 255   setLibcallName(RTLIB::SHL_I128, 0);
 256   setLibcallName(RTLIB::SRL_I128, 0);
 257   setLibcallName(RTLIB::SRA_I128, 0);
 258
 259   if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
 260     // Double-precision floating-point arithmetic helper functions
 261     // RTABI chapter 4.1.2, Table 2
 262     setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
 263     setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
 264     setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
 265     setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
 266     setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
 267     setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
 268     setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
 269     setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
 270
 271     // Double-precision floating-point comparison helper functions
 272     // RTABI chapter 4.1.2, Table 3
 273     setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
 274     setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
 275     setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
 276     setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
 277     setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
 278     setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
 279     setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
 280     setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
 281     setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
 282     setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
 283     setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
 284     setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
 285     setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
 286     setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
 287     setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
 288     setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
 289     setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
 290     setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
 291     setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
 292     setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
 293     setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
 294     setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
 295     setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
 296     setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
 297
 298     // Single-precision floating-point arithmetic helper functions
 299     // RTABI chapter 4.1.2, Table 4
 300     setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
 301     setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
 302     setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
 303     setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
 304     setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
 305     setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
 306     setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
 307     setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
 308
 309     // Single-precision floating-point comparison helper functions
 310     // RTABI chapter 4.1.2, Table 5
 311     setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
 312     setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
 313     setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
 314     setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
 315     setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
 316     setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
 317     setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
 318     setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
 319     setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
 320     setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
 321     setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
 322     setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
 323     setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
 324     setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
 325     setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
 326     setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
 327     setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
 328     setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
 329     setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
 330     setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
 331     setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
 332     setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
 333     setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
 334     setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
 335
 336     // Floating-point to integer conversions.
 337     // RTABI chapter 4.1.2, Table 6
 338     setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
 339     setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
 340     setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
 341     setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
 342     setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
 343     setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
 344     setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
 345     setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
 346     setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
 347     setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
 348     setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
 349     setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
 350     setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
 351     setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
 352     setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
 353     setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
 354
 355     // Conversions between floating types.
 356     // RTABI chapter 4.1.2, Table 7
 357     setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
 358     setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
 359     setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
 360     setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
 361
 362     // Integer to floating-point conversions.
 363     // RTABI chapter 4.1.2, Table 8
 364     setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
 365     setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
 366     setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
 367     setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
 368     setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
 369     setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
 370     setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
 371     setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
 372     setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
 373     setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
 374     setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
 375     setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
 376     setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
 377     setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
 378     setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
 379     setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
 380
 381     // Long long helper functions
 382     // RTABI chapter 4.2, Table 9
 383     setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
 384     setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
 385     setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
 386     setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
 387     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
 388     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
 389     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
 390     setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
 391     setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
 392     setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
 393
 394     // Integer division functions
 395     // RTABI chapter 4.3.1
 396     setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
 397     setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
 398     setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
 399     setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
 400     setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
 401     setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
 402     setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
 403     setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
 404     setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
 405     setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
 406     setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
 407     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
 408     setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
 409     setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
 410     setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
 411     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
 412
 413     // Memory operations
 414     // RTABI chapter 4.3.4
 415     setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
 416     setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
 417     setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
 418     setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS);
 419     setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS);
 420     setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS);
 421   }
 422
 423   // Use divmod compiler-rt calls for iOS 5.0 and later.
 424   if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
 425       !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
 426     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
 427     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
 428   }
 429
 430   if (Subtarget->isThumb1Only())
 431     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
 432   else
 433     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
 434   if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
 435       !Subtarget->isThumb1Only()) {
 436     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
 437     if (!Subtarget->isFPOnlySP())
 438       addRegisterClass(MVT::f64, &ARM::DPRRegClass);
 439
 440     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 441   }
 442
 443   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 444        VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
 445     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 446          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
 447       setTruncStoreAction((MVT::SimpleValueType)VT,
 448                           (MVT::SimpleValueType)InnerVT, Expand);
 449     setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
 450     setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
 451     setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
 452   }
 453
 454   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
 455   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 456
 457   if (Subtarget->hasNEON()) {
 458     addDRTypeForNEON(MVT::v2f32);
 459     addDRTypeForNEON(MVT::v8i8);
 460     addDRTypeForNEON(MVT::v4i16);
 461     addDRTypeForNEON(MVT::v2i32);
 462     addDRTypeForNEON(MVT::v1i64);
 463
 464     addQRTypeForNEON(MVT::v4f32);
 465     addQRTypeForNEON(MVT::v2f64);
 466     addQRTypeForNEON(MVT::v16i8);
 467     addQRTypeForNEON(MVT::v8i16);
 468     addQRTypeForNEON(MVT::v4i32);
 469     addQRTypeForNEON(MVT::v2i64);
 470
 471     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
 472     // neither Neon nor VFP support any arithmetic operations on it.
 473     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
 474     // supported for v4f32.
 475     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
 476     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
 477     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
 478     // FIXME: Code duplication: FDIV and FREM are expanded always, see
 479     // ARMTargetLowering::addTypeForNEON method for details.
 480     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
 481     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
 482     // FIXME: Create unittest.
  483     // In other words, find a case where "copysign" appears in the DAG with
  484     // vector operands.
 485     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
 486     // FIXME: Code duplication: SETCC has custom operation action, see
 487     // ARMTargetLowering::addTypeForNEON method for details.
 488     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
 489     // FIXME: Create unittest for FNEG and for FABS.
 490     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
 491     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
 492     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
 493     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
 494     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
 495     setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
 496     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
 497     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
 498     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
 499     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
 500     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
 501     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
 502     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
 503     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
 504     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
 505     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
 506     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
 507     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
 508     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
 509
 510     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
 511     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
 512     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
 513     setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
 514     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
 515     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
 516     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
 517     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
 518     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
 519     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
 520     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
 521     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
 522     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
 523     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
 524     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 525
 526     // Mark v2f32 intrinsics.
 527     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
 528     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
 529     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
 530     setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
 531     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
 532     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
 533     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
 534     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
 535     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
 536     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
 537     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
 538     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
 539     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
 540     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
 541     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
 542
 543     // Neon does not support some operations on v1i64 and v2i64 types.
 544     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 545     // Custom handling for some quad-vector types to detect VMULL.
 546     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
 547     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 548     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 549     // Custom handling for some vector types to avoid expensive expansions
 550     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
 551     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
 552     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 553     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
 554     setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
 555     setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
 556     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
 557     // a destination type that is wider than the source, and nor does
 558     // it have a FP_TO_[SU]INT instruction with a narrower destination than
 559     // source.
 560     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
 561     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
 562     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
 563     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
 564
 565     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
 566     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 567
 568     // Custom expand long extensions to vectors.
 569     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
 570     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
 571     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
 572     setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
 573     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
 574     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
 575     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
 576     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
 577
 578     // NEON does not have single instruction CTPOP for vectors with element
 579     // types wider than 8-bits.  However, custom lowering can leverage the
 580     // v8i8/v16i8 vcnt instruction.
 581     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
 582     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
 583     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
 584     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
 585
 586     // NEON only has FMA instructions as of VFP4.
 587     if (!Subtarget->hasVFP4()) {
 588       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
 589       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
 590     }
 591
 592     setTargetDAGCombine(ISD::INTRINSIC_VOID);
 593     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 594     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 595     setTargetDAGCombine(ISD::SHL);
 596     setTargetDAGCombine(ISD::SRL);
 597     setTargetDAGCombine(ISD::SRA);
 598     setTargetDAGCombine(ISD::SIGN_EXTEND);
 599     setTargetDAGCombine(ISD::ZERO_EXTEND);
 600     setTargetDAGCombine(ISD::ANY_EXTEND);
 601     setTargetDAGCombine(ISD::SELECT_CC);
 602     setTargetDAGCombine(ISD::BUILD_VECTOR);
 603     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 604     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 605     setTargetDAGCombine(ISD::STORE);
 606     setTargetDAGCombine(ISD::FP_TO_SINT);
 607     setTargetDAGCombine(ISD::FP_TO_UINT);
 608     setTargetDAGCombine(ISD::FDIV);
 609
 610     // It is legal to extload from v4i8 to v4i16 or v4i32.
 611     MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
 612                   MVT::v4i16, MVT::v2i16,
 613                   MVT::v2i32};
 614     for (unsigned i = 0; i < 6; ++i) {
 615       setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
 616       setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
 617       setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
 618     }
 619   }
 620
 621   // ARM and Thumb2 support UMLAL/SMLAL.
 622   if (!Subtarget->isThumb1Only())
 623     setTargetDAGCombine(ISD::ADDC);
 624
 625
 626   computeRegisterProperties();
 627
 628   // ARM does not have f32 extending load.
 629   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
 630
 631   // ARM does not have i1 sign extending load.
 632   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 633
 634   // ARM supports all 4 flavors of integer indexed load / store.
 635   if (!Subtarget->isThumb1Only()) {
 636     for (unsigned im = (unsigned)ISD::PRE_INC;
 637          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 638       setIndexedLoadAction(im,  MVT::i1,  Legal);
 639       setIndexedLoadAction(im,  MVT::i8,  Legal);
 640       setIndexedLoadAction(im,  MVT::i16, Legal);
 641       setIndexedLoadAction(im,  MVT::i32, Legal);
 642       setIndexedStoreAction(im, MVT::i1,  Legal);
 643       setIndexedStoreAction(im, MVT::i8,  Legal);
 644       setIndexedStoreAction(im, MVT::i16, Legal);
 645       setIndexedStoreAction(im, MVT::i32, Legal);
 646     }
 647   }
 648
 649   // i64 operation support.
 650   setOperationAction(ISD::MUL,     MVT::i64, Expand);
 651   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
 652   if (Subtarget->isThumb1Only()) {
 653     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 654     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 655   }
 656   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
 657       || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
 658     setOperationAction(ISD::MULHS, MVT::i32, Expand);
 659
 660   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 661   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 662   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 663   setOperationAction(ISD::SRL,       MVT::i64, Custom);
 664   setOperationAction(ISD::SRA,       MVT::i64, Custom);
 665
 666   if (!Subtarget->isThumb1Only()) {
 667     // FIXME: We should do this for Thumb1 as well.
 668     setOperationAction(ISD::ADDC,    MVT::i32, Custom);
 669     setOperationAction(ISD::ADDE,    MVT::i32, Custom);
 670     setOperationAction(ISD::SUBC,    MVT::i32, Custom);
 671     setOperationAction(ISD::SUBE,    MVT::i32, Custom);
 672   }
 673
 674   // ARM does not have ROTL.
 675   setOperationAction(ISD::ROTL,  MVT::i32, Expand);
 676   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
 677   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 678   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
 679     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
 680
 681   // These just redirect to CTTZ and CTLZ on ARM.
 682   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
 683   setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
 684
 685   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
 686
 687   // Only ARMv6 has BSWAP.
 688   if (!Subtarget->hasV6Ops())
 689     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
 690
 691   if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
 692       !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
 693     // These are expanded into libcalls if the cpu doesn't have HW divider.
 694     setOperationAction(ISD::SDIV,  MVT::i32, Expand);
 695     setOperationAction(ISD::UDIV,  MVT::i32, Expand);
 696   }
 697
 698   // FIXME: Also set divmod for SREM on EABI
 699   setOperationAction(ISD::SREM,  MVT::i32, Expand);
 700   setOperationAction(ISD::UREM,  MVT::i32, Expand);
 701   // Register based DivRem for AEABI (RTABI 4.2)
 702   if (Subtarget->isTargetAEABI()) {
 703     setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
 704     setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
 705     setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
 706     setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
 707     setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
 708     setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
 709     setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
 710     setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
 711
 712     setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
 713     setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
 714     setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
 715     setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
 716     setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
 717     setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
 718     setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
 719     setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
 720
 721     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
 722     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
 723   } else {
 724     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
 725     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 726   }
 727
 728   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
 729   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
 730   setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
 731   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
 732   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
 733
 734   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 735
 736   // Use the default implementation.
 737   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
 738   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
 739   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
 740   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
 741   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 742   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 743
 744   if (!Subtarget->isTargetDarwin()) {
 745     // Non-Darwin platforms may return values in these registers via the
 746     // personality function.
 747     setExceptionPointerRegister(ARM::R0);
 748     setExceptionSelectorRegister(ARM::R1);
 749   }
 750
 751   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 752   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
 753   // the default expansion.
 754   // FIXME: This should be checking for v6k, not just v6.
 755   if (Subtarget->hasDataBarrier() ||
 756       (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
 757     // membarrier needs custom lowering; the rest are legal and handled
 758     // normally.
 759     setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
 760     // Custom lowering for 64-bit ops
 761     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
 762     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
 763     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
 764     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
 765     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
 766     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
 767     setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i64, Custom);
 768     setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i64, Custom);
 769     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
 770     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
 771     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
 772     // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
 773     setInsertFencesForAtomic(true);
 774   } else {
 775     // Set them all for expansion, which will force libcalls.
 776     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
 777     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
 778     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
 779     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
 780     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
 781     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
 782     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
 783     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
 784     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
 785     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
 786     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
 787     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
 788     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
 789     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
 790     // Unordered/Monotonic case.
 791     setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
 792     setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
 793   }
 794
 795   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
 796
 797   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
 798   if (!Subtarget->hasV6Ops()) {
 799     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 800     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
 801   }
 802   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 803
 804   if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
 805       !Subtarget->isThumb1Only()) {
 806     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
 807     // iff target supports vfp2.
 808     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
 809     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
 810   }
 811
 812   // We want to custom lower some of our intrinsics.
 813   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 814   if (Subtarget->isTargetDarwin()) {
 815     setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 816     setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 817     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 818   }
 819
 820   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
 821   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
 822   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
 823   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
 824   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
 825   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
 826   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 827   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 828   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
 829
 830   setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
 831   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
 832   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
 833   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
 834   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
 835
 836   // We don't support sin/cos/fmod/copysign/pow
 837   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
 838   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
 839   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
 840   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
 841   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
 842   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
 843   setOperationAction(ISD::FREM,      MVT::f64, Expand);
 844   setOperationAction(ISD::FREM,      MVT::f32, Expand);
 845   if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
 846       !Subtarget->isThumb1Only()) {
 847     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 848     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 849   }
 850   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
 851   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
 852
 853   if (!Subtarget->hasVFP4()) {
 854     setOperationAction(ISD::FMA, MVT::f64, Expand);
 855     setOperationAction(ISD::FMA, MVT::f32, Expand);
 856   }
 857
 858   // Various VFP goodness
 859   if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
 860     // int <-> fp are custom expanded into bit_convert + ARMISD ops.
 861     if (Subtarget->hasVFP2()) {
 862       setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 863       setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 864       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 865       setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 866     }
 867     // Special handling for half-precision FP.
 868     if (!Subtarget->hasFP16()) {
 869       setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
 870       setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
 871     }
 872   }
 873
 874   // We have target-specific dag combine patterns for the following nodes:
 875   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
 876   setTargetDAGCombine(ISD::ADD);
 877   setTargetDAGCombine(ISD::SUB);
 878   setTargetDAGCombine(ISD::MUL);
 879   setTargetDAGCombine(ISD::AND);
 880   setTargetDAGCombine(ISD::OR);
 881   setTargetDAGCombine(ISD::XOR);
 882
 883   if (Subtarget->hasV6Ops())
 884     setTargetDAGCombine(ISD::SRL);
 885
 886   setStackPointerRegisterToSaveRestore(ARM::SP);
 887
 888   if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() ||
 889       !Subtarget->hasVFP2())
 890     setSchedulingPreference(Sched::RegPressure);
 891   else
 892     setSchedulingPreference(Sched::Hybrid);
 893
 894   //// temporary - rewrite interface to use type
 895   MaxStoresPerMemset = 8;
 896   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
 897   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
 898   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
 899   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
 900   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
 901
 902   // On ARM arguments smaller than 4 bytes are extended, so all arguments
 903   // are at least 4 bytes aligned.
 904   setMinStackArgumentAlignment(4);
 905
 906   // Prefer likely predicted branches to selects on out-of-order cores.
 907   PredictableSelectIsExpensive = Subtarget->isLikeA9();
 908
 909   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 910 }
 911
 912 // FIXME: It might make sense to define the representative register class as the
 913 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
 914 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
 915 // SPR's representative would be DPR_VFP2. This should work well if register
 916 // pressure tracking were modified such that a register use would increment the
 917 // pressure of the register class's representative and all of it's super
 918 // classes' representatives transitively. We have not implemented this because
 919 // of the difficulty prior to coalescing of modeling operand register classes
 920 // due to the common occurrence of cross class copies and subregister insertions
 921 // and extractions.
 922 std::pair<const TargetRegisterClass*, uint8_t>
 923 ARMTargetLowering::findRepresentativeClass(MVT VT) const{
 924   const TargetRegisterClass *RRC = 0;
 925   uint8_t Cost = 1;
 926   switch (VT.SimpleTy) {
 927   default:
 928     return TargetLowering::findRepresentativeClass(VT);
 929   // Use DPR as representative register class for all floating point
 930   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
 931   // the cost is 1 for both f32 and f64.
 932   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
 933   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
 934     RRC = &ARM::DPRRegClass;
 935     // When NEON is used for SP, only half of the register file is available
 936     // because operations that define both SP and DP results will be constrained
 937     // to the VFP2 class (D0-D15). We currently model this constraint prior to
 938     // coalescing by double-counting the SP regs. See the FIXME above.
 939     if (Subtarget->useNEONForSinglePrecisionFP())
 940       Cost = 2;
 941     break;
 942   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
 943   case MVT::v4f32: case MVT::v2f64:
 944     RRC = &ARM::DPRRegClass;
 945     Cost = 2;
 946     break;
 947   case MVT::v4i64:
 948     RRC = &ARM::DPRRegClass;
 949     Cost = 4;
 950     break;
 951   case MVT::v8i64:
 952     RRC = &ARM::DPRRegClass;
 953     Cost = 8;
 954     break;
 955   }
 956   return std::make_pair(RRC, Cost);
 957 }
 958
 959 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 960   switch (Opcode) {
 961   default: return 0;
 962   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
 963   case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
 964   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
 965   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
 966   case ARMISD::CALL:          return "ARMISD::CALL";
 967   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
 968   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
 969   case ARMISD::tCALL:         return "ARMISD::tCALL";
 970   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
 971   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
 972   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
 973   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
 974   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
 975   case ARMISD::CMP:           return "ARMISD::CMP";
 976   case ARMISD::CMN:           return "ARMISD::CMN";
 977   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
 978   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
 979   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
 980   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
 981   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
 982
 983   case ARMISD::CMOV:          return "ARMISD::CMOV";
 984
 985   case ARMISD::RBIT:          return "ARMISD::RBIT";
 986
 987   case ARMISD::FTOSI:         return "ARMISD::FTOSI";
 988   case ARMISD::FTOUI:         return "ARMISD::FTOUI";
 989   case ARMISD::SITOF:         return "ARMISD::SITOF";
 990   case ARMISD::UITOF:         return "ARMISD::UITOF";
 991
 992   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
 993   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
 994   case ARMISD::RRX:           return "ARMISD::RRX";
 995
 996   case ARMISD::ADDC:          return "ARMISD::ADDC";
 997   case ARMISD::ADDE:          return "ARMISD::ADDE";
 998   case ARMISD::SUBC:          return "ARMISD::SUBC";
 999   case ARMISD::SUBE:          return "ARMISD::SUBE";
1000
1001   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1002   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1003
1004   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1005   case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
1006
1007   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1008
1009   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1010
1011   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1012
1013   case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
1014   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1015
1016   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1017
1018   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1019   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1020   case ARMISD::VCGE:          return "ARMISD::VCGE";
1021   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1022   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1023   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1024   case ARMISD::VCGT:          return "ARMISD::VCGT";
1025   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1026   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1027   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1028   case ARMISD::VTST:          return "ARMISD::VTST";
1029
1030   case ARMISD::VSHL:          return "ARMISD::VSHL";
1031   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1032   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1033   case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
1034   case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
1035   case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
1036   case ARMISD::VSHRN:         return "ARMISD::VSHRN";
1037   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1038   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1039   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1040   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1041   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1042   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1043   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1044   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1045   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1046   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1047   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1048   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1049   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1050   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1051   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1052   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1053   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1054   case ARMISD::VDUP:          return "ARMISD::VDUP";
1055   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1056   case ARMISD::VEXT:          return "ARMISD::VEXT";
1057   case ARMISD::VREV64:        return "ARMISD::VREV64";
1058   case ARMISD::VREV32:        return "ARMISD::VREV32";
1059   case ARMISD::VREV16:        return "ARMISD::VREV16";
1060   case ARMISD::VZIP:          return "ARMISD::VZIP";
1061   case ARMISD::VUZP:          return "ARMISD::VUZP";
1062   case ARMISD::VTRN:          return "ARMISD::VTRN";
1063   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1064   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1065   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1066   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1067   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1068   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1069   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1070   case ARMISD::FMAX:          return "ARMISD::FMAX";
1071   case ARMISD::FMIN:          return "ARMISD::FMIN";
1072   case ARMISD::BFI:           return "ARMISD::BFI";
1073   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1074   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1075   case ARMISD::VBSL:          return "ARMISD::VBSL";
1076   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1077   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1078   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1079   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1080   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1081   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1082   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1083   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1084   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1085   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1086   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1087   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1088   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1089   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1090   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1091   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1092   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1093   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1094   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1095   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1096
1097   case ARMISD::ATOMADD64_DAG:     return "ATOMADD64_DAG";
1098   case ARMISD::ATOMSUB64_DAG:     return "ATOMSUB64_DAG";
1099   case ARMISD::ATOMOR64_DAG:      return "ATOMOR64_DAG";
1100   case ARMISD::ATOMXOR64_DAG:     return "ATOMXOR64_DAG";
1101   case ARMISD::ATOMAND64_DAG:     return "ATOMAND64_DAG";
1102   case ARMISD::ATOMNAND64_DAG:    return "ATOMNAND64_DAG";
1103   case ARMISD::ATOMSWAP64_DAG:    return "ATOMSWAP64_DAG";
1104   case ARMISD::ATOMCMPXCHG64_DAG: return "ATOMCMPXCHG64_DAG";
1105   case ARMISD::ATOMMIN64_DAG:     return "ATOMMIN64_DAG";
1106   case ARMISD::ATOMUMIN64_DAG:    return "ATOMUMIN64_DAG";
1107   case ARMISD::ATOMMAX64_DAG:     return "ATOMMAX64_DAG";
1108   case ARMISD::ATOMUMAX64_DAG:    return "ATOMUMAX64_DAG";
1109   }
1110 }
1111
1112 EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1113   if (!VT.isVector()) return getPointerTy();
1114   return VT.changeVectorElementTypeToInteger();
1115 }
1116
1117 /// getRegClassFor - Return the register class that should be used for the
1118 /// specified value type.
1119 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1120   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1121   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1122   // load / store 4 to 8 consecutive D registers.
1123   if (Subtarget->hasNEON()) {
1124     if (VT == MVT::v4i64)
1125       return &ARM::QQPRRegClass;
1126     if (VT == MVT::v8i64)
1127       return &ARM::QQQQPRRegClass;
1128   }
1129   return TargetLowering::getRegClassFor(VT);
1130 }
1131
1132 // Create a fast isel object.
1133 FastISel *
1134 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1135                                   const TargetLibraryInfo *libInfo) const {
1136   return ARM::createFastISel(funcInfo, libInfo);
1137 }
1138
1139 /// getMaximalGlobalOffset - Returns the maximal possible offset which can
1140 /// be used for loads / stores from the global.
1141 unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
1142   return (Subtarget->isThumb1Only() ? 127 : 4095);
1143 }
1144
1145 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1146   unsigned NumVals = N->getNumValues();
1147   if (!NumVals)
1148     return Sched::RegPressure;
1149
1150   for (unsigned i = 0; i != NumVals; ++i) {
1151     EVT VT = N->getValueType(i);
1152     if (VT == MVT::Glue || VT == MVT::Other)
1153       continue;
1154     if (VT.isFloatingPoint() || VT.isVector())
1155       return Sched::ILP;
1156   }
1157
1158   if (!N->isMachineOpcode())
1159     return Sched::RegPressure;
1160
1161   // Load are scheduled for latency even if there instruction itinerary
1162   // is not available.
1163   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
1164   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1165
1166   if (MCID.getNumDefs() == 0)
1167     return Sched::RegPressure;
1168   if (!Itins->isEmpty() &&
1169       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1170     return Sched::ILP;
1171
1172   return Sched::RegPressure;
1173 }
1174
1175 //===----------------------------------------------------------------------===//
1176 // Lowering Code
1177 //===----------------------------------------------------------------------===//
1178
1179 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1180 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1181   switch (CC) {
1182   default: llvm_unreachable("Unknown condition code!");
1183   case ISD::SETNE:  return ARMCC::NE;
1184   case ISD::SETEQ:  return ARMCC::EQ;
1185   case ISD::SETGT:  return ARMCC::GT;
1186   case ISD::SETGE:  return ARMCC::GE;
1187   case ISD::SETLT:  return ARMCC::LT;
1188   case ISD::SETLE:  return ARMCC::LE;
1189   case ISD::SETUGT: return ARMCC::HI;
1190   case ISD::SETUGE: return ARMCC::HS;
1191   case ISD::SETULT: return ARMCC::LO;
1192   case ISD::SETULE: return ARMCC::LS;
1193   }
1194 }
1195
1196 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1197 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1198                         ARMCC::CondCodes &CondCode2) {
1199   CondCode2 = ARMCC::AL;
1200   switch (CC) {
1201   default: llvm_unreachable("Unknown FP condition!");
1202   case ISD::SETEQ:
1203   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1204   case ISD::SETGT:
1205   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1206   case ISD::SETGE:
1207   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1208   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1209   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1210   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1211   case ISD::SETO:   CondCode = ARMCC::VC; break;
1212   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1213   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1214   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1215   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1216   case ISD::SETLT:
1217   case ISD::SETULT: CondCode = ARMCC::LT; break;
1218   case ISD::SETLE:
1219   case ISD::SETULE: CondCode = ARMCC::LE; break;
1220   case ISD::SETNE:
1221   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1222   }
1223 }
1224
1225 //===----------------------------------------------------------------------===//
1226 //                      Calling Convention Implementation
1227 //===----------------------------------------------------------------------===//
1228
1229 #include "ARMGenCallingConv.inc"
1230
1231 /// CCAssignFnForNode - Selects the correct CCAssignFn for a the
1232 /// given CallingConvention value.
1233 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1234                                                  bool Return,
1235                                                  bool isVarArg) const {
1236   switch (CC) {
1237   default:
1238     llvm_unreachable("Unsupported calling convention");
1239   case CallingConv::Fast:
1240     if (Subtarget->hasVFP2() && !isVarArg) {
1241       if (!Subtarget->isAAPCS_ABI())
1242         return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1243       // For AAPCS ABI targets, just use VFP variant of the calling convention.
1244       return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1245     }
1246     // Fallthrough
1247   case CallingConv::C: {
1248     // Use target triple & subtarget features to do actual dispatch.
1249     if (!Subtarget->isAAPCS_ABI())
1250       return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1251     else if (Subtarget->hasVFP2() &&
1252              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1253              !isVarArg)
1254       return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1255     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1256   }
1257   case CallingConv::ARM_AAPCS_VFP:
1258     if (!isVarArg)
1259       return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1260     // Fallthrough
1261   case CallingConv::ARM_AAPCS:
1262     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1263   case CallingConv::ARM_APCS:
1264     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1265   case CallingConv::GHC:
1266     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1267   }
1268 }
1269
1270 /// LowerCallResult - Lower the result values of a call into the
1271 /// appropriate copies out of appropriate physical registers.
     ///
     /// The result locations are computed with the return variant of the calling
     /// convention (CCAssignFnForNode with Return == true). Every register copy
     /// is threaded through \p Chain and \p InFlag, and one SDValue per returned
     /// value is appended to \p InVals.
     ///
     /// \param isThisReturn  When true, result 0 is not copied out of a register;
     ///                      \p ThisVal is appended to \p InVals instead.
     /// \param ThisVal       The value forwarded as result 0 when
     ///                      \p isThisReturn is set.
     /// \returns the chain produced by the last register copy (or \p Chain
     ///          unchanged if no copy was emitted).
1272 SDValue
1273 ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1274                                    CallingConv::ID CallConv, bool isVarArg,
1275                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1276                                    SDLoc dl, SelectionDAG &DAG,
1277                                    SmallVectorImpl<SDValue> &InVals,
1278                                    bool isThisReturn, SDValue ThisVal) const {
1279
1280   // Assign locations to each value returned by this call.
1281   SmallVector<CCValAssign, 16> RVLocs;
1282   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1283                     getTargetMachine(), RVLocs, *DAG.getContext(), Call);
1284   CCInfo.AnalyzeCallResult(Ins,
1285                            CCAssignFnForNode(CallConv, /* Return*/ true,
1286                                              isVarArg));
1287
1288   // Copy all of the result registers out of their specified physreg.
1289   for (unsigned i = 0; i != RVLocs.size(); ++i) {
         // VA is a by-value copy, so the "VA = RVLocs[++i]" reassignments below
         // only retarget this local and never modify RVLocs itself.
1290     CCValAssign VA = RVLocs[i];
1291
1292     // Pass 'this' value directly from the argument to return value, to avoid
1293     // reg unit interference
1294     if (i == 0 && isThisReturn) {
1295       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1296              "unexpected return calling convention register assignment");
1297       InVals.push_back(ThisVal);
1298       continue;
1299     }
1300
1301     SDValue Val;
1302     if (VA.needsCustom()) {
1303       // Handle f64 or half of a v2f64.
           // The f64 arrives as two i32 register locations (Lo first, then Hi)
           // that are recombined with VMOVDRR, so each f64 consumes two entries
           // of RVLocs.
1304       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1305                                       InFlag);
1306       Chain = Lo.getValue(1);
1307       InFlag = Lo.getValue(2);
1308       VA = RVLocs[++i]; // skip ahead to next loc
1309       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1310                                       InFlag);
1311       Chain = Hi.getValue(1);
1312       InFlag = Hi.getValue(2);
1313       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1314
1315       if (VA.getLocVT() == MVT::v2f64) {
             // The f64 built above is element 0 of the vector; two further
             // locations provide the halves of element 1.
1316         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1317         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1318                           DAG.getConstant(0, MVT::i32));
1319
1320         VA = RVLocs[++i]; // skip ahead to next loc
1321         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1322         Chain = Lo.getValue(1);
1323         InFlag = Lo.getValue(2);
1324         VA = RVLocs[++i]; // skip ahead to next loc
1325         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1326         Chain = Hi.getValue(1);
1327         InFlag = Hi.getValue(2);
1328         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1329         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1330                           DAG.getConstant(1, MVT::i32));
1331       }
1332     } else {
           // Plain case: a single copy in the location's own value type.
1333       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1334                                InFlag);
1335       Chain = Val.getValue(1);
1336       InFlag = Val.getValue(2);
1337     }
1338
         // Convert from the location type back to the value type. Only Full (no
         // conversion) and BCvt (bitcast) are handled; any other LocInfo hits
         // llvm_unreachable. For custom locations VA is the last location
         // consumed above.
1339     switch (VA.getLocInfo()) {
1340     default: llvm_unreachable("Unknown loc info!");
1341     case CCValAssign::Full: break;
1342     case CCValAssign::BCvt:
1343       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1344       break;
1345     }
1346
1347     InVals.push_back(Val);
1348   }
1349
1350   return Chain;
1351 }
1352
1353 /// LowerMemOpCallTo - Store the argument to the stack.
     ///
     /// Emits a store of \p Arg to the address \p StackPtr plus the memory offset
     /// recorded in \p VA, chained on \p Chain.
     ///
     /// \param Flags  Not read by this function body.
     /// \returns the store node created by DAG.getStore.
1354 SDValue
1355 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
1356                                     SDValue StackPtr, SDValue Arg,
1357                                     SDLoc dl, SelectionDAG &DAG,
1358                                     const CCValAssign &VA,
1359                                     ISD::ArgFlagsTy Flags) const {
       // Destination address = StackPtr + offset assigned to this argument.
1360   unsigned LocMemOffset = VA.getLocMemOffset();
1361   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1362   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1363   return DAG.getStore(Chain, dl, Arg, PtrOff,
1364                       MachinePointerInfo::getStack(LocMemOffset),
1365                       false, false, 0);
1366 }
1367
     /// PassF64ArgInRegs - Split an f64 argument into two i32 halves and pass
     /// them in the locations described by \p VA and \p NextVA.
     ///
     /// \p Arg is split with a VMOVRRD node that produces two i32 results. Result
     /// 0 is always queued in \p RegsToPass for the register of \p VA. Result 1 is
     /// queued for the register of \p NextVA when that is a register location;
     /// otherwise it is stored to the stack slot of \p NextVA and the store is
     /// appended to \p MemOpChains.
     ///
     /// \param StackPtr  In/out: if it holds no node yet and a stack store is
     ///                  needed, it is initialised with a copy from ARM::SP.
1368 void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
1369                                          SDValue Chain, SDValue &Arg,
1370                                          RegsToPassVector &RegsToPass,
1371                                          CCValAssign &VA, CCValAssign &NextVA,
1372                                          SDValue &StackPtr,
1373                                          SmallVectorImpl<SDValue> &MemOpChains,
1374                                          ISD::ArgFlagsTy Flags) const {
1375
1376   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1377                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
       // First half always goes in the register assigned to VA.
1378   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
1379
       // Second half: register if one was assigned, otherwise the stack.
1380   if (NextVA.isRegLoc())
1381     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
1382   else {
1383     assert(NextVA.isMemLoc());
         // Create the stack pointer copy lazily if the caller did not supply one.
1384     if (StackPtr.getNode() == 0)
1385       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1386
1387     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
1388                                            dl, DAG, NextVA,
1389                                            Flags));
1390   }
1391 }
1392
1393 /// LowerCall - Lowering a call into a callseq_start <-
1394 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
1395 /// nodes.
     ///
     /// Steps performed, in order:
     ///  1. Decide whether a requested tail call can really be done (sibcall).
     ///  2. Assign a location to every outgoing operand and emit CALLSEQ_START
     ///     (skipped for sibcalls).
     ///  3. Walk the locations, queueing register arguments in RegsToPass and
     ///     collecting stack stores / byval copies in MemOpChains.
     ///  4. Emit the CopyToReg nodes for the register arguments.
     ///  5. Rewrite the callee into a target node or a constant-pool load.
     ///  6. Pick the call opcode, build the operand list (chain, callee, argument
     ///     registers, register mask, optional glue) and emit either TC_RETURN or
     ///     the call followed by CALLSEQ_END and LowerCallResult.
     ///
     /// \param CLI     Call description. CLI.IsTailCall is bound by reference and
     ///                is cleared here when the tail call is rejected.
     /// \param InVals  Receives the call's result values (filled in by
     ///                LowerCallResult; left untouched on the TC_RETURN path).
     /// \returns the TC_RETURN node for tail calls, otherwise the chain returned
     ///          by LowerCallResult.
1396 SDValue
1397 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1398                              SmallVectorImpl<SDValue> &InVals) const {
1399   SelectionDAG &DAG                     = CLI.DAG;
1400   SDLoc &dl                          = CLI.DL;
1401   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1402   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1403   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1404   SDValue Chain                         = CLI.Chain;
1405   SDValue Callee                        = CLI.Callee;
1406   bool &isTailCall                      = CLI.IsTailCall;
1407   CallingConv::ID CallConv              = CLI.CallConv;
1408   bool doesNotRet                       = CLI.DoesNotReturn;
1409   bool isVarArg                         = CLI.IsVarArg;
1410
1411   MachineFunction &MF = DAG.getMachineFunction();
1412   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1413   bool isThisReturn   = false;
1414   bool isSibCall      = false;
1415   // Disable tail calls if they're not supported.
       // (Tail calls stay enabled if either EnableARMTailCalls is set or the
       // subtarget reports support.)
1416   if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
1417     isTailCall = false;
1418   if (isTailCall) {
1419     // Check if it's really possible to do a tail call.
1420     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1421                     isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
1422                                                    Outs, OutVals, Ins, DAG);
1423     // We don't support GuaranteedTailCallOpt for ARM, only automatically
1424     // detected sibcalls.
1425     if (isTailCall) {
1426       ++NumTailCalls;
1427       isSibCall = true;
1428     }
1429   }
1430
1431   // Analyze operands of the call, assigning locations to each operand.
1432   SmallVector<CCValAssign, 16> ArgLocs;
1433   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1434                  getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
1435   CCInfo.AnalyzeCallOperands(Outs,
1436                              CCAssignFnForNode(CallConv, /* Return*/ false,
1437                                                isVarArg));
1438
1439   // Get a count of how many bytes are to be pushed on the stack.
1440   unsigned NumBytes = CCInfo.getNextStackOffset();
1441
1442   // For tail calls, memory operands are available in our caller's stack.
1443   if (isSibCall)
1444     NumBytes = 0;
1445
1446   // Adjust the stack pointer for the new arguments...
1447   // These operations are automatically eliminated by the prolog/epilog pass
1448   if (!isSibCall)
1449     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
1450                                  dl);
1451
1452   SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1453
1454   RegsToPassVector RegsToPass;
1455   SmallVector<SDValue, 8> MemOpChains;
1456
1457   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1458   // of tail call optimization, arguments are handled later.
       // i indexes ArgLocs and realArgIdx indexes Outs/OutVals. They advance
       // together in the loop header, but i is bumped additionally inside the
       // body for arguments that occupy several locations (split f64/v2f64).
1459   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1460        i != e;
1461        ++i, ++realArgIdx) {
1462     CCValAssign &VA = ArgLocs[i];
1463     SDValue Arg = OutVals[realArgIdx];
1464     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1465     bool isByVal = Flags.isByVal();
1466
1467     // Promote the value if needed.
1468     switch (VA.getLocInfo()) {
1469     default: llvm_unreachable("Unknown loc info!");
1470     case CCValAssign::Full: break;
1471     case CCValAssign::SExt:
1472       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1473       break;
1474     case CCValAssign::ZExt:
1475       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1476       break;
1477     case CCValAssign::AExt:
1478       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1479       break;
1480     case CCValAssign::BCvt:
1481       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1482       break;
1483     }
1484
1485     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1486     if (VA.needsCustom()) {
1487       if (VA.getLocVT() == MVT::v2f64) {
             // A v2f64 is passed as its two f64 elements, each handled like a
             // split f64.
1488         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1489                                   DAG.getConstant(0, MVT::i32));
1490         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1491                                   DAG.getConstant(1, MVT::i32));
1492
1493         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1494                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1495
             // NOTE: VA is a reference into ArgLocs, so this assignment copies
             // the next location over the element VA refers to rather than
             // rebinding VA. The code below then reads that copied value.
1496         VA = ArgLocs[++i]; // skip ahead to next loc
1497         if (VA.isRegLoc()) {
1498           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1499                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1500         } else {
1501           assert(VA.isMemLoc());
1502
1503           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1504                                                  dl, DAG, VA, Flags));
1505         }
1506       } else {
1507         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1508                          StackPtr, MemOpChains, Flags);
1509       }
1510     } else if (VA.isRegLoc()) {
           // Detect a 'returned' i32 first argument so the result can later be
           // forwarded from the argument instead of being copied from a register.
1511       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
1512         assert(VA.getLocVT() == MVT::i32 &&
1513                "unexpected calling convention register assignment");
1514         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1515                "unexpected use of 'returned'");
1516         isThisReturn = true;
1517       }
1518       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1519     } else if (isByVal) {
1520       assert(VA.isMemLoc());
           // Number of registers (4-byte units) that carry the head of this
           // byval aggregate; stays 0 if it lives entirely in memory.
1521       unsigned offset = 0;
1522
1523       // True if this byval aggregate will be split between registers
1524       // and memory.
1525       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1526       unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
1527
1528       if (CurByValIdx < ByValArgsCount) {
1529
1530         unsigned RegBegin, RegEnd;
1531         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1532
1533         EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
             // NOTE: this inner i shadows the outer ArgLocs index; here it
             // counts words within the aggregate while j is the register.
1534         unsigned int i, j;
1535         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
               // Load word i of the aggregate and queue it for register j.
1536           SDValue Const = DAG.getConstant(4*i, MVT::i32);
1537           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1538           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1539                                      MachinePointerInfo(),
1540                                      false, false, false, 0);
1541           MemOpChains.push_back(Load.getValue(1));
1542           RegsToPass.push_back(std::make_pair(j, Load));
1543         }
1544
1545         // If the parameter size exceeds the register area, "offset" helps
1546         // us calculate the stack slot for the remaining part properly.
1547         offset = RegEnd - RegBegin;
1548
1549         CCInfo.nextInRegsParam();
1550       }
1551
           // Copy whatever did not fit in registers (bytes past 4*offset) to the
           // argument's stack slot with a COPY_STRUCT_BYVAL node.
1552       if (Flags.getByValSize() > 4*offset) {
1553         unsigned LocMemOffset = VA.getLocMemOffset();
1554         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
1555         SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
1556                                   StkPtrOff);
1557         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
1558         SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
1559         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
1560                                            MVT::i32);
1561         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
1562
1563         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1564         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1565         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1566                                           Ops, array_lengthof(Ops)));
1567       }
1568     } else if (!isSibCall) {
           // Ordinary stack argument. For sibcalls no store is emitted (see the
           // comment on NumBytes above).
1569       assert(VA.isMemLoc());
1570
1571       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1572                                              dl, DAG, VA, Flags));
1573     }
1574   }
1575
       // Join all argument stores/loads so the call depends on every one of them.
1576   if (!MemOpChains.empty())
1577     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1578                         &MemOpChains[0], MemOpChains.size());
1579
1580   // Build a sequence of copy-to-reg nodes chained together with token chain
1581   // and flag operands which copy the outgoing args into the appropriate regs.
1582   SDValue InFlag;
1583   // Tail call byval lowering might overwrite argument registers so in case of
1584   // tail call optimization the copies to registers are lowered later.
1585   if (!isTailCall)
1586     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1587       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1588                                RegsToPass[i].second, InFlag);
1589       InFlag = Chain.getValue(1);
1590     }
1591
1592   // For tail calls lower the arguments to the 'real' stack slot.
1593   if (isTailCall) {
1594     // Force all the incoming stack arguments to be loaded from the stack
1595     // before any new outgoing arguments are stored to the stack, because the
1596     // outgoing stack slots may alias the incoming argument stack slots, and
1597     // the alias isn't otherwise explicit. This is slightly more conservative
1598     // than necessary, because it means that each store effectively depends
1599     // on every argument instead of just those arguments it would clobber.
1600
1601     // Do not flag preceding copytoreg stuff together with the following stuff.
1602     InFlag = SDValue();
1603     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1604       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1605                                RegsToPass[i].second, InFlag);
1606       InFlag = Chain.getValue(1);
1607     }
         // Clearing InFlag here means no glue operand is appended to the
         // TC_RETURN operand list built below.
1608     InFlag = SDValue();
1609   }
1610
1611   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1612   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1613   // node so that legalize doesn't hack it.
1614   bool isDirect = false;
1615   bool isARMFunc = false;
1616   bool isLocalARMFunc = false;
1617   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
1618
1619   if (EnableARMLongCalls) {
1620     assert (getTargetMachine().getRelocationModel() == Reloc::Static
1621             && "long-calls with non-static relocation model!");
1622     // Handle a global address or an external symbol. If it's not one of
1623     // those, the target's already in a register, so we don't need to do
1624     // anything extra.
         // In both cases the callee address is loaded from a constant pool
         // entry, and isDirect stays false.
1625     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1626       const GlobalValue *GV = G->getGlobal();
1627       // Create a constant pool entry for the callee address
1628       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1629       ARMConstantPoolValue *CPV =
1630         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
1631
1632       // Get the address of the callee into a register
1633       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1634       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1635       Callee = DAG.getLoad(getPointerTy(), dl,
1636                            DAG.getEntryNode(), CPAddr,
1637                            MachinePointerInfo::getConstantPool(),
1638                            false, false, false, 0);
1639     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
1640       const char *Sym = S->getSymbol();
1641
1642       // Create a constant pool entry for the callee address
1643       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1644       ARMConstantPoolValue *CPV =
1645         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1646                                       ARMPCLabelIndex, 0);
1647       // Get the address of the callee into a register
1648       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1649       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1650       Callee = DAG.getLoad(getPointerTy(), dl,
1651                            DAG.getEntryNode(), CPAddr,
1652                            MachinePointerInfo::getConstantPool(),
1653                            false, false, false, 0);
1654     }
1655   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1656     const GlobalValue *GV = G->getGlobal();
1657     isDirect = true;
1658     bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
1659     bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
1660                    getTargetMachine().getRelocationModel() != Reloc::Static;
1661     isARMFunc = !Subtarget->isThumb() || isStub;
1662     // ARM call to a local ARM function is predicable.
1663     isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
1664     // tBX takes a register source operand.
1665     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
           // Materialise the callee address in a register: load it from the
           // constant pool and apply PIC_ADD with the matching label id.
1666       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1667       ARMConstantPoolValue *CPV =
1668         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
1669       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1670       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1671       Callee = DAG.getLoad(getPointerTy(), dl,
1672                            DAG.getEntryNode(), CPAddr,
1673                            MachinePointerInfo::getConstantPool(),
1674                            false, false, false, 0);
1675       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
1676       Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
1677                            getPointerTy(), Callee, PICLabel);
1678     } else {
1679       // On ELF targets for PIC code, direct calls should go through the PLT
1680       unsigned OpFlags = 0;
1681       if (Subtarget->isTargetELF() &&
1682           getTargetMachine().getRelocationModel() == Reloc::PIC_)
1683         OpFlags = ARMII::MO_PLT;
1684       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
1685     }
1686   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
         // Same scheme as the GlobalAddress case, for a symbol given by name.
         // isLocalARMFunc is left false on this path.
1687     isDirect = true;
1688     bool isStub = Subtarget->isTargetDarwin() &&
1689                   getTargetMachine().getRelocationModel() != Reloc::Static;
1690     isARMFunc = !Subtarget->isThumb() || isStub;
1691     // tBX takes a register source operand.
1692     const char *Sym = S->getSymbol();
1693     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1694       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1695       ARMConstantPoolValue *CPV =
1696         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1697                                       ARMPCLabelIndex, 4);
1698       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1699       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1700       Callee = DAG.getLoad(getPointerTy(), dl,
1701                            DAG.getEntryNode(), CPAddr,
1702                            MachinePointerInfo::getConstantPool(),
1703                            false, false, false, 0);
1704       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
1705       Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
1706                            getPointerTy(), Callee, PICLabel);
1707     } else {
1708       unsigned OpFlags = 0;
1709       // On ELF targets for PIC code, direct calls should go through the PLT
1710       if (Subtarget->isTargetELF() &&
1711                   getTargetMachine().getRelocationModel() == Reloc::PIC_)
1712         OpFlags = ARMII::MO_PLT;
1713       Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
1714     }
1715   }
1716
1717   // FIXME: handle tail calls differently.
       // Select the call opcode. CallOpc is only used on the non-tail-call path;
       // tail calls always emit TC_RETURN below.
1718   unsigned CallOpc;
1719   bool HasMinSizeAttr = MF.getFunction()->getAttributes().
1720     hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
1721   if (Subtarget->isThumb()) {
         // Thumb without v5T ops: indirect calls and calls to ARM-mode functions
         // use CALL_NOLINK.
1722     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
1723       CallOpc = ARMISD::CALL_NOLINK;
1724     else
1725       CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
1726   } else {
1727     if (!isDirect && !Subtarget->hasV5TOps())
1728       CallOpc = ARMISD::CALL_NOLINK;
1729     else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
1730                // Emit regular call when code size is the priority
1731                !HasMinSizeAttr)
1732       // "mov lr, pc; b _foo" to avoid confusing the RSP
1733       CallOpc = ARMISD::CALL_NOLINK;
1734     else
1735       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
1736   }
1737
       // Operand list layout: chain, callee, argument registers, register mask,
       // then the glue value if there is one.
1738   std::vector<SDValue> Ops;
1739   Ops.push_back(Chain);
1740   Ops.push_back(Callee);
1741
1742   // Add argument registers to the end of the list so that they are known live
1743   // into the call.
1744   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1745     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1746                                   RegsToPass[i].second.getValueType()));
1747
1748   // Add a register mask operand representing the call-preserved registers.
1749   const uint32_t *Mask;
1750   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1751   const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
1752   if (isThisReturn) {
1753     // For 'this' returns, use the R0-preserving mask if applicable
1754     Mask = ARI->getThisReturnPreservedMask(CallConv);
1755     if (!Mask) {
1756       // Set isThisReturn to false if the calling convention is not one that
1757       // allows 'returned' to be modeled in this way, so LowerCallResult does
1758       // not try to pass 'this' straight through
1759       isThisReturn = false;
1760       Mask = ARI->getCallPreservedMask(CallConv);
1761     }
1762   } else
1763     Mask = ARI->getCallPreservedMask(CallConv);
1764
1765   assert(Mask && "Missing call preserved mask for calling convention");
1766   Ops.push_back(DAG.getRegisterMask(Mask));
1767
1768   if (InFlag.getNode())
1769     Ops.push_back(InFlag);
1770
1771   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
       // Tail calls end here: no CALLSEQ_END and no result copies are emitted.
1772   if (isTailCall)
1773     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1774
1775   // Returns a chain and a flag for retval copy to use.
1776   Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
1777   InFlag = Chain.getValue(1);
1778
1779   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1780                              DAG.getIntPtrConstant(0, true), InFlag, dl);
       // Only pick up the CALLSEQ_END glue when there are results to copy out.
1781   if (!Ins.empty())
1782     InFlag = Chain.getValue(1);
1783
1784   // Handle result values, copying them out of physregs into vregs that we
1785   // return.
1786   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
1787                          InVals, isThisReturn,
1788                          isThisReturn ? OutVals[0] : SDValue());
1789 }
1790
1791 /// HandleByVal - Every parameter *after* a byval parameter is passed
1792 /// on the stack.  Remember the next parameter register to allocate,
1793 /// and then confiscate the rest of the parameter registers to ensure
1794 /// this.
     ///
     /// \param State  Calling-convention state; registers are allocated from
     ///               GPRArgRegs and the register range used by the byval
     ///               parameter is recorded with addInRegsParamInfo.
     /// \param size   In/out byval size in bytes. At a call site it is reduced by
     ///               the part that is passed in registers (0 if the whole
     ///               parameter fits); in a prologue it is left unchanged.
     /// \param Align  Byval alignment in bytes; under AAPCS a value above 4 causes
     ///               leading registers to be skipped.
1795 void
1796 ARMTargetLowering::HandleByVal(
1797     CCState *State, unsigned &size, unsigned Align) const {
1798   unsigned reg = State->AllocateReg(GPRArgRegs, 4);
1799   assert((State->getCallOrPrologue() == Prologue ||
1800           State->getCallOrPrologue() == Call) &&
1801          "unhandled ParmContext");
1802
1803   // For in-prologue parameter handling, we also introduce a stack offset
1804   // for byval registers: see CallingConvLower.cpp, CCState::HandleByVal.
1805   // This behaviour is outside the AAPCS rules (5.5 Parameters Passing) of how
1806   // NSAA should be evaluated (NSAA means "next stacked argument address").
1807   // So: NextStackOffset = NSAAOffset + SizeOfByValParamsStoredInRegs.
1808   // Then: NSAAOffset = NextStackOffset - SizeOfByValParamsStoredInRegs.
1809   unsigned NSAAOffset = State->getNextStackOffset();
1810   if (State->getCallOrPrologue() != Call) {
1811     for (unsigned i = 0, e = State->getInRegsParamsCount(); i != e; ++i) {
1812       unsigned RB, RE;
1813       State->getInRegsParamInfo(i, RB, RE);
1814       assert(NSAAOffset >= (RE-RB)*4 &&
1815              "Stack offset for byval regs doesn't introduced anymore?");
1816       NSAAOffset -= (RE-RB)*4;
1817     }
1818   }
       // Only proceed if an argument register R0..R3 was obtained above.
1819   if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
1820     if (Subtarget->isAAPCS_ABI() && Align > 4) {
           // Skip registers until the number of registers from reg up to R4 is a
           // multiple of the alignment expressed in registers.
1821       unsigned AlignInRegs = Align / 4;
1822       unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
1823       for (unsigned i = 0; i < Waste; ++i)
1824         reg = State->AllocateReg(GPRArgRegs, 4);
1825     }
         // reg may have become 0 above if the skipping ran out of registers
         // (AllocateReg's result is treated as 0 when none is left, as in the
         // while loop below).
1826     if (reg != 0) {
           // Bytes available in the registers from reg up to (excluding) R4.
1827       unsigned excess = 4 * (ARM::R4 - reg);
1828
1829       // Special case when NSAA != SP and parameter size greater than size of
1830       // all remaining GPR regs. In that case we can't split the parameter, we
1831       // must send it to the stack. We also must set NCRN to R4, so waste all
1832       // remaining registers.
1833       if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
1834         while (State->AllocateReg(GPRArgRegs, 4))
1835           ;
1836         return;
1837       }
1838
1839       // First register for byval parameter is the first register that wasn't
1840       // allocated before this method call, so it would be "reg".
1841       // If parameter is small enough to be saved in range [reg, r4), then
1842       // the end (first after last) register would be reg + param-size-in-regs,
1843       // else the parameter would be split between registers and stack, and
1844       // the end register would be r4 in this case.
1845       unsigned ByValRegBegin = reg;
1846       unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
1847       State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
1848       // Note, first register is allocated in the beginning of function already,
1849       // allocate the remaining amount of registers we need.
1850       for (unsigned i = reg+1; i != ByValRegEnd; ++i)
1851         State->AllocateReg(GPRArgRegs, 4);
1852       // At a call site, a byval parameter that is split between
1853       // registers and memory needs its size truncated here.  In a
1854       // function prologue, such byval parameters are reassembled in
1855       // memory, and are not truncated.
1856       if (State->getCallOrPrologue() == Call) {
1857         // Make the remaining size equal to 0 in the case when
1858         // the whole structure may be stored into registers.
1859         if (size < excess)
1860           size = 0;
1861         else
1862           size -= excess;
1863       }
1864     }
1865   }
1866 }
1867
1868 /// MatchingStackOffset - Return true if the given stack call argument is
1869 /// already available in the same position (relatively) of the caller's
1870 /// incoming argument stack.
     ///
     /// A frame index is recovered from \p Arg in one of two ways: \p Arg is a
     /// CopyFromReg of a virtual register whose defining instruction is a load
     /// from a stack slot, or \p Arg is a load whose base pointer is a frame
     /// index node. Any other shape, and any byval argument, yields false.
     ///
     /// \param Offset  Stack offset the outgoing argument would be stored at.
     /// \returns true only if the frame index is a fixed object whose offset
     ///          equals \p Offset and whose size equals the size of \p Arg.
1871 static
1872 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
1873                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
1874                          const TargetInstrInfo *TII) {
       // Size of the argument value in bytes.
1875   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
       // INT_MAX is the "no frame index found" sentinel checked by the assert
       // further down.
1876   int FI = INT_MAX;
1877   if (Arg.getOpcode() == ISD::CopyFromReg) {
1878     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
1879     if (!TargetRegisterInfo::isVirtualRegister(VR))
1880       return false;
1881     MachineInstr *Def = MRI->getVRegDef(VR);
1882     if (!Def)
1883       return false;
1884     if (!Flags.isByVal()) {
           // On success FI is expected to have been set by isLoadFromStackSlot
           // (see the assert below).
1885       if (!TII->isLoadFromStackSlot(Def, FI))
1886         return false;
1887     } else {
1888       return false;
1889     }
1890   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
1891     if (Flags.isByVal())
1892       // ByVal argument is passed in as a pointer but it's now being
1893       // dereferenced. e.g.
1894       // define @foo(%struct.X* %A) {
1895       //   tail call @bar(%struct.X* byval %A)
1896       // }
1897       return false;
1898     SDValue Ptr = Ld->getBasePtr();
1899     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
1900     if (!FINode)
1901       return false;
1902     FI = FINode->getIndex();
1903   } else
1904     return false;
1905
1906   assert(FI != INT_MAX);
       // Only fixed frame objects are considered.
1907   if (!MFI->isFixedObjectIndex(FI))
1908     return false;
1909   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
1910 }
1911
1912 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
1913 /// for tail call optimization. Targets which want to do tail call
1914 /// optimization should implement this function.
1915 bool
1916 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1917                                                      CallingConv::ID CalleeCC,
1918                                                      bool isVarArg,
1919                                                      bool isCalleeStructRet,
1920                                                      bool isCallerStructRet,
1921                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
1922                                     const SmallVectorImpl<SDValue> &OutVals,
1923                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1924                                                      SelectionDAG& DAG) const {
1925   const Function *CallerF = DAG.getMachineFunction().getFunction();
1926   CallingConv::ID CallerCC = CallerF->getCallingConv();
1927   bool CCMatch = CallerCC == CalleeCC;
1928
1929   // Look for obvious safe cases to perform tail call optimization that do not
1930   // require ABI changes. This is what gcc calls sibcall.
1931
1932   // Do not sibcall optimize vararg calls unless the call site is not passing
1933   // any arguments.
1934   if (isVarArg && !Outs.empty())
1935     return false;
1936
1937   // Also avoid sibcall optimization if either caller or callee uses struct
1938   // return semantics.
1939   if (isCalleeStructRet || isCallerStructRet)
1940     return false;
1941
1942   // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
1943   // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
1944   // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
1945   // support in the assembler and linker to be used. This would need to be
1946   // fixed to fully support tail calls in Thumb1.
1947   //
1948   // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
1949   // LR.  This means if we need to reload LR, it takes an extra instructions,
1950   // which outweighs the value of the tail call; but here we don't know yet
1951   // whether LR is going to be used.  Probably the right approach is to
1952   // generate the tail call here and turn it back into CALL/RET in
1953   // emitEpilogue if LR is used.
1954
1955   // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
1956   // but we need to make sure there are enough registers; the only valid
1957   // registers are the 4 used for parameters.  We don't currently do this
1958   // case.
1959   if (Subtarget->isThumb1Only())
1960     return false;
1961
1962   // If the calling conventions do not match, then we'd better make sure the
1963   // results are returned in the same way as what the caller expects.
1964   if (!CCMatch) {
1965     SmallVector<CCValAssign, 16> RVLocs1;
1966     ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1967                        getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
1968     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
1969
1970     SmallVector<CCValAssign, 16> RVLocs2;
1971     ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1972                        getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
1973     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
1974
1975     if (RVLocs1.size() != RVLocs2.size())
1976       return false;
1977     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1978       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1979         return false;
1980       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1981         return false;
1982       if (RVLocs1[i].isRegLoc()) {
1983         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1984           return false;
1985       } else {
1986         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1987           return false;
1988       }
1989     }
1990   }
1991
1992   // If Caller's vararg or byval argument has been split between registers and
1993   // stack, do not perform tail call, since part of the argument is in caller's
1994   // local frame.
1995   const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
1996                                       getInfo<ARMFunctionInfo>();
1997   if (AFI_Caller->getArgRegsSaveSize())
1998     return false;
1999
2000   // If the callee takes no arguments then go on to check the results of the
2001   // call.
2002   if (!Outs.empty()) {
2003     // Check if stack adjustment is needed. For now, do not do this if any
2004     // argument is passed on the stack.
2005     SmallVector<CCValAssign, 16> ArgLocs;
2006     ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2007                       getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
2008     CCInfo.AnalyzeCallOperands(Outs,
2009                                CCAssignFnForNode(CalleeCC, false, isVarArg));
2010     if (CCInfo.getNextStackOffset()) {
2011       MachineFunction &MF = DAG.getMachineFunction();
2012
2013       // Check if the arguments are already laid out in the right way as
2014       // the caller's fixed stack objects.
2015       MachineFrameInfo *MFI = MF.getFrameInfo();
2016       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2017       const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
2018       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2019            i != e;
2020            ++i, ++realArgIdx) {
2021         CCValAssign &VA = ArgLocs[i];
2022         EVT RegVT = VA.getLocVT();
2023         SDValue Arg = OutVals[realArgIdx];
2024         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2025         if (VA.getLocInfo() == CCValAssign::Indirect)
2026           return false;
2027         if (VA.needsCustom()) {
2028           // f64 and vector types are split into multiple registers or
2029           // register/stack-slot combinations.  The types will not match
2030           // the registers; give up on memory f64 refs until we figure
2031           // out what to do about this.
2032           if (!VA.isRegLoc())
2033             return false;
2034           if (!ArgLocs[++i].isRegLoc())
2035             return false;
2036           if (RegVT == MVT::v2f64) {
2037             if (!ArgLocs[++i].isRegLoc())
2038               return false;
2039             if (!ArgLocs[++i].isRegLoc())
2040               return false;
2041           }
2042         } else if (!VA.isRegLoc()) {
2043           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2044                                    MFI, MRI, TII))
2045             return false;
2046         }
2047       }
2048     }
2049   }
2050
2051   return true;
2052 }
2053
2054 bool
2055 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2056                                   MachineFunction &MF, bool isVarArg,
2057                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2058                                   LLVMContext &Context) const {
2059   SmallVector<CCValAssign, 16> RVLocs;
2060   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
2061   return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
2062                                                     isVarArg));
2063 }
2064
2065 SDValue
2066 ARMTargetLowering::LowerReturn(SDValue Chain,
2067                                CallingConv::ID CallConv, bool isVarArg,
2068                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2069                                const SmallVectorImpl<SDValue> &OutVals,
2070                                SDLoc dl, SelectionDAG &DAG) const {
2071
2072   // CCValAssign - represent the assignment of the return value to a location.
2073   SmallVector<CCValAssign, 16> RVLocs;
2074
2075   // CCState - Info about the registers and stack slots.
2076   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2077                     getTargetMachine(), RVLocs, *DAG.getContext(), Call);
2078
2079   // Analyze outgoing return values.
2080   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
2081                                                isVarArg));
2082
2083   SDValue Flag;
2084   SmallVector<SDValue, 4> RetOps;
2085   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2086
2087   // Copy the result values into the output registers.
2088   for (unsigned i = 0, realRVLocIdx = 0;
2089        i != RVLocs.size();
2090        ++i, ++realRVLocIdx) {
2091     CCValAssign &VA = RVLocs[i];
2092     assert(VA.isRegLoc() && "Can only return in registers!");
2093
2094     SDValue Arg = OutVals[realRVLocIdx];
2095
2096     switch (VA.getLocInfo()) {
2097     default: llvm_unreachable("Unknown loc info!");
2098     case CCValAssign::Full: break;
2099     case CCValAssign::BCvt:
2100       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2101       break;
2102     }
2103
2104     if (VA.needsCustom()) {
2105       if (VA.getLocVT() == MVT::v2f64) {
2106         // Extract the first half and return it in two registers.
2107         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2108                                    DAG.getConstant(0, MVT::i32));
2109         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2110                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
2111
2112         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
2113         Flag = Chain.getValue(1);
2114         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2115         VA = RVLocs[++i]; // skip ahead to next loc
2116         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2117                                  HalfGPRs.getValue(1), Flag);
2118         Flag = Chain.getValue(1);
2119         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2120         VA = RVLocs[++i]; // skip ahead to next loc
2121
2122         // Extract the 2nd half and fall through to handle it as an f64 value.
2123         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2124                           DAG.getConstant(1, MVT::i32));
2125       }
2126       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2127       // available.
2128       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2129                                   DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
2130       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
2131       Flag = Chain.getValue(1);
2132       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2133       VA = RVLocs[++i]; // skip ahead to next loc
2134       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
2135                                Flag);
2136     } else
2137       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2138
2139     // Guarantee that all emitted copies are
2140     // stuck together, avoiding something bad.
2141     Flag = Chain.getValue(1);
2142     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2143   }
2144
2145   // Update chain and glue.
2146   RetOps[0] = Chain;
2147   if (Flag.getNode())
2148     RetOps.push_back(Flag);
2149
2150   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
2151                      RetOps.data(), RetOps.size());
2152 }
2153
2154 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2155   if (N->getNumValues() != 1)
2156     return false;
2157   if (!N->hasNUsesOfValue(1, 0))
2158     return false;
2159
2160   SDValue TCChain = Chain;
2161   SDNode *Copy = *N->use_begin();
2162   if (Copy->getOpcode() == ISD::CopyToReg) {
2163     // If the copy has a glue operand, we conservatively assume it isn't safe to
2164     // perform a tail call.
2165     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2166       return false;
2167     TCChain = Copy->getOperand(0);
2168   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2169     SDNode *VMov = Copy;
2170     // f64 returned in a pair of GPRs.
2171     SmallPtrSet<SDNode*, 2> Copies;
2172     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2173          UI != UE; ++UI) {
2174       if (UI->getOpcode() != ISD::CopyToReg)
2175         return false;
2176       Copies.insert(*UI);
2177     }
2178     if (Copies.size() > 2)
2179       return false;
2180
2181     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2182          UI != UE; ++UI) {
2183       SDValue UseChain = UI->getOperand(0);
2184       if (Copies.count(UseChain.getNode()))
2185         // Second CopyToReg
2186         Copy = *UI;
2187       else
2188         // First CopyToReg
2189         TCChain = UseChain;
2190     }
2191   } else if (Copy->getOpcode() == ISD::BITCAST) {
2192     // f32 returned in a single GPR.
2193     if (!Copy->hasOneUse())
2194       return false;
2195     Copy = *Copy->use_begin();
2196     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2197       return false;
2198     TCChain = Copy->getOperand(0);
2199   } else {
2200     return false;
2201   }
2202
2203   bool HasRet = false;
2204   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2205        UI != UE; ++UI) {
2206     if (UI->getOpcode() != ARMISD::RET_FLAG)
2207       return false;
2208     HasRet = true;
2209   }
2210
2211   if (!HasRet)
2212     return false;
2213
2214   Chain = TCChain;
2215   return true;
2216 }
2217
2218 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2219   if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
2220     return false;
2221
2222   if (!CI->isTailCall())
2223     return false;
2224
2225   return !Subtarget->isThumb1Only();
2226 }
2227
2228 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2229 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2230 // one of the above mentioned nodes. It has to be wrapped because otherwise
2231 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2232 // be used to form addressing mode. These wrapped nodes will be selected
2233 // into MOVi.
2234 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2235   EVT PtrVT = Op.getValueType();
2236   // FIXME there is no actual debug info here
2237   SDLoc dl(Op);
2238   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2239   SDValue Res;
2240   if (CP->isMachineConstantPoolEntry())
2241     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2242                                     CP->getAlignment());
2243   else
2244     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2245                                     CP->getAlignment());
2246   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2247 }
2248
2249 unsigned ARMTargetLowering::getJumpTableEncoding() const {
2250   return MachineJumpTableInfo::EK_Inline;
2251 }
2252
2253 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2254                                              SelectionDAG &DAG) const {
2255   MachineFunction &MF = DAG.getMachineFunction();
2256   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2257   unsigned ARMPCLabelIndex = 0;
2258   SDLoc DL(Op);
2259   EVT PtrVT = getPointerTy();
2260   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2261   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2262   SDValue CPAddr;
2263   if (RelocM == Reloc::Static) {
2264     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2265   } else {
2266     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2267     ARMPCLabelIndex = AFI->createPICLabelUId();
2268     ARMConstantPoolValue *CPV =
2269       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2270                                       ARMCP::CPBlockAddress, PCAdj);
2271     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2272   }
2273   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2274   SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
2275                                MachinePointerInfo::getConstantPool(),
2276                                false, false, false, 0);
2277   if (RelocM == Reloc::Static)
2278     return Result;
2279   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2280   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2281 }
2282
2283 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
2284 SDValue
2285 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2286                                                  SelectionDAG &DAG) const {
2287   SDLoc dl(GA);
2288   EVT PtrVT = getPointerTy();
2289   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2290   MachineFunction &MF = DAG.getMachineFunction();
2291   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2292   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2293   ARMConstantPoolValue *CPV =
2294     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2295                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2296   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2297   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2298   Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
2299                          MachinePointerInfo::getConstantPool(),
2300                          false, false, false, 0);
2301   SDValue Chain = Argument.getValue(1);
2302
2303   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2304   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2305
2306   // call __tls_get_addr.
2307   ArgListTy Args;
2308   ArgListEntry Entry;
2309   Entry.Node = Argument;
2310   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2311   Args.push_back(Entry);
2312   // FIXME: is there useful debug info available here?
2313   TargetLowering::CallLoweringInfo CLI(Chain,
2314                 (Type *) Type::getInt32Ty(*DAG.getContext()),
2315                 false, false, false, false,
2316                 0, CallingConv::C, /*isTailCall=*/false,
2317                 /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
2318                 DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
2319   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2320   return CallResult.first;
2321 }
2322
2323 // Lower ISD::GlobalTLSAddress using the "initial exec" or
2324 // "local exec" model.
2325 SDValue
2326 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2327                                         SelectionDAG &DAG,
2328                                         TLSModel::Model model) const {
2329   const GlobalValue *GV = GA->getGlobal();
2330   SDLoc dl(GA);
2331   SDValue Offset;
2332   SDValue Chain = DAG.getEntryNode();
2333   EVT PtrVT = getPointerTy();
2334   // Get the Thread Pointer
2335   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2336
2337   if (model == TLSModel::InitialExec) {
2338     MachineFunction &MF = DAG.getMachineFunction();
2339     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2340     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2341     // Initial exec model.
2342     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2343     ARMConstantPoolValue *CPV =
2344       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2345                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2346                                       true);
2347     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2348     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2349     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2350                          MachinePointerInfo::getConstantPool(),
2351                          false, false, false, 0);
2352     Chain = Offset.getValue(1);
2353
2354     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2355     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2356
2357     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2358                          MachinePointerInfo::getConstantPool(),
2359                          false, false, false, 0);
2360   } else {
2361     // local exec model
2362     assert(model == TLSModel::LocalExec);
2363     ARMConstantPoolValue *CPV =
2364       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2365     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2366     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2367     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2368                          MachinePointerInfo::getConstantPool(),
2369                          false, false, false, 0);
2370   }
2371
2372   // The address of the thread local variable is the add of the thread
2373   // pointer with the offset of the variable.
2374   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2375 }
2376
2377 SDValue
2378 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2379   // TODO: implement the "local dynamic" model
2380   assert(Subtarget->isTargetELF() &&
2381          "TLS not implemented for non-ELF targets");
2382   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2383
2384   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2385
2386   switch (model) {
2387     case TLSModel::GeneralDynamic:
2388     case TLSModel::LocalDynamic:
2389       return LowerToTLSGeneralDynamicModel(GA, DAG);
2390     case TLSModel::InitialExec:
2391     case TLSModel::LocalExec:
2392       return LowerToTLSExecModels(GA, DAG, model);
2393   }
2394   llvm_unreachable("bogus TLS model");
2395 }
2396
2397 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
2398                                                  SelectionDAG &DAG) const {
2399   EVT PtrVT = getPointerTy();
2400   SDLoc dl(Op);
2401   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2402   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2403     bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
2404     ARMConstantPoolValue *CPV =
2405       ARMConstantPoolConstant::Create(GV,
2406                                       UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
2407     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2408     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2409     SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
2410                                  CPAddr,
2411                                  MachinePointerInfo::getConstantPool(),
2412                                  false, false, false, 0);
2413     SDValue Chain = Result.getValue(1);
2414     SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
2415     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
2416     if (!UseGOTOFF)
2417       Result = DAG.getLoad(PtrVT, dl, Chain, Result,
2418                            MachinePointerInfo::getGOT(),
2419                            false, false, false, 0);
2420     return Result;
2421   }
2422
2423   // If we have T2 ops, we can materialize the address directly via movt/movw
2424   // pair. This is always cheaper.
2425   if (Subtarget->useMovt()) {
2426     ++NumMovwMovt;
2427     // FIXME: Once remat is capable of dealing with instructions with register
2428     // operands, expand this into two nodes.
2429     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2430                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2431   } else {
2432     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2433     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2434     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2435                        MachinePointerInfo::getConstantPool(),
2436                        false, false, false, 0);
2437   }
2438 }
2439
2440 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
2441                                                     SelectionDAG &DAG) const {
2442   EVT PtrVT = getPointerTy();
2443   SDLoc dl(Op);
2444   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2445   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2446
2447   // FIXME: Enable this for static codegen when tool issues are fixed.  Also
2448   // update ARMFastISel::ARMMaterializeGV.
2449   if (Subtarget->useMovt() && RelocM != Reloc::Static) {
2450     ++NumMovwMovt;
2451     // FIXME: Once remat is capable of dealing with instructions with register
2452     // operands, expand this into two nodes.
2453     if (RelocM == Reloc::Static)
2454       return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2455                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2456
2457     unsigned Wrapper = (RelocM == Reloc::PIC_)
2458       ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
2459     SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
2460                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2461     if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2462       Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
2463                            MachinePointerInfo::getGOT(),
2464                            false, false, false, 0);
2465     return Result;
2466   }
2467
2468   unsigned ARMPCLabelIndex = 0;
2469   SDValue CPAddr;
2470   if (RelocM == Reloc::Static) {
2471     CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2472   } else {
2473     ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2474     ARMPCLabelIndex = AFI->createPICLabelUId();
2475     unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
2476     ARMConstantPoolValue *CPV =
2477       ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
2478                                       PCAdj);
2479     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2480   }
2481   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2482
2483   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2484                                MachinePointerInfo::getConstantPool(),
2485                                false, false, false, 0);
2486   SDValue Chain = Result.getValue(1);
2487
2488   if (RelocM == Reloc::PIC_) {
2489     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2490     Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2491   }
2492
2493   if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2494     Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
2495                          false, false, false, 0);
2496
2497   return Result;
2498 }
2499
2500 SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
2501                                                     SelectionDAG &DAG) const {
2502   assert(Subtarget->isTargetELF() &&
2503          "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
2504   MachineFunction &MF = DAG.getMachineFunction();
2505   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2506   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2507   EVT PtrVT = getPointerTy();
2508   SDLoc dl(Op);
2509   unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2510   ARMConstantPoolValue *CPV =
2511     ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
2512                                   ARMPCLabelIndex, PCAdj);
2513   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2514   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2515   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2516                                MachinePointerInfo::getConstantPool(),
2517                                false, false, false, 0);
2518   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2519   return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2520 }
2521
2522 SDValue
2523 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
2524   SDLoc dl(Op);
2525   SDValue Val = DAG.getConstant(0, MVT::i32);
2526   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
2527                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
2528                      Op.getOperand(1), Val);
2529 }
2530
2531 SDValue
2532 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
2533   SDLoc dl(Op);
2534   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
2535                      Op.getOperand(1), DAG.getConstant(0, MVT::i32));
2536 }
2537
2538 SDValue
2539 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
2540                                           const ARMSubtarget *Subtarget) const {
2541   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2542   SDLoc dl(Op);
2543   switch (IntNo) {
2544   default: return SDValue();    // Don't custom lower most intrinsics.
2545   case Intrinsic::arm_thread_pointer: {
2546     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2547     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2548   }
2549   case Intrinsic::eh_sjlj_lsda: {
2550     MachineFunction &MF = DAG.getMachineFunction();
2551     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2552     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2553     EVT PtrVT = getPointerTy();
2554     Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2555     SDValue CPAddr;
2556     unsigned PCAdj = (RelocM != Reloc::PIC_)
2557       ? 0 : (Subtarget->isThumb() ? 4 : 8);
2558     ARMConstantPoolValue *CPV =
2559       ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
2560                                       ARMCP::CPLSDA, PCAdj);
2561     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2562     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2563     SDValue Result =
2564       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2565                   MachinePointerInfo::getConstantPool(),
2566                   false, false, false, 0);
2567
2568     if (RelocM == Reloc::PIC_) {
2569       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2570       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2571     }
2572     return Result;
2573   }
2574   case Intrinsic::arm_neon_vmulls:
2575   case Intrinsic::arm_neon_vmullu: {
2576     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
2577       ? ARMISD::VMULLs : ARMISD::VMULLu;
2578     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
2579                        Op.getOperand(1), Op.getOperand(2));
2580   }
2581   }
2582 }
2583
2584 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
2585                                  const ARMSubtarget *Subtarget) {
2586   // FIXME: handle "fence singlethread" more efficiently.
2587   SDLoc dl(Op);
2588   if (!Subtarget->hasDataBarrier()) {
2589     // Some ARMv6 cpus can support data barriers with an mcr instruction.
2590     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
2591     // here.
2592     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
2593            "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
2594     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
2595                        DAG.getConstant(0, MVT::i32));
2596   }
2597
2598   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
2599   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
2600   unsigned Domain = ARM_MB::ISH;
2601   if (Subtarget->isSwift() && Ord == Release) {
2602     // Swift happens to implement ISHST barriers in a way that's compatible with
2603     // Release semantics but weaker than ISH so we'd be fools not to use
2604     // it. Beware: other processors probably don't!
2605     Domain = ARM_MB::ISHST;
2606   }
2607
2608   return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
2609                      DAG.getConstant(Domain, MVT::i32));
2610 }
2611
2612 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
2613                              const ARMSubtarget *Subtarget) {
2614   // ARM pre v5TE and Thumb1 does not have preload instructions.
2615   if (!(Subtarget->isThumb2() ||
2616         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
2617     // Just preserve the chain.
2618     return Op.getOperand(0);
2619
2620   SDLoc dl(Op);
2621   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
2622   if (!isRead &&
2623       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
2624     // ARMv7 with MP extension has PLDW.
2625     return Op.getOperand(0);
2626
2627   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2628   if (Subtarget->isThumb()) {
2629     // Invert the bits.
2630     isRead = ~isRead & 1;
2631     isData = ~isData & 1;
2632   }
2633
2634   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
2635                      Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
2636                      DAG.getConstant(isData, MVT::i32));
2637 }
2638
/// Lower a va_start node: store the address of the function's varargs frame
/// slot into the memory location given by operand 1 of Op.
///
/// Operand 0 is the incoming chain and operand 2 is the SrcValue node whose IR
/// value is attached to the store as its MachinePointerInfo. The store node
/// itself is returned.
2639 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
2640   MachineFunction &MF = DAG.getMachineFunction();
2641   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
2642
2643   // vastart just stores the address of the VarArgsFrameIndex slot into the
2644   // memory location argument.
2645   SDLoc dl(Op);
2646   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2647   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
2648   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2649   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
2650                       MachinePointerInfo(SV), false, false, 0);
2651 }
2652
/// Rebuild an f64 formal argument from the two i32 pieces described by VA and
/// NextVA.
///
/// VA must be a register location. NextVA is either a second register or a
/// 4-byte fixed stack slot. Both pieces are read using Root as the chain (Root
/// is not modified here) and are combined with ARMISD::VMOVDRR, which is the
/// returned value.
2653 SDValue
2654 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
2655                                         SDValue &Root, SelectionDAG &DAG,
2656                                         SDLoc dl) const {
2657   MachineFunction &MF = DAG.getMachineFunction();
2658   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2659
       // Thumb1-only functions use the tGPR register class, everything else
       // uses GPR.
2660   const TargetRegisterClass *RC;
2661   if (AFI->isThumb1OnlyFunction())
2662     RC = &ARM::tGPRRegClass;
2663   else
2664     RC = &ARM::GPRRegClass;
2665
2666   // Transform the arguments stored in physical registers into virtual ones.
2667   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2668   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2669
2670   SDValue ArgValue2;
2671   if (NextVA.isMemLoc()) {
         // The second piece was passed on the stack: describe it as a 4-byte
         // fixed object at the assigned offset.
2672     MachineFrameInfo *MFI = MF.getFrameInfo();
2673     int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
2674
2675     // Create load node to retrieve arguments from the stack.
2676     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2677     ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
2678                             MachinePointerInfo::getFixedStack(FI),
2679                             false, false, false, 0);
2680   } else {
         // The second piece is in a register as well.
2681     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2682     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2683   }
2684
2685   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
2686 }
2687
/// Compute the size of the stack area needed to spill the argument GPRs that
/// belong to one byval parameter (or to the variadic register tail).
///
/// If InRegsParamRecordIdx names an existing in-regs record of CCInfo, the
/// register count is taken from that record. Otherwise it is the number of
/// GPRArgRegs entries that are still unallocated.
///
/// Outputs:
///   ArgRegsSize     - bytes occupied by those registers (4 per register).
///   ArgRegsSaveSize - ArgRegsSize plus any padding needed so the register
///                     part ends on a stack-alignment boundary. Padding is
///                     only considered when the stack alignment is 8.
2688 void
2689 ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
2690                                   unsigned InRegsParamRecordIdx,
2691                                   unsigned ArgSize,
2692                                   unsigned &ArgRegsSize,
2693                                   unsigned &ArgRegsSaveSize)
2694   const {
2695   unsigned NumGPRs;
2696   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
2697     unsigned RBegin, REnd;
2698     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
2699     NumGPRs = REnd - RBegin;
2700   } else {
         // No record for this index: count the argument registers that nobody
         // has claimed yet (zero when the first unallocated index is past 3).
2701     unsigned int firstUnalloced;
2702     firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
2703                                                 sizeof(GPRArgRegs) /
2704                                                 sizeof(GPRArgRegs[0]));
2705     NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
2706   }
2707
2708   unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
2709   ArgRegsSize = NumGPRs * 4;
2710
2711   // If parameter is split between stack and GPRs...
2712   if (NumGPRs && Align == 8 &&
2713       (ArgRegsSize < ArgSize ||
2714         InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
2715     // Add padding for the part of the parameter recovered from GPRs, so
2716     // that its last byte ends up at address K*8 - 1.
2717     // This is needed because the remaining (stack) part of the parameter has
2718     // stack alignment, and the "GPRs head" must be attached to it without
2719     // gaps:
2720     // Stack:
2721     // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
2722     // [ [padding] [GPRs head] ] [        Tail passed via stack       ....
2723     //
2724     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
         // Round (this area + the save area accumulated so far) up to a
         // multiple of Align; the difference is the padding.
2725     unsigned Padding =
2726         ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) -
2727         (ArgRegsSize + AFI->getArgRegsSaveSize());
2728     ArgRegsSaveSize = ArgRegsSize + Padding;
2729   } else
2730     // We don't need to extend regs save size for byval parameters if they
2731     // are passed via GPRs only.
2732     ArgRegsSaveSize = ArgRegsSize;
2733 }
2734
2735 // The remaining GPRs hold either the beginning of variable-argument
2736 // data, or the beginning of an aggregate passed by value (usually
2737 // byval).  Either way, we allocate stack slots adjacent to the data
2738 // provided by our caller, and store the unallocated registers there.
2739 // If this is a variadic function, the va_list pointer will begin with
2740 // these values; otherwise, this reassembles a (byval) structure that
2741 // was split between registers and memory.
     //
     // Parameters:
     //   Chain                 - in/out; replaced by a TokenFactor of the
     //                           emitted stores when any store is created.
     //   OrigArg               - IR value used for the stores'
     //                           MachinePointerInfo (may be null).
     //   InRegsParamRecordIdx  - in-regs record to use; an index past the last
     //                           record selects all still-unallocated GPRs.
     //   OffsetFromOrigArg     - byte offset added to the pointer info.
     //   ArgOffset             - stack offset of the argument; the save-area
     //                           padding is added to it.
     //   ArgSize               - size forwarded to computeRegArea.
     //   ForceMutable          - only consulted when no register needs saving;
     //                           the placeholder object is then created with
     //                           !ForceMutable as its last argument.
2742 // Return: The frame index registers were stored into.
2743 int
2744 ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
2745                                   SDLoc dl, SDValue &Chain,
2746                                   const Value *OrigArg,
2747                                   unsigned InRegsParamRecordIdx,
2748                                   unsigned OffsetFromOrigArg,
2749                                   unsigned ArgOffset,
2750                                   unsigned ArgSize,
2751                                   bool ForceMutable) const {
2752
2753   // Currently, two use-cases are possible:
2754   // Case #1. Non var-args function, and we meet the first byval parameter.
2755   //          Set up the first unallocated register as first byval register;
2756   //          eat all remaining registers
2757   //          (these two actions are performed by HandleByVal method).
2758   //          Then, here, we initialize stack frame with
2759   //          "store-reg" instructions.
2760   // Case #2. Var-args function, that doesn't contain byval parameters.
2761   //          The same: eat all remaining unallocated registers,
2762   //          initialize stack frame.
2763
2764   MachineFunction &MF = DAG.getMachineFunction();
2765   MachineFrameInfo *MFI = MF.getFrameInfo();
2766   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
       // [firstRegToSaveIndex, lastRegToSaveIndex) indexes into GPRArgRegs.
2767   unsigned firstRegToSaveIndex, lastRegToSaveIndex;
2768   unsigned RBegin, REnd;
2769   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
2770     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
         // Convert register numbers to indices relative to R0.
2771     firstRegToSaveIndex = RBegin - ARM::R0;
2772     lastRegToSaveIndex = REnd - ARM::R0;
2773   } else {
2774     firstRegToSaveIndex = CCInfo.getFirstUnallocated
2775       (GPRArgRegs, array_lengthof(GPRArgRegs));
2776     lastRegToSaveIndex = 4;
2777   }
2778
2779   unsigned ArgRegsSize, ArgRegsSaveSize;
2780   computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
2781                  ArgRegsSize, ArgRegsSaveSize);
2782
2783   // Store any by-val regs to their spots on the stack so that they may be
2784   // loaded by dereferencing the result of formal parameter pointer or va_next.
2785   // Note: once stack area for byval/varargs registers
2786   // was initialized, it can't be initialized again.
2787   if (ArgRegsSaveSize) {
2788
2789     unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
2790
2791     if (Padding) {
2792       assert(AFI->getStoredByValParamsPadding() == 0 &&
2793              "The only parameter may be padded.");
2794       AFI->setStoredByValParamsPadding(Padding);
2795     }
2796
2797     int FrameIndex = MFI->CreateFixedObject(
2798                       ArgRegsSaveSize,
2799                       Padding + ArgOffset,
2800                       false);
2801     SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
2802
         // One i32 store per register; FIN advances by 4 bytes after each one
         // and i tracks the offset recorded in the pointer info.
2803     SmallVector<SDValue, 4> MemOps;
2804     for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
2805          ++firstRegToSaveIndex, ++i) {
2806       const TargetRegisterClass *RC;
2807       if (AFI->isThumb1OnlyFunction())
2808         RC = &ARM::tGPRRegClass;
2809       else
2810         RC = &ARM::GPRRegClass;
2811
2812       unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
2813       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
2814       SDValue Store =
2815         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2816                      MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
2817                      false, false, 0);
2818       MemOps.push_back(Store);
2819       FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
2820                         DAG.getConstant(4, getPointerTy()));
2821     }
2822
         // Accumulate the total save-area size; computeRegArea reads it when
         // computing padding.
2823     AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
2824
2825     if (!MemOps.empty())
2826       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2827                           &MemOps[0], MemOps.size());
2828     return FrameIndex;
2829   } else
2830     // This will point to the next argument passed via stack.
2831     return MFI->CreateFixedObject(
2832         4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable);
2833 }
2834
2835 // Set up the stack frame that the va_list pointer will start from.
     //
     // Delegates to StoreByValRegs with a record index equal to the number of
     // in-regs records (i.e. past the last one), a null original argument and
     // zero offset/size, then records the resulting frame index as the
     // function's VarArgsFrameIndex. Chain is forwarded by reference to
     // StoreByValRegs.
2836 void
2837 ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
2838                                         SDLoc dl, SDValue &Chain,
2839                                         unsigned ArgOffset,
2840                                         bool ForceMutable) const {
2841   MachineFunction &MF = DAG.getMachineFunction();
2842   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2843
2844   // Try to store any remaining integer argument regs
2845   // to their spots on the stack so that they may be loaded by dereferencing
2846   // the result of va_next.
2847   // If there are no regs to be stored, just point to the address after the
2848   // last argument passed via stack.
2849   int FrameIndex =
2850     StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
2851                    0, ArgOffset, 0, ForceMutable);
2852
2853   AFI->setVarArgsFrameIndex(FrameIndex);
2854 }
2855
/// Lower the incoming (formal) arguments of the current function.
///
/// Argument locations are assigned with AnalyzeFormalArguments, then one DAG
/// value per entry of Ins is appended to InVals:
///   - register locations become CopyFromReg nodes (custom locations are
///     rebuilt from i32 pieces via GetF64FormalArgument),
///   - byval stack locations become the frame index returned by
///     StoreByValRegs,
///   - other stack locations become loads from fixed stack objects.
/// For variadic functions VarArgStyleRegisters is called last. The returned
/// chain is the incoming Chain, possibly updated by StoreByValRegs /
/// VarArgStyleRegisters, which take it by reference.
2856 SDValue
2857 ARMTargetLowering::LowerFormalArguments(SDValue Chain,
2858                                         CallingConv::ID CallConv, bool isVarArg,
2859                                         const SmallVectorImpl<ISD::InputArg>
2860                                           &Ins,
2861                                         SDLoc dl, SelectionDAG &DAG,
2862                                         SmallVectorImpl<SDValue> &InVals)
2863                                           const {
2864   MachineFunction &MF = DAG.getMachineFunction();
2865   MachineFrameInfo *MFI = MF.getFrameInfo();
2866
2867   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2868
2869   // Assign locations to all of the incoming arguments.
2870   SmallVector<CCValAssign, 16> ArgLocs;
2871   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2872                     getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
2873   CCInfo.AnalyzeFormalArguments(Ins,
2874                                 CCAssignFnForNode(CallConv, /* Return*/ false,
2875                                                   isVarArg));
2876
2877   SmallVector<SDValue, 16> ArgValues;
2878   int lastInsIndex = -1;
2879   SDValue ArgValue;
       // CurOrigArg / CurArgIdx walk the IR-level argument list in step with
       // the OrigArgIndex of the Ins entry being processed.
2880   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
2881   unsigned CurArgIdx = 0;
2882
2883   // Initially ArgRegsSaveSize is zero.
2884   // Then we increase this value each time we meet byval parameter.
2885   // We also increase this value in case of varargs function.
2886   AFI->setArgRegsSaveSize(0);
2887
2888   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2889     CCValAssign &VA = ArgLocs[i];
2890     std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
2891     CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
2892     // Arguments stored in registers.
2893     if (VA.isRegLoc()) {
2894       EVT RegVT = VA.getLocVT();
2895
2896       if (VA.needsCustom()) {
2897         // f64 and vector types are split up into multiple registers or
2898         // combinations of registers and stack slots.
2899         if (VA.getLocVT() == MVT::v2f64) {
               // A v2f64 consumes up to four consecutive ArgLocs entries: two
               // for each f64 half.
2900           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
2901                                                    Chain, DAG, dl);
               // NOTE(review): VA is a reference bound to the ArgLocs element
               // chosen at the top of this iteration, so this assignment
               // copies the next location into that element; it does not
               // rebind VA.
2902           VA = ArgLocs[++i]; // skip ahead to next loc
2903           SDValue ArgValue2;
2904           if (VA.isMemLoc()) {
                 // The second half was passed entirely on the stack.
2905             int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
2906             SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2907             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
2908                                     MachinePointerInfo::getFixedStack(FI),
2909                                     false, false, false, 0);
2910           } else {
2911             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
2912                                              Chain, DAG, dl);
2913           }
               // Assemble the vector from the two f64 halves.
2914           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2915           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
2916                                  ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
2917           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
2918                                  ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
2919         } else
2920           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
2921
2922       } else {
2923         const TargetRegisterClass *RC;
2924
             // Pick the register class that matches the location type.
2925         if (RegVT == MVT::f32)
2926           RC = &ARM::SPRRegClass;
2927         else if (RegVT == MVT::f64)
2928           RC = &ARM::DPRRegClass;
2929         else if (RegVT == MVT::v2f64)
2930           RC = &ARM::QPRRegClass;
2931         else if (RegVT == MVT::i32)
2932           RC = AFI->isThumb1OnlyFunction() ?
2933             (const TargetRegisterClass*)&ARM::tGPRRegClass :
2934             (const TargetRegisterClass*)&ARM::GPRRegClass;
2935         else
2936           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2937
2938         // Transform the arguments in physical registers into virtual ones.
2939         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2940         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2941       }
2942
2943       // If this is an 8 or 16-bit value, it is really passed promoted
2944       // to 32 bits.  Insert an assert[sz]ext to capture this, then
2945       // truncate to the right size.
2946       switch (VA.getLocInfo()) {
2947       default: llvm_unreachable("Unknown loc info!");
2948       case CCValAssign::Full: break;
2949       case CCValAssign::BCvt:
2950         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2951         break;
2952       case CCValAssign::SExt:
2953         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2954                                DAG.getValueType(VA.getValVT()));
2955         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2956         break;
2957       case CCValAssign::ZExt:
2958         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2959                                DAG.getValueType(VA.getValVT()));
2960         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2961         break;
2962       }
2963
2964       InVals.push_back(ArgValue);
2965
2966     } else { // VA.isRegLoc()
2967
2968       // sanity check
2969       assert(VA.isMemLoc());
2970       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
2971
2972       int index = ArgLocs[i].getValNo();
2973
2974       // Some Ins[] entries become multiple ArgLoc[] entries.
2975       // Process them only once.
2976       if (index != lastInsIndex)
2977         {
2978           ISD::ArgFlagsTy Flags = Ins[index].Flags;
2979           // FIXME: For now, all byval parameter objects are marked mutable.
2980           // This can be changed with more analysis.
2981           // In case of tail call optimization mark all arguments mutable.
2982           // Since they could be overwritten by lowering of arguments in case of
2983           // a tail call.
2984           if (Flags.isByVal()) {
                 // Spill the register part (if any) next to the stack part and
                 // hand the callee a pointer to the reassembled aggregate.
2985             unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
2986             int FrameIndex = StoreByValRegs(
2987                 CCInfo, DAG, dl, Chain, CurOrigArg,
2988                 CurByValIndex,
2989                 Ins[VA.getValNo()].PartOffset,
2990                 VA.getLocMemOffset(),
2991                 Flags.getByValSize(),
2992                 true /*force mutable frames*/);
2993             InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
2994             CCInfo.nextInRegsParam();
2995           } else {
                 // Plain stack argument; its offset is shifted by whatever
                 // padding StoreByValRegs recorded for a byval save area.
2996             unsigned FIOffset = VA.getLocMemOffset() +
2997                                 AFI->getStoredByValParamsPadding();
2998             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
2999                                             FIOffset, true);
3000
3001             // Create load nodes to retrieve arguments from the stack.
3002             SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
3003             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
3004                                          MachinePointerInfo::getFixedStack(FI),
3005                                          false, false, false, 0));
3006           }
3007           lastInsIndex = index;
3008         }
3009     }
3010   }
3011
3012   // varargs
3013   if (isVarArg)
3014     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3015                          CCInfo.getNextStackOffset());
3016
3017   return Chain;
3018 }
3019
3020 /// isFloatingPointZero - Return true if this is +0.0.
/// Two forms are recognised: a ConstantFPSDNode holding +0.0, and a load
/// (extending or not) whose address operand is an ARMISD::Wrapper around a
/// constant-pool entry holding a ConstantFP +0.0. Anything else, including
/// -0.0, yields false.
3021 static bool isFloatingPointZero(SDValue Op) {
3022   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3023     return CFP->getValueAPF().isPosZero();
3024   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3025     // Maybe this has already been legalized into the constant pool?
3026     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3027       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3028       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3029         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3030           return CFP->getValueAPF().isPosZero();
3031     }
3032   }
3033   return false;
3034 }
3035
3036 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
3037 /// the given operands.
/// If RHS is a constant that is not a legal compare immediate, the constant
/// is nudged by one and the condition adjusted to an equivalent form (e.g.
/// "x < C" becomes "x <= C-1") when that makes the immediate legal. The ARM
/// condition code is returned through ARMcc as an i32 constant. The returned
/// node is glue-typed: ARMISD::CMPZ for EQ/NE, ARMISD::CMP otherwise.
3038 SDValue
3039 ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3040                              SDValue &ARMcc, SelectionDAG &DAG,
3041                              SDLoc dl) const {
3042   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3043     unsigned C = RHSC->getZExtValue();
3044     if (!isLegalICmpImmediate(C)) {
3045       // Constant does not fit, try adjusting it by one?
           // Each case first rules out the value at which C-1 / C+1 would
           // wrap around for that signedness.
3046       switch (CC) {
3047       default: break;
3048       case ISD::SETLT:
3049       case ISD::SETGE:
3050         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
3051           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3052           RHS = DAG.getConstant(C-1, MVT::i32);
3053         }
3054         break;
3055       case ISD::SETULT:
3056       case ISD::SETUGE:
3057         if (C != 0 && isLegalICmpImmediate(C-1)) {
3058           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3059           RHS = DAG.getConstant(C-1, MVT::i32);
3060         }
3061         break;
3062       case ISD::SETLE:
3063       case ISD::SETGT:
3064         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
3065           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3066           RHS = DAG.getConstant(C+1, MVT::i32);
3067         }
3068         break;
3069       case ISD::SETULE:
3070       case ISD::SETUGT:
3071         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
3072           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3073           RHS = DAG.getConstant(C+1, MVT::i32);
3074         }
3075         break;
3076       }
3077     }
3078   }
3079
3080   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3081   ARMISD::NodeType CompareType;
3082   switch (CondCode) {
3083   default:
3084     CompareType = ARMISD::CMP;
3085     break;
3086   case ARMCC::EQ:
3087   case ARMCC::NE:
3088     // Uses only Z Flag
3089     CompareType = ARMISD::CMPZ;
3090     break;
3091   }
3092   ARMcc = DAG.getConstant(CondCode, MVT::i32);
3093   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
3094 }
3095
3096 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
/// When RHS is +0.0 (per isFloatingPointZero) the single-operand
/// ARMISD::CMPFPw0 form is used; otherwise ARMISD::CMPFP compares LHS with
/// RHS. Either way the compare is wrapped in an ARMISD::FMSTAT node, which is
/// the glue value returned.
3097 SDValue
3098 ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
3099                              SDLoc dl) const {
3100   SDValue Cmp;
3101   if (!isFloatingPointZero(RHS))
3102     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3103   else
3104     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3105   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3106 }
3107
3108 /// duplicateCmp - Glue values can have only one use, so this function
3109 /// duplicates a comparison node.
/// Cmp must be an ARMISD::CMP or ARMISD::CMPZ node, or an ARMISD::FMSTAT whose
/// operand is ARMISD::CMPFP or ARMISD::CMPFPw0 (asserted). A new node with the
/// same opcode and operands is created; for the FMSTAT form both the inner
/// compare and the FMSTAT wrapper are re-created.
3110 SDValue
3111 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3112   unsigned Opc = Cmp.getOpcode();
3113   SDLoc DL(Cmp);
3114   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3115     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3116
3117   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
       // Step through the FMSTAT to the floating-point compare it wraps.
3118   Cmp = Cmp.getOperand(0);
3119   Opc = Cmp.getOpcode();
3120   if (Opc == ARMISD::CMPFP)
3121     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3122   else {
3123     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3124     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3125   }
3126   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3127 }
3128
/// Lower a select node whose operands are (condition, true value, false
/// value).
///
/// If the condition is a single-use ARMISD::CMOV choosing between the
/// constants 1 and 0, the select is folded into a new CMOV on the same
/// condition code, using a duplicate of the original compare. Otherwise the
/// condition is masked to its low bit and the select is emitted as a
/// select_cc testing "(Cond & 1) != 0".
3129 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
3130   SDValue Cond = Op.getOperand(0);
3131   SDValue SelectTrue = Op.getOperand(1);
3132   SDValue SelectFalse = Op.getOperand(2);
3133   SDLoc dl(Op);
3134
3135   // Convert:
3136   //
3137   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
3138   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
3139   //
3140   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
3141     const ConstantSDNode *CMOVTrue =
3142       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
3143     const ConstantSDNode *CMOVFalse =
3144       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3145
3146     if (CMOVTrue && CMOVFalse) {
3147       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
3148       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
3149
           // True/False stay null unless the constants are exactly (1, 0) or
           // (0, 1); the check below relies on that.
3150       SDValue True;
3151       SDValue False;
3152       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
3153         True = SelectTrue;
3154         False = SelectFalse;
3155       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
3156         True = SelectFalse;
3157         False = SelectTrue;
3158       }
3159
3160       if (True.getNode() && False.getNode()) {
3161         EVT VT = Op.getValueType();
             // Reuse the condition code and CCR operand of the inner CMOV; the
             // compare is duplicated rather than shared.
3162         SDValue ARMcc = Cond.getOperand(2);
3163         SDValue CCR = Cond.getOperand(3);
3164         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
3165         assert(True.getValueType() == VT);
3166         return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
3167       }
3168     }
3169   }
3170
3171   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
3172   // undefined bits before doing a full-word comparison with zero.
3173   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
3174                      DAG.getConstant(1, Cond.getValueType()));
3175
3176   return DAG.getSelectCC(dl, Cond,
3177                          DAG.getConstant(0, Cond.getValueType()),
3178                          SelectTrue, SelectFalse, ISD::SETNE);
3179 }
3180
/// getInverseCCForVSEL - Return the logical negation of the integer
/// condition code CC.
///
/// The caller visible in this part of the file (LowerSELECT_CC, i32 compare
/// path) pairs this with a swap of the select's true/false values only; the
/// compare operands stay where they are. That rewrite is only valid when the
/// new condition is the true inverse of the old one (SETLT -> SETGE,
/// SETLE -> SETGT). The operand-swapped condition (SETLT -> SETGT) selects the
/// wrong value when both compare operands are equal.
3181 static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
3182   if (CC == ISD::SETNE)
3183     return ISD::SETEQ;
       // The condition comes from an integer compare, hence isInteger = true.
3184   return ISD::getSetCCInverse(CC, /*isInteger=*/true);
3185 }
3186
/// Work out how a floating-point select on condition CC can be expressed with
/// one of the condition codes GE, GT, VS or EQ.
///
/// CondCode is overwritten only for the condition codes handled below; for
/// any other CC it keeps the value the caller passed in. swpCmpOps and
/// swpVselOps are only ever set or toggled here, never cleared, so the caller
/// is expected to initialise both to false. On return, swpCmpOps asks the
/// caller to swap the compare operands and swpVselOps asks it to swap the
/// select's true/false values.
3187 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
3188                                  bool &swpCmpOps, bool &swpVselOps) {
3189   // Start by selecting the GE condition code for opcodes that return true for
3190   // 'equality'
3191   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
3192       CC == ISD::SETULE)
3193     CondCode = ARMCC::GE;
3194
3195   // and GT for opcodes that return false for 'equality'.
3196   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
3197            CC == ISD::SETULT)
3198     CondCode = ARMCC::GT;
3199
3200   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
3201   // to swap the compare operands.
3202   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
3203       CC == ISD::SETULT)
3204     swpCmpOps = true;
3205
3206   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
3207   // If we have an unordered opcode, we need to swap the operands to the VSEL
3208   // instruction (effectively negating the condition).
3209   //
3210   // This also has the effect of swapping which one of 'less' or 'greater'
3211   // returns true, so we also swap the compare operands. It also switches
3212   // whether we return true for 'equality', so we compensate by picking the
3213   // opposite condition code to our original choice.
3214   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
3215       CC == ISD::SETUGT) {
3216     swpCmpOps = !swpCmpOps;
3217     swpVselOps = !swpVselOps;
3218     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
3219   }
3220
3221   // 'ordered' is 'anything but unordered', so use the VS condition code and
3222   // swap the VSEL operands.
3223   if (CC == ISD::SETO) {
3224     CondCode = ARMCC::VS;
3225     swpVselOps = true;
3226   }
3227
3228   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
3229   // code and swap the VSEL operands.
3230   if (CC == ISD::SETUNE) {
3231     CondCode = ARMCC::EQ;
3232     swpVselOps = true;
3233   }
3234 }
3235
/// Lower a select_cc node with operands (LHS, RHS, true value, false value,
/// condition code) to ARMISD::CMOV.
///
/// Integer (i32) compares use getARMCmp. Every other compare type is treated
/// as floating point: it uses getVFPCmp, and emits a second CMOV when
/// FPCCToARMCC reports that two ARM condition codes are needed. On subtargets
/// with v8 FP and an f32/f64 result, the condition is first massaged so that
/// it uses one of the codes available to VSEL (GE, GT, VS, EQ). Every CMOV
/// built here takes its operands in the order (false value, true value,
/// condition code, CCR, compare).
3236 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
3237   EVT VT = Op.getValueType();
3238   SDValue LHS = Op.getOperand(0);
3239   SDValue RHS = Op.getOperand(1);
3240   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3241   SDValue TrueVal = Op.getOperand(2);
3242   SDValue FalseVal = Op.getOperand(3);
3243   SDLoc dl(Op);
3244
3245   if (LHS.getValueType() == MVT::i32) {
3246     // Try to generate VSEL on ARMv8.
3247     // The VSEL instruction can't use all the usual ARM condition
3248     // codes: it only has two bits to select the condition code, so it's
3249     // constrained to use only GE, GT, VS and EQ.
3250     //
3251     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
3252     // swap the operands of the previous compare instruction (effectively
3253     // inverting the compare condition, swapping 'less' and 'greater') and
3254     // sometimes need to swap the operands to the VSEL (which inverts the
3255     // condition in the sense of firing whenever the previous condition didn't)
3256     if (getSubtarget()->hasV8FP() && (TrueVal.getValueType() == MVT::f32 ||
3257                                       TrueVal.getValueType() == MVT::f64)) {
3258       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3259       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
3260           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
             // NOTE(review): only the select values are swapped here (LHS/RHS
             // stay in place), so this is correct only if getInverseCCForVSEL
             // yields the logical inverse of CC.
3261         CC = getInverseCCForVSEL(CC);
3262         std::swap(TrueVal, FalseVal);
3263       }
3264     }
3265
3266     SDValue ARMcc;
3267     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3268     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3269     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
3270                        Cmp);
3271   }
3272
       // Non-i32 compare: handled as floating point. CondCode2 is ARMCC::AL
       // unless a second condition is required.
3273   ARMCC::CondCodes CondCode, CondCode2;
3274   FPCCToARMCC(CC, CondCode, CondCode2);
3275
3276   // Try to generate VSEL on ARMv8.
3277   if (getSubtarget()->hasV8FP() && (TrueVal.getValueType() == MVT::f32 ||
3278                                     TrueVal.getValueType() == MVT::f64)) {
3279     bool swpCmpOps = false;
3280     bool swpVselOps = false;
3281     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
3282
         // Apply the requested swaps only when the resulting condition is one
         // that VSEL can encode.
3283     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
3284         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
3285       if (swpCmpOps)
3286         std::swap(LHS, RHS);
3287       if (swpVselOps)
3288         std::swap(TrueVal, FalseVal);
3289     }
3290   }
3291
3292   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
3293   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3294   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3295   SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
3296                                ARMcc, CCR, Cmp);
3297   if (CondCode2 != ARMCC::AL) {
         // Second condition: chain another CMOV whose false value is the
         // result of the first one.
3298     SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
3299     // FIXME: Needs another CMP because flag can have but one use.
3300     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
3301     Result = DAG.getNode(ARMISD::CMOV, dl, VT,
3302                          Result, TrueVal, ARMcc2, CCR, Cmp2);
3303   }
3304   return Result;
3305 }
3306
3307 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
3308 /// to morph to an integer compare sequence.
/// The operand qualifies when its node has a single use and it is either +0.0
/// or a normal load. Non-f32 operands additionally require
/// Subtarget->isFPBrccSlow(). SeenZero is set to true when the operand is
/// +0.0 and is left untouched otherwise.
3309 static bool canChangeToInt(SDValue Op, bool &SeenZero,
3310                            const ARMSubtarget *Subtarget) {
3311   SDNode *N = Op.getNode();
3312   if (!N->hasOneUse())
3313     // Otherwise it requires moving the value from fp to integer registers.
3314     return false;
3315   if (!N->getNumValues())
3316     return false;
3317   EVT VT = Op.getValueType();
3318   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
3319     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
3320     // vmrs are very slow, e.g. cortex-a8.
3321     return false;
3322
3323   if (isFloatingPointZero(Op)) {
3324     SeenZero = true;
3325     return true;
3326   }
3327   return ISD::isNormalLoad(N);
3328 }
3329
/// Produce an i32 value for an f32 compare operand without going through an
/// FP register.
///
/// +0.0 becomes the i32 constant 0. A load is re-issued as an i32 load with
/// the same chain, base pointer, pointer info, flags and alignment. Any other
/// operand is a programming error (llvm_unreachable).
3330 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
3331   if (isFloatingPointZero(Op))
3332     return DAG.getConstant(0, MVT::i32);
3333
3334   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
3335     return DAG.getLoad(MVT::i32, SDLoc(Op),
3336                        Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
3337                        Ld->isVolatile(), Ld->isNonTemporal(),
3338                        Ld->isInvariant(), Ld->getAlignment());
3339
3340   llvm_unreachable("Unknown VFP cmp argument!");
3341 }
3342
/// Split an f64 compare operand into two i32 values without going through an
/// FP register.
///
/// +0.0 yields two zero constants. A load is replaced by two i32 loads on the
/// same chain: RetVal1 from the original base pointer and RetVal2 from
/// base + 4 (which of the two is the high word depends on the memory layout
/// of the f64 -- not established here). Any other operand is a programming
/// error (llvm_unreachable).
3343 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
3344                            SDValue &RetVal1, SDValue &RetVal2) {
3345   if (isFloatingPointZero(Op)) {
3346     RetVal1 = DAG.getConstant(0, MVT::i32);
3347     RetVal2 = DAG.getConstant(0, MVT::i32);
3348     return;
3349   }
3350
3351   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
3352     SDValue Ptr = Ld->getBasePtr();
3353     RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
3354                           Ld->getChain(), Ptr,
3355                           Ld->getPointerInfo(),
3356                           Ld->isVolatile(), Ld->isNonTemporal(),
3357                           Ld->isInvariant(), Ld->getAlignment());
3358
3359     EVT PtrType = Ptr.getValueType();
         // The second word sits 4 bytes in, so its alignment is capped at 4.
3360     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
3361     SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
3362                                  PtrType, Ptr, DAG.getConstant(4, PtrType));
3363     RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
3364                           Ld->getChain(), NewPtr,
3365                           Ld->getPointerInfo().getWithOffset(4),
3366                           Ld->isVolatile(), Ld->isNonTemporal(),
3367                           Ld->isInvariant(), NewAlign);
3368     return;
3369   }
3370
3371   llvm_unreachable("Unknown VFP cmp argument!");
3372 }
3373
3374 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
3375 /// f32 and even f64 comparisons to integer ones.
3376 SDValue
3377 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
3378   SDValue Chain = Op.getOperand(0);
3379   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3380   SDValue LHS = Op.getOperand(2);
3381   SDValue RHS = Op.getOperand(3);
3382   SDValue Dest = Op.getOperand(4);
3383   SDLoc dl(Op);
3384
3385   bool LHSSeenZero = false;
3386   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
3387   bool RHSSeenZero = false;
3388   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
3389   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
3390     // If unsafe fp math optimization is enabled and there are no other uses of
3391     // the CMP operands, and the condition code is EQ or NE, we can optimize it
3392     // to an integer comparison.
3393     if (CC == ISD::SETOEQ)
3394       CC = ISD::SETEQ;
3395     else if (CC == ISD::SETUNE)
3396       CC = ISD::SETNE;
3397
3398     SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
3399     SDValue ARMcc;
3400     if (LHS.getValueType() == MVT::f32) {
3401       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3402                         bitcastf32Toi32(LHS, DAG), Mask);
3403       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3404                         bitcastf32Toi32(RHS, DAG), Mask);
3405       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3406       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3407       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3408                          Chain, Dest, ARMcc, CCR, Cmp);
3409     }
3410
3411     SDValue LHS1, LHS2;
3412     SDValue RHS1, RHS2;
3413     expandf64Toi32(LHS, DAG, LHS1, LHS2);
3414     expandf64Toi32(RHS, DAG, RHS1, RHS2);
3415     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
3416     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
3417     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3418     ARMcc = DAG.getConstant(CondCode, MVT::i32);
3419     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3420     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
3421     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
3422   }
3423
3424   return SDValue();
3425 }
3426
3427 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3428   SDValue Chain = Op.getOperand(0);
3429   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3430   SDValue LHS = Op.getOperand(2);
3431   SDValue RHS = Op.getOperand(3);
3432   SDValue Dest = Op.getOperand(4);
3433   SDLoc dl(Op);
3434
3435   if (LHS.getValueType() == MVT::i32) {
3436     SDValue ARMcc;
3437     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3438     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3439     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3440                        Chain, Dest, ARMcc, CCR, Cmp);
3441   }
3442
3443   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3444
3445   if (getTargetMachine().Options.UnsafeFPMath &&
3446       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
3447        CC == ISD::SETNE || CC == ISD::SETUNE)) {
3448     SDValue Result = OptimizeVFPBrcond(Op, DAG);
3449     if (Result.getNode())
3450       return Result;
3451   }
3452
3453   ARMCC::CondCodes CondCode, CondCode2;
3454   FPCCToARMCC(CC, CondCode, CondCode2);
3455
3456   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
3457   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3458   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3459   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3460   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
3461   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
3462   if (CondCode2 != ARMCC::AL) {
3463     ARMcc = DAG.getConstant(CondCode2, MVT::i32);
3464     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
3465     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
3466   }
3467   return Res;
3468 }
3469
3470 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3471   SDValue Chain = Op.getOperand(0);
3472   SDValue Table = Op.getOperand(1);
3473   SDValue Index = Op.getOperand(2);
3474   SDLoc dl(Op);
3475
3476   EVT PTy = getPointerTy();
3477   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
3478   ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3479   SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
3480   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
3481   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
3482   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
3483   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
3484   if (Subtarget->isThumb2()) {
3485     // Thumb2 uses a two-level jump. That is, it jumps into the jump table
3486     // which does another jump to the destination. This also makes it easier
3487     // to translate it to TBB / TBH later.
3488     // FIXME: This might not work if the function is extremely large.
3489     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
3490                        Addr, Op.getOperand(2), JTI, UId);
3491   }
3492   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
3493     Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
3494                        MachinePointerInfo::getJumpTable(),
3495                        false, false, false, 0);
3496     Chain = Addr.getValue(1);
3497     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
3498     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
3499   } else {
3500     Addr = DAG.getLoad(PTy, dl, Chain, Addr,
3501                        MachinePointerInfo::getJumpTable(),
3502                        false, false, false, 0);
3503     Chain = Addr.getValue(1);
3504     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
3505   }
3506 }
3507
3508 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3509   EVT VT = Op.getValueType();
3510   SDLoc dl(Op);
3511
3512   if (Op.getValueType().getVectorElementType() == MVT::i32) {
3513     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
3514       return Op;
3515     return DAG.UnrollVectorOp(Op.getNode());
3516   }
3517
3518   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
3519          "Invalid type for custom lowering!");
3520   if (VT != MVT::v4i16)
3521     return DAG.UnrollVectorOp(Op.getNode());
3522
3523   Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
3524   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
3525 }
3526
3527 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3528   EVT VT = Op.getValueType();
3529   if (VT.isVector())
3530     return LowerVectorFP_TO_INT(Op, DAG);
3531
3532   SDLoc dl(Op);
3533   unsigned Opc;
3534
3535   switch (Op.getOpcode()) {
3536   default: llvm_unreachable("Invalid opcode!");
3537   case ISD::FP_TO_SINT:
3538     Opc = ARMISD::FTOSI;
3539     break;
3540   case ISD::FP_TO_UINT:
3541     Opc = ARMISD::FTOUI;
3542     break;
3543   }
3544   Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
3545   return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
3546 }
3547
3548 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
3549   EVT VT = Op.getValueType();
3550   SDLoc dl(Op);
3551
3552   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
3553     if (VT.getVectorElementType() == MVT::f32)
3554       return Op;
3555     return DAG.UnrollVectorOp(Op.getNode());
3556   }
3557
3558   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
3559          "Invalid type for custom lowering!");
3560   if (VT != MVT::v4f32)
3561     return DAG.UnrollVectorOp(Op.getNode());
3562
3563   unsigned CastOpc;
3564   unsigned Opc;
3565   switch (Op.getOpcode()) {
3566   default: llvm_unreachable("Invalid opcode!");
3567   case ISD::SINT_TO_FP:
3568     CastOpc = ISD::SIGN_EXTEND;
3569     Opc = ISD::SINT_TO_FP;
3570     break;
3571   case ISD::UINT_TO_FP:
3572     CastOpc = ISD::ZERO_EXTEND;
3573     Opc = ISD::UINT_TO_FP;
3574     break;
3575   }
3576
3577   Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
3578   return DAG.getNode(Opc, dl, VT, Op);
3579 }
3580
3581 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
3582   EVT VT = Op.getValueType();
3583   if (VT.isVector())
3584     return LowerVectorINT_TO_FP(Op, DAG);
3585
3586   SDLoc dl(Op);
3587   unsigned Opc;
3588
3589   switch (Op.getOpcode()) {
3590   default: llvm_unreachable("Invalid opcode!");
3591   case ISD::SINT_TO_FP:
3592     Opc = ARMISD::SITOF;
3593     break;
3594   case ISD::UINT_TO_FP:
3595     Opc = ARMISD::UITOF;
3596     break;
3597   }
3598
3599   Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
3600   return DAG.getNode(Opc, dl, VT, Op);
3601 }
3602
3603 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
3604   // Implement fcopysign with a fabs and a conditional fneg.
3605   SDValue Tmp0 = Op.getOperand(0);
3606   SDValue Tmp1 = Op.getOperand(1);
3607   SDLoc dl(Op);
3608   EVT VT = Op.getValueType();
3609   EVT SrcVT = Tmp1.getValueType();
3610   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
3611     Tmp0.getOpcode() == ARMISD::VMOVDRR;
3612   bool UseNEON = !InGPR && Subtarget->hasNEON();
3613
3614   if (UseNEON) {
3615     // Use VBSL to copy the sign bit.
3616     unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
3617     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
3618                                DAG.getTargetConstant(EncodedVal, MVT::i32));
3619     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
3620     if (VT == MVT::f64)
3621       Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
3622                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
3623                          DAG.getConstant(32, MVT::i32));
3624     else /*if (VT == MVT::f32)*/
3625       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
3626     if (SrcVT == MVT::f32) {
3627       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
3628       if (VT == MVT::f64)
3629         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
3630                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
3631                            DAG.getConstant(32, MVT::i32));
3632     } else if (VT == MVT::f32)
3633       Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
3634                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
3635                          DAG.getConstant(32, MVT::i32));
3636     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
3637     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
3638
3639     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
3640                                             MVT::i32);
3641     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
3642     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
3643                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
3644
3645     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
3646                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
3647                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
3648     if (VT == MVT::f32) {
3649       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
3650       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
3651                         DAG.getConstant(0, MVT::i32));
3652     } else {
3653       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
3654     }
3655
3656     return Res;
3657   }
3658
3659   // Bitcast operand 1 to i32.
3660   if (SrcVT == MVT::f64)
3661     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
3662                        &Tmp1, 1).getValue(1);
3663   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
3664
3665   // Or in the signbit with integer operations.
3666   SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
3667   SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
3668   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
3669   if (VT == MVT::f32) {
3670     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
3671                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
3672     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
3673                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
3674   }
3675
3676   // f64: Or the high part with signbit and then combine two parts.
3677   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
3678                      &Tmp0, 1);
3679   SDValue Lo = Tmp0.getValue(0);
3680   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
3681   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
3682   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
3683 }
3684
3685 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
3686   MachineFunction &MF = DAG.getMachineFunction();
3687   MachineFrameInfo *MFI = MF.getFrameInfo();
3688   MFI->setReturnAddressIsTaken(true);
3689
3690   EVT VT = Op.getValueType();
3691   SDLoc dl(Op);
3692   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3693   if (Depth) {
3694     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
3695     SDValue Offset = DAG.getConstant(4, MVT::i32);
3696     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
3697                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
3698                        MachinePointerInfo(), false, false, false, 0);
3699   }
3700
3701   // Return LR, which contains the return address. Mark it an implicit live-in.
3702   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3703   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
3704 }
3705
3706 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
3707   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3708   MFI->setFrameAddressIsTaken(true);
3709
3710   EVT VT = Op.getValueType();
3711   SDLoc dl(Op);  // FIXME probably not meaningful
3712   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3713   unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
3714     ? ARM::R7 : ARM::R11;
3715   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
3716   while (Depth--)
3717     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
3718                             MachinePointerInfo(),
3719                             false, false, false, 0);
3720   return FrameAddr;
3721 }
3722
3723 /// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
3724 /// and size(DestVec) > 128-bits.
3725 /// This is achieved by doing the one extension from the SrcVec, splitting the
3726 /// result, extending these parts, and then concatenating these into the
3727 /// destination.
3728 static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
3729   SDValue Op = N->getOperand(0);
3730   EVT SrcVT = Op.getValueType();
3731   EVT DestVT = N->getValueType(0);
3732
3733   assert(DestVT.getSizeInBits() > 128 &&
3734          "Custom sext/zext expansion needs >128-bit vector.");
3735   // If this is a normal length extension, use the default expansion.
3736   if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
3737       SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
3738     return SDValue();
3739
3740   SDLoc dl(N);
3741   unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
3742   unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
3743   unsigned NumElts = SrcVT.getVectorNumElements();
3744   LLVMContext &Ctx = *DAG.getContext();
3745   SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
3746
3747   EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
3748                                NumElts);
3749   EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
3750                                  NumElts/2);
3751   EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
3752                                NumElts/2);
3753
3754   Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
3755   SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
3756                         DAG.getIntPtrConstant(0));
3757   SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
3758                         DAG.getIntPtrConstant(NumElts/2));
3759   ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
3760   ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
3761   return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
3762 }
3763
3764 /// ExpandBITCAST - If the target supports VFP, this function is called to
3765 /// expand a bit convert where either the source or destination type is i64 to
3766 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
3767 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
3768 /// vectors), since the legalizer won't know what to do with that.
3769 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
3770   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3771   SDLoc dl(N);
3772   SDValue Op = N->getOperand(0);
3773
3774   // This function is only supposed to be called for i64 types, either as the
3775   // source or destination of the bit convert.
3776   EVT SrcVT = Op.getValueType();
3777   EVT DstVT = N->getValueType(0);
3778   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
3779          "ExpandBITCAST called for non-i64 type");
3780
3781   // Turn i64->f64 into VMOVDRR.
3782   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
3783     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3784                              DAG.getConstant(0, MVT::i32));
3785     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3786                              DAG.getConstant(1, MVT::i32));
3787     return DAG.getNode(ISD::BITCAST, dl, DstVT,
3788                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
3789   }
3790
3791   // Turn f64->i64 into VMOVRRD.
3792   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
3793     SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
3794                               DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
3795     // Merge the pieces into a single i64 value.
3796     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
3797   }
3798
3799   return SDValue();
3800 }
3801
3802 /// getZeroVector - Returns a vector of specified type with all zero elements.
3803 /// Zero vectors are used to represent vector negation and in those cases
3804 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
3805 /// not support i64 elements, so sometimes the zero vectors will need to be
3806 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
3807 /// zero vector.
3808 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
3809   assert(VT.isVector() && "Expected a vector type");
3810   // The canonical modified immediate encoding of a zero vector is....0!
3811   SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
3812   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
3813   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
3814   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
3815 }
3816
3817 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
3818 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
3819 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
3820                                                 SelectionDAG &DAG) const {
3821   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3822   EVT VT = Op.getValueType();
3823   unsigned VTBits = VT.getSizeInBits();
3824   SDLoc dl(Op);
3825   SDValue ShOpLo = Op.getOperand(0);
3826   SDValue ShOpHi = Op.getOperand(1);
3827   SDValue ShAmt  = Op.getOperand(2);
3828   SDValue ARMcc;
3829   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
3830
3831   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
3832
3833   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3834                                  DAG.getConstant(VTBits, MVT::i32), ShAmt);
3835   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
3836   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3837                                    DAG.getConstant(VTBits, MVT::i32));
3838   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
3839   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3840   SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
3841
3842   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3843   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3844                           ARMcc, DAG, dl);
3845   SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
3846   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
3847                            CCR, Cmp);
3848
3849   SDValue Ops[2] = { Lo, Hi };
3850   return DAG.getMergeValues(Ops, 2, dl);
3851 }
3852
3853 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
3854 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
3855 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
3856                                                SelectionDAG &DAG) const {
3857   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3858   EVT VT = Op.getValueType();
3859   unsigned VTBits = VT.getSizeInBits();
3860   SDLoc dl(Op);
3861   SDValue ShOpLo = Op.getOperand(0);
3862   SDValue ShOpHi = Op.getOperand(1);
3863   SDValue ShAmt  = Op.getOperand(2);
3864   SDValue ARMcc;
3865
3866   assert(Op.getOpcode() == ISD::SHL_PARTS);
3867   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3868                                  DAG.getConstant(VTBits, MVT::i32), ShAmt);
3869   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
3870   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3871                                    DAG.getConstant(VTBits, MVT::i32));
3872   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
3873   SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
3874
3875   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3876   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3877   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3878                           ARMcc, DAG, dl);
3879   SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
3880   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
3881                            CCR, Cmp);
3882
3883   SDValue Ops[2] = { Lo, Hi };
3884   return DAG.getMergeValues(Ops, 2, dl);
3885 }
3886
3887 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3888                                             SelectionDAG &DAG) const {
3889   // The rounding mode is in bits 23:22 of the FPSCR.
3890   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3891   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
3892   // so that the shift + and get folded into a bitfield extract.
3893   SDLoc dl(Op);
3894   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
3895                               DAG.getConstant(Intrinsic::arm_get_fpscr,
3896                                               MVT::i32));
3897   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
3898                                   DAG.getConstant(1U << 22, MVT::i32));
3899   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3900                               DAG.getConstant(22, MVT::i32));
3901   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3902                      DAG.getConstant(3, MVT::i32));
3903 }
3904
3905 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
3906                          const ARMSubtarget *ST) {
3907   EVT VT = N->getValueType(0);
3908   SDLoc dl(N);
3909
3910   if (!ST->hasV6T2Ops())
3911     return SDValue();
3912
3913   SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
3914   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
3915 }
3916
3917 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
3918 /// for each 16-bit element from operand, repeated.  The basic idea is to
3919 /// leverage vcnt to get the 8-bit counts, gather and add the results.
3920 ///
3921 /// Trace for v4i16:
3922 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
3923 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
3924 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
3925 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
3926 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
3927 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
3928 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
3929 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
3930 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
3931   EVT VT = N->getValueType(0);
3932   SDLoc DL(N);
3933
3934   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
3935   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
3936   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
3937   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
3938   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
3939   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
3940 }
3941
3942 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
3943 /// bit-count for each 16-bit element from the operand.  We need slightly
3944 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
3945 /// 64/128-bit registers.
3946 ///
3947 /// Trace for v4i16:
3948 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
3949 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
3950 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
3951 /// v4i16:Extracted = [k0    k1    k2    k3    ]
3952 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
3953   EVT VT = N->getValueType(0);
3954   SDLoc DL(N);
3955
3956   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
3957   if (VT.is64BitVector()) {
3958     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
3959     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
3960                        DAG.getIntPtrConstant(0));
3961   } else {
3962     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
3963                                     BitCounts, DAG.getIntPtrConstant(0));
3964     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
3965   }
3966 }
3967
3968 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
3969 /// bit-count for each 32-bit element from the operand.  The idea here is
3970 /// to split the vector into 16-bit elements, leverage the 16-bit count
3971 /// routine, and then combine the results.
3972 ///
3973 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
3974 /// input    = [v0    v1    ] (vi: 32-bit elements)
3975 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
3976 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
3977 /// vrev: N0 = [k1 k0 k3 k2 ]
3978 ///            [k0 k1 k2 k3 ]
3979 ///       N1 =+[k1 k0 k3 k2 ]
3980 ///            [k0 k2 k1 k3 ]
3981 ///       N2 =+[k1 k3 k0 k2 ]
3982 ///            [k0    k2    k1    k3    ]
3983 /// Extended =+[k1    k3    k0    k2    ]
3984 ///            [k0    k2    ]
3985 /// Extracted=+[k1    k3    ]
3986 ///
3987 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
3988   EVT VT = N->getValueType(0);
3989   SDLoc DL(N);
3990
3991   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
3992
3993   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
3994   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
3995   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
3996   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
3997   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
3998
3999   if (VT.is64BitVector()) {
4000     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
4001     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
4002                        DAG.getIntPtrConstant(0));
4003   } else {
4004     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
4005                                     DAG.getIntPtrConstant(0));
4006     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
4007   }
4008 }
4009
4010 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
4011                           const ARMSubtarget *ST) {
4012   EVT VT = N->getValueType(0);
4013
4014   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
4015   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
4016           VT == MVT::v4i16 || VT == MVT::v8i16) &&
4017          "Unexpected type for custom ctpop lowering");
4018
4019   if (VT.getVectorElementType() == MVT::i32)
4020     return lowerCTPOP32BitElements(N, DAG);
4021   else
4022     return lowerCTPOP16BitElements(N, DAG);
4023 }
4024
4025 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
4026                           const ARMSubtarget *ST) {
4027   EVT VT = N->getValueType(0);
4028   SDLoc dl(N);
4029
4030   if (!VT.isVector())
4031     return SDValue();
4032
4033   // Lower vector shifts on NEON to use VSHL.
4034   assert(ST->hasNEON() && "unexpected vector shift");
4035
4036   // Left shifts translate directly to the vshiftu intrinsic.
4037   if (N->getOpcode() == ISD::SHL)
4038     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4039                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
4040                        N->getOperand(0), N->getOperand(1));
4041
4042   assert((N->getOpcode() == ISD::SRA ||
4043           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
4044
4045   // NEON uses the same intrinsics for both left and right shifts.  For
4046   // right shifts, the shift amounts are negative, so negate the vector of
4047   // shift amounts.
4048   EVT ShiftVT = N->getOperand(1).getValueType();
4049   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
4050                                      getZeroVector(ShiftVT, DAG, dl),
4051                                      N->getOperand(1));
4052   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
4053                              Intrinsic::arm_neon_vshifts :
4054                              Intrinsic::arm_neon_vshiftu);
4055   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4056                      DAG.getConstant(vshiftInt, MVT::i32),
4057                      N->getOperand(0), NegatedCount);
4058 }
4059
4060 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
4061                                 const ARMSubtarget *ST) {
4062   EVT VT = N->getValueType(0);
4063   SDLoc dl(N);
4064
4065   // We can get here for a node like i32 = ISD::SHL i32, i64
4066   if (VT != MVT::i64)
4067     return SDValue();
4068
4069   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
4070          "Unknown shift to lower!");
4071
4072   // We only lower SRA, SRL of 1 here, all others use generic lowering.
4073   if (!isa<ConstantSDNode>(N->getOperand(1)) ||
4074       cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
4075     return SDValue();
4076
4077   // If we are in thumb mode, we don't have RRX.
4078   if (ST->isThumb1Only()) return SDValue();
4079
4080   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
4081   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
4082                            DAG.getConstant(0, MVT::i32));
4083   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
4084                            DAG.getConstant(1, MVT::i32));
4085
4086   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
4087   // captures the result into a carry flag.
4088   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
4089   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
4090
4091   // The low part is an ARMISD::RRX operand, which shifts the carry in.
4092   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
4093
4094   // Merge the pieces into a single i64 value.
4095  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
4096 }
4097
4098 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
4099   SDValue TmpOp0, TmpOp1;
4100   bool Invert = false;
4101   bool Swap = false;
4102   unsigned Opc = 0;
4103
4104   SDValue Op0 = Op.getOperand(0);
4105   SDValue Op1 = Op.getOperand(1);
4106   SDValue CC = Op.getOperand(2);
4107   EVT VT = Op.getValueType();
4108   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
4109   SDLoc dl(Op);
4110
4111   if (Op.getOperand(1).getValueType().isFloatingPoint()) {
4112     switch (SetCCOpcode) {
4113     default: llvm_unreachable("Illegal FP comparison");
4114     case ISD::SETUNE:
4115     case ISD::SETNE:  Invert = true; // Fallthrough
4116     case ISD::SETOEQ:
4117     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
4118     case ISD::SETOLT:
4119     case ISD::SETLT: Swap = true; // Fallthrough
4120     case ISD::SETOGT:
4121     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
4122     case ISD::SETOLE:
4123     case ISD::SETLE:  Swap = true; // Fallthrough
4124     case ISD::SETOGE:
4125     case ISD::SETGE: Opc = ARMISD::VCGE; break;
4126     case ISD::SETUGE: Swap = true; // Fallthrough
4127     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
4128     case ISD::SETUGT: Swap = true; // Fallthrough
4129     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
4130     case ISD::SETUEQ: Invert = true; // Fallthrough
4131     case ISD::SETONE:
4132       // Expand this to (OLT | OGT).
4133       TmpOp0 = Op0;
4134       TmpOp1 = Op1;
4135       Opc = ISD::OR;
4136       Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
4137       Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
4138       break;
4139     case ISD::SETUO: Invert = true; // Fallthrough
4140     case ISD::SETO:
4141       // Expand this to (OLT | OGE).
4142       TmpOp0 = Op0;
4143       TmpOp1 = Op1;
4144       Opc = ISD::OR;
4145       Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
4146       Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
4147       break;
4148     }
4149   } else {
4150     // Integer comparisons.
4151     switch (SetCCOpcode) {
4152     default: llvm_unreachable("Illegal integer comparison");
4153     case ISD::SETNE:  Invert = true;
4154     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
4155     case ISD::SETLT:  Swap = true;
4156     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
4157     case ISD::SETLE:  Swap = true;
4158     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
4159     case ISD::SETULT: Swap = true;
4160     case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
4161     case ISD::SETULE: Swap = true;
4162     case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
4163     }
4164
4165     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
4166     if (Opc == ARMISD::VCEQ) {
4167
4168       SDValue AndOp;
4169       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
4170         AndOp = Op0;
4171       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
4172         AndOp = Op1;
4173
4174       // Ignore bitconvert.
4175       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
4176         AndOp = AndOp.getOperand(0);
4177
4178       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
4179         Opc = ARMISD::VTST;
4180         Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
4181         Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
4182         Invert = !Invert;
4183       }
4184     }
4185   }
4186
4187   if (Swap)
4188     std::swap(Op0, Op1);
4189
4190   // If one of the operands is a constant vector zero, attempt to fold the
4191   // comparison to a specialized compare-against-zero form.
4192   SDValue SingleOp;
4193   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
4194     SingleOp = Op0;
4195   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
4196     if (Opc == ARMISD::VCGE)
4197       Opc = ARMISD::VCLEZ;
4198     else if (Opc == ARMISD::VCGT)
4199       Opc = ARMISD::VCLTZ;
4200     SingleOp = Op1;
4201   }
4202
4203   SDValue Result;
4204   if (SingleOp.getNode()) {
4205     switch (Opc) {
4206     case ARMISD::VCEQ:
4207       Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
4208     case ARMISD::VCGE:
4209       Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
4210     case ARMISD::VCLEZ:
4211       Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
4212     case ARMISD::VCGT:
4213       Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
4214     case ARMISD::VCLTZ:
4215       Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
4216     default:
4217       Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
4218     }
4219   } else {
4220      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
4221   }
4222
4223   if (Invert)
4224     Result = DAG.getNOT(dl, Result, VT);
4225
4226   return Result;
4227 }
4228
4229 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
4230 /// valid vector constant for a NEON instruction with a "modified immediate"
4231 /// operand (e.g., VMOV).  If so, return the encoded value.
4232 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
4233                                  unsigned SplatBitSize, SelectionDAG &DAG,
4234                                  EVT &VT, bool is128Bits, NEONModImmType type) {
4235   unsigned OpCmode, Imm;
4236
4237   // SplatBitSize is set to the smallest size that splats the vector, so a
4238   // zero vector will always have SplatBitSize == 8.  However, NEON modified
4239   // immediate instructions others than VMOV do not support the 8-bit encoding
4240   // of a zero vector, and the default encoding of zero is supposed to be the
4241   // 32-bit version.
4242   if (SplatBits == 0)
4243     SplatBitSize = 32;
4244
4245   switch (SplatBitSize) {
4246   case 8:
4247     if (type != VMOVModImm)
4248       return SDValue();
4249     // Any 1-byte value is OK.  Op=0, Cmode=1110.
4250     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
4251     OpCmode = 0xe;
4252     Imm = SplatBits;
4253     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
4254     break;
4255
4256   case 16:
4257     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
4258     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
4259     if ((SplatBits & ~0xff) == 0) {
4260       // Value = 0x00nn: Op=x, Cmode=100x.
4261       OpCmode = 0x8;
4262       Imm = SplatBits;
4263       break;
4264     }
4265     if ((SplatBits & ~0xff00) == 0) {
4266       // Value = 0xnn00: Op=x, Cmode=101x.
4267       OpCmode = 0xa;
4268       Imm = SplatBits >> 8;
4269       break;
4270     }
4271     return SDValue();
4272
4273   case 32:
4274     // NEON's 32-bit VMOV supports splat values where:
4275     // * only one byte is nonzero, or
4276     // * the least significant byte is 0xff and the second byte is nonzero, or
4277     // * the least significant 2 bytes are 0xff and the third is nonzero.
4278     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
4279     if ((SplatBits & ~0xff) == 0) {
4280       // Value = 0x000000nn: Op=x, Cmode=000x.
4281       OpCmode = 0;
4282       Imm = SplatBits;
4283       break;
4284     }
4285     if ((SplatBits & ~0xff00) == 0) {
4286       // Value = 0x0000nn00: Op=x, Cmode=001x.
4287       OpCmode = 0x2;
4288       Imm = SplatBits >> 8;
4289       break;
4290     }
4291     if ((SplatBits & ~0xff0000) == 0) {
4292       // Value = 0x00nn0000: Op=x, Cmode=010x.
4293       OpCmode = 0x4;
4294       Imm = SplatBits >> 16;
4295       break;
4296     }
4297     if ((SplatBits & ~0xff000000) == 0) {
4298       // Value = 0xnn000000: Op=x, Cmode=011x.
4299       OpCmode = 0x6;
4300       Imm = SplatBits >> 24;
4301       break;
4302     }
4303
4304     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
4305     if (type == OtherModImm) return SDValue();
4306
4307     if ((SplatBits & ~0xffff) == 0 &&
4308         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
4309       // Value = 0x0000nnff: Op=x, Cmode=1100.
4310       OpCmode = 0xc;
4311       Imm = SplatBits >> 8;
4312       SplatBits |= 0xff;
4313       break;
4314     }
4315
4316     if ((SplatBits & ~0xffffff) == 0 &&
4317         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
4318       // Value = 0x00nnffff: Op=x, Cmode=1101.
4319       OpCmode = 0xd;
4320       Imm = SplatBits >> 16;
4321       SplatBits |= 0xffff;
4322       break;
4323     }
4324
4325     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
4326     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
4327     // VMOV.I32.  A (very) minor optimization would be to replicate the value
4328     // and fall through here to test for a valid 64-bit splat.  But, then the
4329     // caller would also need to check and handle the change in size.
4330     return SDValue();
4331
4332   case 64: {
4333     if (type != VMOVModImm)
4334       return SDValue();
4335     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
4336     uint64_t BitMask = 0xff;
4337     uint64_t Val = 0;
4338     unsigned ImmMask = 1;
4339     Imm = 0;
4340     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
4341       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
4342         Val |= BitMask;
4343         Imm |= ImmMask;
4344       } else if ((SplatBits & BitMask) != 0) {
4345         return SDValue();
4346       }
4347       BitMask <<= 8;
4348       ImmMask <<= 1;
4349     }
4350     // Op=1, Cmode=1110.
4351     OpCmode = 0x1e;
4352     SplatBits = Val;
4353     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
4354     break;
4355   }
4356
4357   default:
4358     llvm_unreachable("unexpected size for isNEONModifiedImm");
4359   }
4360
4361   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
4362   return DAG.getTargetConstant(EncodedVal, MVT::i32);
4363 }
4364
4365 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
4366                                            const ARMSubtarget *ST) const {
4367   if (!ST->hasVFP3())
4368     return SDValue();
4369
4370   bool IsDouble = Op.getValueType() == MVT::f64;
4371   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
4372
4373   // Try splatting with a VMOV.f32...
4374   APFloat FPVal = CFP->getValueAPF();
4375   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
4376
4377   if (ImmVal != -1) {
4378     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
4379       // We have code in place to select a valid ConstantFP already, no need to
4380       // do any mangling.
4381       return Op;
4382     }
4383
4384     // It's a float and we are trying to use NEON operations where
4385     // possible. Lower it to a splat followed by an extract.
4386     SDLoc DL(Op);
4387     SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
4388     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
4389                                       NewVal);
4390     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
4391                        DAG.getConstant(0, MVT::i32));
4392   }
4393
4394   // The rest of our options are NEON only, make sure that's allowed before
4395   // proceeding..
4396   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
4397     return SDValue();
4398
4399   EVT VMovVT;
4400   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
4401
4402   // It wouldn't really be worth bothering for doubles except for one very
4403   // important value, which does happen to match: 0.0. So make sure we don't do
4404   // anything stupid.
4405   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
4406     return SDValue();
4407
4408   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
4409   SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
4410                                      false, VMOVModImm);
4411   if (NewVal != SDValue()) {
4412     SDLoc DL(Op);
4413     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
4414                                       NewVal);
4415     if (IsDouble)
4416       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
4417
4418     // It's a float: cast and extract a vector element.
4419     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
4420                                        VecConstant);
4421     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
4422                        DAG.getConstant(0, MVT::i32));
4423   }
4424
4425   // Finally, try a VMVN.i32
4426   NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
4427                              false, VMVNModImm);
4428   if (NewVal != SDValue()) {
4429     SDLoc DL(Op);
4430     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
4431
4432     if (IsDouble)
4433       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
4434
4435     // It's a float: cast and extract a vector element.
4436     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
4437                                        VecConstant);
4438     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
4439                        DAG.getConstant(0, MVT::i32));
4440   }
4441
4442   return SDValue();
4443 }
4444
4445 // check if an VEXT instruction can handle the shuffle mask when the
4446 // vector sources of the shuffle are the same.
4447 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
4448   unsigned NumElts = VT.getVectorNumElements();
4449
4450   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
4451   if (M[0] < 0)
4452     return false;
4453
4454   Imm = M[0];
4455
4456   // If this is a VEXT shuffle, the immediate value is the index of the first
4457   // element.  The other shuffle indices must be the successive elements after
4458   // the first one.
4459   unsigned ExpectedElt = Imm;
4460   for (unsigned i = 1; i < NumElts; ++i) {
4461     // Increment the expected index.  If it wraps around, just follow it
4462     // back to index zero and keep going.
4463     ++ExpectedElt;
4464     if (ExpectedElt == NumElts)
4465       ExpectedElt = 0;
4466
4467     if (M[i] < 0) continue; // ignore UNDEF indices
4468     if (ExpectedElt != static_cast<unsigned>(M[i]))
4469       return false;
4470   }
4471
4472   return true;
4473 }
4474
4475
4476 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
4477                        bool &ReverseVEXT, unsigned &Imm) {
4478   unsigned NumElts = VT.getVectorNumElements();
4479   ReverseVEXT = false;
4480
4481   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
4482   if (M[0] < 0)
4483     return false;
4484
4485   Imm = M[0];
4486
4487   // If this is a VEXT shuffle, the immediate value is the index of the first
4488   // element.  The other shuffle indices must be the successive elements after
4489   // the first one.
4490   unsigned ExpectedElt = Imm;
4491   for (unsigned i = 1; i < NumElts; ++i) {
4492     // Increment the expected index.  If it wraps around, it may still be
4493     // a VEXT but the source vectors must be swapped.
4494     ExpectedElt += 1;
4495     if (ExpectedElt == NumElts * 2) {
4496       ExpectedElt = 0;
4497       ReverseVEXT = true;
4498     }
4499
4500     if (M[i] < 0) continue; // ignore UNDEF indices
4501     if (ExpectedElt != static_cast<unsigned>(M[i]))
4502       return false;
4503   }
4504
4505   // Adjust the index value if the source operands will be swapped.
4506   if (ReverseVEXT)
4507     Imm -= NumElts;
4508
4509   return true;
4510 }
4511
4512 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
4513 /// instruction with the specified blocksize.  (The order of the elements
4514 /// within each block of the vector is reversed.)
4515 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4516   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
4517          "Only possible block sizes for VREV are: 16, 32, 64");
4518
4519   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4520   if (EltSz == 64)
4521     return false;
4522
4523   unsigned NumElts = VT.getVectorNumElements();
4524   unsigned BlockElts = M[0] + 1;
4525   // If the first shuffle index is UNDEF, be optimistic.
4526   if (M[0] < 0)
4527     BlockElts = BlockSize / EltSz;
4528
4529   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4530     return false;
4531
4532   for (unsigned i = 0; i < NumElts; ++i) {
4533     if (M[i] < 0) continue; // ignore UNDEF indices
4534     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
4535       return false;
4536   }
4537
4538   return true;
4539 }
4540
4541 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
4542   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
4543   // range, then 0 is placed into the resulting vector. So pretty much any mask
4544   // of 8 elements can work here.
4545   return VT == MVT::v8i8 && M.size() == 8;
4546 }
4547
4548 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4549   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4550   if (EltSz == 64)
4551     return false;
4552
4553   unsigned NumElts = VT.getVectorNumElements();
4554   WhichResult = (M[0] == 0 ? 0 : 1);
4555   for (unsigned i = 0; i < NumElts; i += 2) {
4556     if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
4557         (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
4558       return false;
4559   }
4560   return true;
4561 }
4562
4563 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
4564 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4565 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
4566 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4567   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4568   if (EltSz == 64)
4569     return false;
4570
4571   unsigned NumElts = VT.getVectorNumElements();
4572   WhichResult = (M[0] == 0 ? 0 : 1);
4573   for (unsigned i = 0; i < NumElts; i += 2) {
4574     if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
4575         (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
4576       return false;
4577   }
4578   return true;
4579 }
4580
4581 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4582   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4583   if (EltSz == 64)
4584     return false;
4585
4586   unsigned NumElts = VT.getVectorNumElements();
4587   WhichResult = (M[0] == 0 ? 0 : 1);
4588   for (unsigned i = 0; i != NumElts; ++i) {
4589     if (M[i] < 0) continue; // ignore UNDEF indices
4590     if ((unsigned) M[i] != 2 * i + WhichResult)
4591       return false;
4592   }
4593
4594   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4595   if (VT.is64BitVector() && EltSz == 32)
4596     return false;
4597
4598   return true;
4599 }
4600
4601 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
4602 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4603 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
4604 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4605   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4606   if (EltSz == 64)
4607     return false;
4608
4609   unsigned Half = VT.getVectorNumElements() / 2;
4610   WhichResult = (M[0] == 0 ? 0 : 1);
4611   for (unsigned j = 0; j != 2; ++j) {
4612     unsigned Idx = WhichResult;
4613     for (unsigned i = 0; i != Half; ++i) {
4614       int MIdx = M[i + j * Half];
4615       if (MIdx >= 0 && (unsigned) MIdx != Idx)
4616         return false;
4617       Idx += 2;
4618     }
4619   }
4620
4621   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4622   if (VT.is64BitVector() && EltSz == 32)
4623     return false;
4624
4625   return true;
4626 }
4627
4628 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4629   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4630   if (EltSz == 64)
4631     return false;
4632
4633   unsigned NumElts = VT.getVectorNumElements();
4634   WhichResult = (M[0] == 0 ? 0 : 1);
4635   unsigned Idx = WhichResult * NumElts / 2;
4636   for (unsigned i = 0; i != NumElts; i += 2) {
4637     if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
4638         (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
4639       return false;
4640     Idx += 1;
4641   }
4642
4643   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4644   if (VT.is64BitVector() && EltSz == 32)
4645     return false;
4646
4647   return true;
4648 }
4649
4650 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
4651 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4652 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
4653 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4654   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4655   if (EltSz == 64)
4656     return false;
4657
4658   unsigned NumElts = VT.getVectorNumElements();
4659   WhichResult = (M[0] == 0 ? 0 : 1);
4660   unsigned Idx = WhichResult * NumElts / 2;
4661   for (unsigned i = 0; i != NumElts; i += 2) {
4662     if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
4663         (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
4664       return false;
4665     Idx += 1;
4666   }
4667
4668   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4669   if (VT.is64BitVector() && EltSz == 32)
4670     return false;
4671
4672   return true;
4673 }
4674
4675 /// \return true if this is a reverse operation on an vector.
4676 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
4677   unsigned NumElts = VT.getVectorNumElements();
4678   // Make sure the mask has the right size.
4679   if (NumElts != M.size())
4680       return false;
4681
4682   // Look for <15, ..., 3, -1, 1, 0>.
4683   for (unsigned i = 0; i != NumElts; ++i)
4684     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
4685       return false;
4686
4687   return true;
4688 }
4689
4690 // If N is an integer constant that can be moved into a register in one
4691 // instruction, return an SDValue of such a constant (will become a MOV
4692 // instruction).  Otherwise return null.
4693 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
4694                                      const ARMSubtarget *ST, SDLoc dl) {
4695   uint64_t Val;
4696   if (!isa<ConstantSDNode>(N))
4697     return SDValue();
4698   Val = cast<ConstantSDNode>(N)->getZExtValue();
4699
4700   if (ST->isThumb1Only()) {
4701     if (Val <= 255 || ~Val <= 255)
4702       return DAG.getConstant(Val, MVT::i32);
4703   } else {
4704     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
4705       return DAG.getConstant(Val, MVT::i32);
4706   }
4707   return SDValue();
4708 }
4709
4710 // If this is a case we can't handle, return null and let the default
4711 // expansion code take care of it.
4712 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
4713                                              const ARMSubtarget *ST) const {
4714   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
4715   SDLoc dl(Op);
4716   EVT VT = Op.getValueType();
4717
4718   APInt SplatBits, SplatUndef;
4719   unsigned SplatBitSize;
4720   bool HasAnyUndefs;
4721   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
4722     if (SplatBitSize <= 64) {
4723       // Check if an immediate VMOV works.
4724       EVT VmovVT;
4725       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
4726                                       SplatUndef.getZExtValue(), SplatBitSize,
4727                                       DAG, VmovVT, VT.is128BitVector(),
4728                                       VMOVModImm);
4729       if (Val.getNode()) {
4730         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
4731         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4732       }
4733
4734       // Try an immediate VMVN.
4735       uint64_t NegatedImm = (~SplatBits).getZExtValue();
4736       Val = isNEONModifiedImm(NegatedImm,
4737                                       SplatUndef.getZExtValue(), SplatBitSize,
4738                                       DAG, VmovVT, VT.is128BitVector(),
4739                                       VMVNModImm);
4740       if (Val.getNode()) {
4741         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
4742         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4743       }
4744
4745       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
4746       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
4747         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
4748         if (ImmVal != -1) {
4749           SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
4750           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
4751         }
4752       }
4753     }
4754   }
4755
4756   // Scan through the operands to see if only one value is used.
4757   //
4758   // As an optimisation, even if more than one value is used it may be more
4759   // profitable to splat with one value then change some lanes.
4760   //
4761   // Heuristically we decide to do this if the vector has a "dominant" value,
4762   // defined as splatted to more than half of the lanes.
4763   unsigned NumElts = VT.getVectorNumElements();
4764   bool isOnlyLowElement = true;
4765   bool usesOnlyOneValue = true;
4766   bool hasDominantValue = false;
4767   bool isConstant = true;
4768
4769   // Map of the number of times a particular SDValue appears in the
4770   // element list.
4771   DenseMap<SDValue, unsigned> ValueCounts;
4772   SDValue Value;
4773   for (unsigned i = 0; i < NumElts; ++i) {
4774     SDValue V = Op.getOperand(i);
4775     if (V.getOpcode() == ISD::UNDEF)
4776       continue;
4777     if (i > 0)
4778       isOnlyLowElement = false;
4779     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
4780       isConstant = false;
4781
4782     ValueCounts.insert(std::make_pair(V, 0));
4783     unsigned &Count = ValueCounts[V];
4784
4785     // Is this value dominant? (takes up more than half of the lanes)
4786     if (++Count > (NumElts / 2)) {
4787       hasDominantValue = true;
4788       Value = V;
4789     }
4790   }
4791   if (ValueCounts.size() != 1)
4792     usesOnlyOneValue = false;
4793   if (!Value.getNode() && ValueCounts.size() > 0)
4794     Value = ValueCounts.begin()->first;
4795
4796   if (ValueCounts.size() == 0)
4797     return DAG.getUNDEF(VT);
4798
4799   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
4800   // Keep going if we are hitting this case.
4801   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
4802     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
4803
4804   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4805
4806   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
4807   // i32 and try again.
4808   if (hasDominantValue && EltSize <= 32) {
4809     if (!isConstant) {
4810       SDValue N;
4811
4812       // If we are VDUPing a value that comes directly from a vector, that will
4813       // cause an unnecessary move to and from a GPR, where instead we could
4814       // just use VDUPLANE. We can only do this if the lane being extracted
4815       // is at a constant index, as the VDUP from lane instructions only have
4816       // constant-index forms.
4817       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4818           isa<ConstantSDNode>(Value->getOperand(1))) {
4819         // We need to create a new undef vector to use for the VDUPLANE if the
4820         // size of the vector from which we get the value is different than the
4821         // size of the vector that we need to create. We will insert the element
4822         // such that the register coalescer will remove unnecessary copies.
4823         if (VT != Value->getOperand(0).getValueType()) {
4824           ConstantSDNode *constIndex;
4825           constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
4826           assert(constIndex && "The index is not a constant!");
4827           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
4828                              VT.getVectorNumElements();
4829           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
4830                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
4831                         Value, DAG.getConstant(index, MVT::i32)),
4832                            DAG.getConstant(index, MVT::i32));
4833         } else
4834           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
4835                         Value->getOperand(0), Value->getOperand(1));
4836       } else
4837         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
4838
4839       if (!usesOnlyOneValue) {
4840         // The dominant value was splatted as 'N', but we now have to insert
4841         // all differing elements.
4842         for (unsigned I = 0; I < NumElts; ++I) {
4843           if (Op.getOperand(I) == Value)
4844             continue;
4845           SmallVector<SDValue, 3> Ops;
4846           Ops.push_back(N);
4847           Ops.push_back(Op.getOperand(I));
4848           Ops.push_back(DAG.getConstant(I, MVT::i32));
4849           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
4850         }
4851       }
4852       return N;
4853     }
4854     if (VT.getVectorElementType().isFloatingPoint()) {
4855       SmallVector<SDValue, 8> Ops;
4856       for (unsigned i = 0; i < NumElts; ++i)
4857         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
4858                                   Op.getOperand(i)));
4859       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
4860       SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
4861       Val = LowerBUILD_VECTOR(Val, DAG, ST);
4862       if (Val.getNode())
4863         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4864     }
4865     if (usesOnlyOneValue) {
4866       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
4867       if (isConstant && Val.getNode())
4868         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
4869     }
4870   }
4871
4872   // If all elements are constants and the case above didn't get hit, fall back
4873   // to the default expansion, which will generate a load from the constant
4874   // pool.
4875   if (isConstant)
4876     return SDValue();
4877
4878   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
4879   if (NumElts >= 4) {
4880     SDValue shuffle = ReconstructShuffle(Op, DAG);
4881     if (shuffle != SDValue())
4882       return shuffle;
4883   }
4884
4885   // Vectors with 32- or 64-bit elements can be built by directly assigning
4886   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
4887   // will be legalized.
4888   if (EltSize >= 32) {
4889     // Do the expansion with floating-point types, since that is what the VFP
4890     // registers are defined to use, and since i64 is not legal.
4891     EVT EltVT = EVT::getFloatingPointVT(EltSize);
4892     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
4893     SmallVector<SDValue, 8> Ops;
4894     for (unsigned i = 0; i < NumElts; ++i)
4895       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
4896     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
4897     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4898   }
4899
4900   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
4901   // know the default expansion would otherwise fall back on something even
4902   // worse. For a vector with one or two non-undef values, that's
4903   // scalar_to_vector for the elements followed by a shuffle (provided the
4904   // shuffle is valid for the target) and materialization element by element
4905   // on the stack followed by a load for everything else.
4906   if (!isConstant && !usesOnlyOneValue) {
4907     SDValue Vec = DAG.getUNDEF(VT);
4908     for (unsigned i = 0 ; i < NumElts; ++i) {
4909       SDValue V = Op.getOperand(i);
4910       if (V.getOpcode() == ISD::UNDEF)
4911         continue;
4912       SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
4913       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
4914     }
4915     return Vec;
4916   }
4917
4918   return SDValue();
4919 }
4920
4921 // Gather data to see if the operation can be modelled as a
4922 // shuffle in combination with VEXTs.
4923 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
4924                                               SelectionDAG &DAG) const {
4925   SDLoc dl(Op);
4926   EVT VT = Op.getValueType();
4927   unsigned NumElts = VT.getVectorNumElements();
4928
4929   SmallVector<SDValue, 2> SourceVecs;
4930   SmallVector<unsigned, 2> MinElts;
4931   SmallVector<unsigned, 2> MaxElts;
4932
4933   for (unsigned i = 0; i < NumElts; ++i) {
4934     SDValue V = Op.getOperand(i);
4935     if (V.getOpcode() == ISD::UNDEF)
4936       continue;
4937     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
4938       // A shuffle can only come from building a vector from various
4939       // elements of other vectors.
4940       return SDValue();
4941     } else if (V.getOperand(0).getValueType().getVectorElementType() !=
4942                VT.getVectorElementType()) {
4943       // This code doesn't know how to handle shuffles where the vector
4944       // element types do not match (this happens because type legalization
4945       // promotes the return type of EXTRACT_VECTOR_ELT).
4946       // FIXME: It might be appropriate to extend this code to handle
4947       // mismatched types.
4948       return SDValue();
4949     }
4950
4951     // Record this extraction against the appropriate vector if possible...
4952     SDValue SourceVec = V.getOperand(0);
4953     // If the element number isn't a constant, we can't effectively
4954     // analyze what's going on.
4955     if (!isa<ConstantSDNode>(V.getOperand(1)))
4956       return SDValue();
4957     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
4958     bool FoundSource = false;
4959     for (unsigned j = 0; j < SourceVecs.size(); ++j) {
4960       if (SourceVecs[j] == SourceVec) {
4961         if (MinElts[j] > EltNo)
4962           MinElts[j] = EltNo;
4963         if (MaxElts[j] < EltNo)
4964           MaxElts[j] = EltNo;
4965         FoundSource = true;
4966         break;
4967       }
4968     }
4969
4970     // Or record a new source if not...
4971     if (!FoundSource) {
4972       SourceVecs.push_back(SourceVec);
4973       MinElts.push_back(EltNo);
4974       MaxElts.push_back(EltNo);
4975     }
4976   }
4977
4978   // Currently only do something sane when at most two source vectors
4979   // involved.
4980   if (SourceVecs.size() > 2)
4981     return SDValue();
4982
4983   SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
4984   int VEXTOffsets[2] = {0, 0};
4985
4986   // This loop extracts the usage patterns of the source vectors
4987   // and prepares appropriate SDValues for a shuffle if possible.
4988   for (unsigned i = 0; i < SourceVecs.size(); ++i) {
4989     if (SourceVecs[i].getValueType() == VT) {
4990       // No VEXT necessary
4991       ShuffleSrcs[i] = SourceVecs[i];
4992       VEXTOffsets[i] = 0;
4993       continue;
4994     } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
4995       // It probably isn't worth padding out a smaller vector just to
4996       // break it down again in a shuffle.
4997       return SDValue();
4998     }
4999
5000     // Since only 64-bit and 128-bit vectors are legal on ARM and
5001     // we've eliminated the other cases...
5002     assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
5003            "unexpected vector sizes in ReconstructShuffle");
5004
5005     if (MaxElts[i] - MinElts[i] >= NumElts) {
5006       // Span too large for a VEXT to cope
5007       return SDValue();
5008     }
5009
5010     if (MinElts[i] >= NumElts) {
5011       // The extraction can just take the second half
5012       VEXTOffsets[i] = NumElts;
5013       ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
5014                                    SourceVecs[i],
5015                                    DAG.getIntPtrConstant(NumElts));
5016     } else if (MaxElts[i] < NumElts) {
5017       // The extraction can just take the first half
5018       VEXTOffsets[i] = 0;
5019       ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
5020                                    SourceVecs[i],
5021                                    DAG.getIntPtrConstant(0));
5022     } else {
5023       // An actual VEXT is needed
5024       VEXTOffsets[i] = MinElts[i];
5025       SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
5026                                      SourceVecs[i],
5027                                      DAG.getIntPtrConstant(0));
5028       SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
5029                                      SourceVecs[i],
5030                                      DAG.getIntPtrConstant(NumElts));
5031       ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
5032                                    DAG.getConstant(VEXTOffsets[i], MVT::i32));
5033     }
5034   }
5035
5036   SmallVector<int, 8> Mask;
5037
5038   for (unsigned i = 0; i < NumElts; ++i) {
5039     SDValue Entry = Op.getOperand(i);
5040     if (Entry.getOpcode() == ISD::UNDEF) {
5041       Mask.push_back(-1);
5042       continue;
5043     }
5044
5045     SDValue ExtractVec = Entry.getOperand(0);
5046     int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
5047                                           .getOperand(1))->getSExtValue();
5048     if (ExtractVec == SourceVecs[0]) {
5049       Mask.push_back(ExtractElt - VEXTOffsets[0]);
5050     } else {
5051       Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
5052     }
5053   }
5054
5055   // Final check before we try to produce nonsense...
5056   if (isShuffleMaskLegal(Mask, VT))
5057     return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
5058                                 &Mask[0]);
5059
5060   return SDValue();
5061 }
5062
5063 /// isShuffleMaskLegal - Targets can use this to indicate that they only
5064 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
5065 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
5066 /// are assumed to be legal.
5067 bool
5068 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
5069                                       EVT VT) const {
5070   if (VT.getVectorNumElements() == 4 &&
5071       (VT.is128BitVector() || VT.is64BitVector())) {
5072     unsigned PFIndexes[4];
5073     for (unsigned i = 0; i != 4; ++i) {
5074       if (M[i] < 0)
5075         PFIndexes[i] = 8;
5076       else
5077         PFIndexes[i] = M[i];
5078     }
5079
5080     // Compute the index in the perfect shuffle table.
5081     unsigned PFTableIndex =
5082       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
5083     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
5084     unsigned Cost = (PFEntry >> 30);
5085
5086     if (Cost <= 4)
5087       return true;
5088   }
5089
5090   bool ReverseVEXT;
5091   unsigned Imm, WhichResult;
5092
5093   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5094   return (EltSize >= 32 ||
5095           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
5096           isVREVMask(M, VT, 64) ||
5097           isVREVMask(M, VT, 32) ||
5098           isVREVMask(M, VT, 16) ||
5099           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
5100           isVTBLMask(M, VT) ||
5101           isVTRNMask(M, VT, WhichResult) ||
5102           isVUZPMask(M, VT, WhichResult) ||
5103           isVZIPMask(M, VT, WhichResult) ||
5104           isVTRN_v_undef_Mask(M, VT, WhichResult) ||
5105           isVUZP_v_undef_Mask(M, VT, WhichResult) ||
5106           isVZIP_v_undef_Mask(M, VT, WhichResult) ||
5107           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
5108 }
5109
5110 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
5111 /// the specified operations to build the shuffle.
5112 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
5113                                       SDValue RHS, SelectionDAG &DAG,
5114                                       SDLoc dl) {
5115   unsigned OpNum = (PFEntry >> 26) & 0x0F;
5116   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
5117   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
5118
5119   enum {
5120     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
5121     OP_VREV,
5122     OP_VDUP0,
5123     OP_VDUP1,
5124     OP_VDUP2,
5125     OP_VDUP3,
5126     OP_VEXT1,
5127     OP_VEXT2,
5128     OP_VEXT3,
5129     OP_VUZPL, // VUZP, left result
5130     OP_VUZPR, // VUZP, right result
5131     OP_VZIPL, // VZIP, left result
5132     OP_VZIPR, // VZIP, right result
5133     OP_VTRNL, // VTRN, left result
5134     OP_VTRNR  // VTRN, right result
5135   };
5136
5137   if (OpNum == OP_COPY) {
5138     if (LHSID == (1*9+2)*9+3) return LHS;
5139     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
5140     return RHS;
5141   }
5142
5143   SDValue OpLHS, OpRHS;
5144   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
5145   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
5146   EVT VT = OpLHS.getValueType();
5147
5148   switch (OpNum) {
5149   default: llvm_unreachable("Unknown shuffle opcode!");
5150   case OP_VREV:
5151     // VREV divides the vector in half and swaps within the half.
5152     if (VT.getVectorElementType() == MVT::i32 ||
5153         VT.getVectorElementType() == MVT::f32)
5154       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
5155     // vrev <4 x i16> -> VREV32
5156     if (VT.getVectorElementType() == MVT::i16)
5157       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
5158     // vrev <4 x i8> -> VREV16
5159     assert(VT.getVectorElementType() == MVT::i8);
5160     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
5161   case OP_VDUP0:
5162   case OP_VDUP1:
5163   case OP_VDUP2:
5164   case OP_VDUP3:
5165     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5166                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
5167   case OP_VEXT1:
5168   case OP_VEXT2:
5169   case OP_VEXT3:
5170     return DAG.getNode(ARMISD::VEXT, dl, VT,
5171                        OpLHS, OpRHS,
5172                        DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
5173   case OP_VUZPL:
5174   case OP_VUZPR:
5175     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5176                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
5177   case OP_VZIPL:
5178   case OP_VZIPR:
5179     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5180                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
5181   case OP_VTRNL:
5182   case OP_VTRNR:
5183     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5184                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
5185   }
5186 }
5187
5188 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
5189                                        ArrayRef<int> ShuffleMask,
5190                                        SelectionDAG &DAG) {
5191   // Check to see if we can use the VTBL instruction.
5192   SDValue V1 = Op.getOperand(0);
5193   SDValue V2 = Op.getOperand(1);
5194   SDLoc DL(Op);
5195
5196   SmallVector<SDValue, 8> VTBLMask;
5197   for (ArrayRef<int>::iterator
5198          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
5199     VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
5200
5201   if (V2.getNode()->getOpcode() == ISD::UNDEF)
5202     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
5203                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
5204                                    &VTBLMask[0], 8));
5205
5206   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
5207                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
5208                                  &VTBLMask[0], 8));
5209 }
5210
5211 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
5212                                                       SelectionDAG &DAG) {
5213   SDLoc DL(Op);
5214   SDValue OpLHS = Op.getOperand(0);
5215   EVT VT = OpLHS.getValueType();
5216
5217   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
5218          "Expect an v8i16/v16i8 type");
5219   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
5220   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
5221   // extract the first 8 bytes into the top double word and the last 8 bytes
5222   // into the bottom double word. The v8i16 case is similar.
5223   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
5224   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
5225                      DAG.getConstant(ExtractNum, MVT::i32));
5226 }
5227
5228 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
5229   SDValue V1 = Op.getOperand(0);
5230   SDValue V2 = Op.getOperand(1);
5231   SDLoc dl(Op);
5232   EVT VT = Op.getValueType();
5233   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
5234
5235   // Convert shuffles that are directly supported on NEON to target-specific
5236   // DAG nodes, instead of keeping them as shuffles and matching them again
5237   // during code selection.  This is more efficient and avoids the possibility
5238   // of inconsistencies between legalization and selection.
5239   // FIXME: floating-point vectors should be canonicalized to integer vectors
5240   // of the same time so that they get CSEd properly.
5241   ArrayRef<int> ShuffleMask = SVN->getMask();
5242
5243   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5244   if (EltSize <= 32) {
5245     if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
5246       int Lane = SVN->getSplatIndex();
5247       // If this is undef splat, generate it via "just" vdup, if possible.
5248       if (Lane == -1) Lane = 0;
5249
5250       // Test if V1 is a SCALAR_TO_VECTOR.
5251       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5252         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
5253       }
5254       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
5255       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
5256       // reaches it).
5257       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
5258           !isa<ConstantSDNode>(V1.getOperand(0))) {
5259         bool IsScalarToVector = true;
5260         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
5261           if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
5262             IsScalarToVector = false;
5263             break;
5264           }
5265         if (IsScalarToVector)
5266           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
5267       }
5268       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
5269                          DAG.getConstant(Lane, MVT::i32));
5270     }
5271
5272     bool ReverseVEXT;
5273     unsigned Imm;
5274     if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
5275       if (ReverseVEXT)
5276         std::swap(V1, V2);
5277       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
5278                          DAG.getConstant(Imm, MVT::i32));
5279     }
5280
5281     if (isVREVMask(ShuffleMask, VT, 64))
5282       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
5283     if (isVREVMask(ShuffleMask, VT, 32))
5284       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
5285     if (isVREVMask(ShuffleMask, VT, 16))
5286       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
5287
5288     if (V2->getOpcode() == ISD::UNDEF &&
5289         isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
5290       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
5291                          DAG.getConstant(Imm, MVT::i32));
5292     }
5293
5294     // Check for Neon shuffles that modify both input vectors in place.
5295     // If both results are used, i.e., if there are two shuffles with the same
5296     // source operands and with masks corresponding to both results of one of
5297     // these operations, DAG memoization will ensure that a single node is
5298     // used for both shuffles.
5299     unsigned WhichResult;
5300     if (isVTRNMask(ShuffleMask, VT, WhichResult))
5301       return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5302                          V1, V2).getValue(WhichResult);
5303     if (isVUZPMask(ShuffleMask, VT, WhichResult))
5304       return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5305                          V1, V2).getValue(WhichResult);
5306     if (isVZIPMask(ShuffleMask, VT, WhichResult))
5307       return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5308                          V1, V2).getValue(WhichResult);
5309
5310     if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5311       return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5312                          V1, V1).getValue(WhichResult);
5313     if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5314       return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5315                          V1, V1).getValue(WhichResult);
5316     if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5317       return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5318                          V1, V1).getValue(WhichResult);
5319   }
5320
5321   // If the shuffle is not directly supported and it has 4 elements, use
5322   // the PerfectShuffle-generated table to synthesize it from other shuffles.
5323   unsigned NumElts = VT.getVectorNumElements();
5324   if (NumElts == 4) {
5325     unsigned PFIndexes[4];
5326     for (unsigned i = 0; i != 4; ++i) {
5327       if (ShuffleMask[i] < 0)
5328         PFIndexes[i] = 8;
5329       else
5330         PFIndexes[i] = ShuffleMask[i];
5331     }
5332
5333     // Compute the index in the perfect shuffle table.
5334     unsigned PFTableIndex =
5335       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
5336     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
5337     unsigned Cost = (PFEntry >> 30);
5338
5339     if (Cost <= 4)
5340       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
5341   }
5342
5343   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
5344   if (EltSize >= 32) {
5345     // Do the expansion with floating-point types, since that is what the VFP
5346     // registers are defined to use, and since i64 is not legal.
5347     EVT EltVT = EVT::getFloatingPointVT(EltSize);
5348     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
5349     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
5350     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
5351     SmallVector<SDValue, 8> Ops;
5352     for (unsigned i = 0; i < NumElts; ++i) {
5353       if (ShuffleMask[i] < 0)
5354         Ops.push_back(DAG.getUNDEF(EltVT));
5355       else
5356         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
5357                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
5358                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
5359                                                   MVT::i32)));
5360     }
5361     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
5362     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5363   }
5364
5365   if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
5366     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
5367
5368   if (VT == MVT::v8i8) {
5369     SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
5370     if (NewOp.getNode())
5371       return NewOp;
5372   }
5373
5374   return SDValue();
5375 }
5376
5377 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
5378   // INSERT_VECTOR_ELT is legal only for immediate indexes.
5379   SDValue Lane = Op.getOperand(2);
5380   if (!isa<ConstantSDNode>(Lane))
5381     return SDValue();
5382
5383   return Op;
5384 }
5385
5386 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
5387   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
5388   SDValue Lane = Op.getOperand(1);
5389   if (!isa<ConstantSDNode>(Lane))
5390     return SDValue();
5391
5392   SDValue Vec = Op.getOperand(0);
5393   if (Op.getValueType() == MVT::i32 &&
5394       Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
5395     SDLoc dl(Op);
5396     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
5397   }
5398
5399   return Op;
5400 }
5401
5402 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5403   // The only time a CONCAT_VECTORS operation can have legal types is when
5404   // two 64-bit vectors are concatenated to a 128-bit vector.
5405   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
5406          "unexpected CONCAT_VECTORS");
5407   SDLoc dl(Op);
5408   SDValue Val = DAG.getUNDEF(MVT::v2f64);
5409   SDValue Op0 = Op.getOperand(0);
5410   SDValue Op1 = Op.getOperand(1);
5411   if (Op0.getOpcode() != ISD::UNDEF)
5412     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
5413                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
5414                       DAG.getIntPtrConstant(0));
5415   if (Op1.getOpcode() != ISD::UNDEF)
5416     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
5417                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
5418                       DAG.getIntPtrConstant(1));
5419   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
5420 }
5421
5422 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
5423 /// element has been zero/sign-extended, depending on the isSigned parameter,
5424 /// from an integer type half its size.
5425 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
5426                                    bool isSigned) {
5427   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
5428   EVT VT = N->getValueType(0);
5429   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
5430     SDNode *BVN = N->getOperand(0).getNode();
5431     if (BVN->getValueType(0) != MVT::v4i32 ||
5432         BVN->getOpcode() != ISD::BUILD_VECTOR)
5433       return false;
5434     unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
5435     unsigned HiElt = 1 - LoElt;
5436     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
5437     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
5438     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
5439     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
5440     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
5441       return false;
5442     if (isSigned) {
5443       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
5444           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
5445         return true;
5446     } else {
5447       if (Hi0->isNullValue() && Hi1->isNullValue())
5448         return true;
5449     }
5450     return false;
5451   }
5452
5453   if (N->getOpcode() != ISD::BUILD_VECTOR)
5454     return false;
5455
5456   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
5457     SDNode *Elt = N->getOperand(i).getNode();
5458     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5459       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5460       unsigned HalfSize = EltSize / 2;
5461       if (isSigned) {
5462         if (!isIntN(HalfSize, C->getSExtValue()))
5463           return false;
5464       } else {
5465         if (!isUIntN(HalfSize, C->getZExtValue()))
5466           return false;
5467       }
5468       continue;
5469     }
5470     return false;
5471   }
5472
5473   return true;
5474 }
5475
5476 /// isSignExtended - Check if a node is a vector value that is sign-extended
5477 /// or a constant BUILD_VECTOR with sign-extended elements.
5478 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
5479   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
5480     return true;
5481   if (isExtendedBUILD_VECTOR(N, DAG, true))
5482     return true;
5483   return false;
5484 }
5485
5486 /// isZeroExtended - Check if a node is a vector value that is zero-extended
5487 /// or a constant BUILD_VECTOR with zero-extended elements.
5488 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
5489   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
5490     return true;
5491   if (isExtendedBUILD_VECTOR(N, DAG, false))
5492     return true;
5493   return false;
5494 }
5495
5496 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
5497   if (OrigVT.getSizeInBits() >= 64)
5498     return OrigVT;
5499
5500   assert(OrigVT.isSimple() && "Expecting a simple value type");
5501
5502   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
5503   switch (OrigSimpleTy) {
5504   default: llvm_unreachable("Unexpected Vector Type");
5505   case MVT::v2i8:
5506   case MVT::v2i16:
5507      return MVT::v2i32;
5508   case MVT::v4i8:
5509     return  MVT::v4i16;
5510   }
5511 }
5512
5513 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
5514 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
5515 /// We insert the required extension here to get the vector to fill a D register.
5516 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
5517                                             const EVT &OrigTy,
5518                                             const EVT &ExtTy,
5519                                             unsigned ExtOpcode) {
5520   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
5521   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
5522   // 64-bits we need to insert a new extension so that it will be 64-bits.
5523   assert(ExtTy.is128BitVector() && "Unexpected extension size");
5524   if (OrigTy.getSizeInBits() >= 64)
5525     return N;
5526
5527   // Must extend size to at least 64 bits to be used as an operand for VMULL.
5528   EVT NewVT = getExtensionTo64Bits(OrigTy);
5529
5530   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
5531 }
5532
5533 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
5534 /// does not do any sign/zero extension. If the original vector is less
5535 /// than 64 bits, an appropriate extension will be added after the load to
5536 /// reach a total size of 64 bits. We have to add the extension separately
5537 /// because ARM does not have a sign/zero extending load for vectors.
5538 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
5539   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
5540
5541   // The load already has the right type.
5542   if (ExtendedTy == LD->getMemoryVT())
5543     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
5544                 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
5545                 LD->isNonTemporal(), LD->isInvariant(),
5546                 LD->getAlignment());
5547
5548   // We need to create a zextload/sextload. We cannot just create a load
5549   // followed by a zext/zext node because LowerMUL is also run during normal
5550   // operation legalization where we can't create illegal types.
5551   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
5552                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
5553                         LD->getMemoryVT(), LD->isVolatile(),
5554                         LD->isNonTemporal(), LD->getAlignment());
5555 }
5556
5557 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
5558 /// extending load, or BUILD_VECTOR with extended elements, return the
5559 /// unextended value. The unextended vector should be 64 bits so that it can
5560 /// be used as an operand to a VMULL instruction. If the original vector size
5561 /// before extension is less than 64 bits we add a an extension to resize
5562 /// the vector to 64 bits.
5563 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
5564   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
5565     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
5566                                         N->getOperand(0)->getValueType(0),
5567                                         N->getValueType(0),
5568                                         N->getOpcode());
5569
5570   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
5571     return SkipLoadExtensionForVMULL(LD, DAG);
5572
5573   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
5574   // have been legalized as a BITCAST from v4i32.
5575   if (N->getOpcode() == ISD::BITCAST) {
5576     SDNode *BVN = N->getOperand(0).getNode();
5577     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
5578            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
5579     unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
5580     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
5581                        BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
5582   }
5583   // Construct a new BUILD_VECTOR with elements truncated to half the size.
5584   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
5585   EVT VT = N->getValueType(0);
5586   unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
5587   unsigned NumElts = VT.getVectorNumElements();
5588   MVT TruncVT = MVT::getIntegerVT(EltSize);
5589   SmallVector<SDValue, 8> Ops;
5590   for (unsigned i = 0; i != NumElts; ++i) {
5591     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
5592     const APInt &CInt = C->getAPIntValue();
5593     // Element types smaller than 32 bits are not legal, so use i32 elements.
5594     // The values are implicitly truncated so sext vs. zext doesn't matter.
5595     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
5596   }
5597   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
5598                      MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
5599 }
5600
5601 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
5602   unsigned Opcode = N->getOpcode();
5603   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5604     SDNode *N0 = N->getOperand(0).getNode();
5605     SDNode *N1 = N->getOperand(1).getNode();
5606     return N0->hasOneUse() && N1->hasOneUse() &&
5607       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5608   }
5609   return false;
5610 }
5611
5612 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
5613   unsigned Opcode = N->getOpcode();
5614   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5615     SDNode *N0 = N->getOperand(0).getNode();
5616     SDNode *N1 = N->getOperand(1).getNode();
5617     return N0->hasOneUse() && N1->hasOneUse() &&
5618       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5619   }
5620   return false;
5621 }
5622
5623 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
5624   // Multiplications are only custom-lowered for 128-bit vectors so that
5625   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
5626   EVT VT = Op.getValueType();
5627   assert(VT.is128BitVector() && VT.isInteger() &&
5628          "unexpected type for custom-lowering ISD::MUL");
5629   SDNode *N0 = Op.getOperand(0).getNode();
5630   SDNode *N1 = Op.getOperand(1).getNode();
5631   unsigned NewOpc = 0;
5632   bool isMLA = false;
5633   bool isN0SExt = isSignExtended(N0, DAG);
5634   bool isN1SExt = isSignExtended(N1, DAG);
5635   if (isN0SExt && isN1SExt)
5636     NewOpc = ARMISD::VMULLs;
5637   else {
5638     bool isN0ZExt = isZeroExtended(N0, DAG);
5639     bool isN1ZExt = isZeroExtended(N1, DAG);
5640     if (isN0ZExt && isN1ZExt)
5641       NewOpc = ARMISD::VMULLu;
5642     else if (isN1SExt || isN1ZExt) {
5643       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5644       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5645       if (isN1SExt && isAddSubSExt(N0, DAG)) {
5646         NewOpc = ARMISD::VMULLs;
5647         isMLA = true;
5648       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
5649         NewOpc = ARMISD::VMULLu;
5650         isMLA = true;
5651       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
5652         std::swap(N0, N1);
5653         NewOpc = ARMISD::VMULLu;
5654         isMLA = true;
5655       }
5656     }
5657
5658     if (!NewOpc) {
5659       if (VT == MVT::v2i64)
5660         // Fall through to expand this.  It is not legal.
5661         return SDValue();
5662       else
5663         // Other vector multiplications are legal.
5664         return Op;
5665     }
5666   }
5667
5668   // Legalize to a VMULL instruction.
5669   SDLoc DL(Op);
5670   SDValue Op0;
5671   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
5672   if (!isMLA) {
5673     Op0 = SkipExtensionForVMULL(N0, DAG);
5674     assert(Op0.getValueType().is64BitVector() &&
5675            Op1.getValueType().is64BitVector() &&
5676            "unexpected types for extended operands to VMULL");
5677     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
5678   }
5679
5680   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
5681   // isel lowering to take advantage of no-stall back to back vmul + vmla.
5682   //   vmull q0, d4, d6
5683   //   vmlal q0, d5, d6
5684   // is faster than
5685   //   vaddl q0, d4, d5
5686   //   vmovl q1, d6
5687   //   vmul  q0, q0, q1
5688   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
5689   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
5690   EVT Op1VT = Op1.getValueType();
5691   return DAG.getNode(N0->getOpcode(), DL, VT,
5692                      DAG.getNode(NewOpc, DL, VT,
5693                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5694                      DAG.getNode(NewOpc, DL, VT,
5695                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
5696 }
5697
5698 static SDValue
5699 LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
5700   // Convert to float
5701   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
5702   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
5703   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
5704   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
5705   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
5706   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
5707   // Get reciprocal estimate.
5708   // float4 recip = vrecpeq_f32(yf);
5709   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5710                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
5711   // Because char has a smaller range than uchar, we can actually get away
5712   // without any newton steps.  This requires that we use a weird bias
5713   // of 0xb000, however (again, this has been exhaustively tested).
5714   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
5715   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
5716   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
5717   Y = DAG.getConstant(0xb000, MVT::i32);
5718   Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
5719   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
5720   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
5721   // Convert back to short.
5722   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
5723   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
5724   return X;
5725 }
5726
5727 static SDValue
5728 LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
5729   SDValue N2;
5730   // Convert to float.
5731   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
5732   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
5733   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
5734   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
5735   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
5736   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
5737
5738   // Use reciprocal estimate and one refinement step.
5739   // float4 recip = vrecpeq_f32(yf);
5740   // recip *= vrecpsq_f32(yf, recip);
5741   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5742                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
5743   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5744                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5745                    N1, N2);
5746   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5747   // Because short has a smaller range than ushort, we can actually get away
5748   // with only a single newton step.  This requires that we use a weird bias
5749   // of 89, however (again, this has been exhaustively tested).
5750   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
5751   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
5752   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
5753   N1 = DAG.getConstant(0x89, MVT::i32);
5754   N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
5755   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
5756   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
5757   // Convert back to integer and return.
5758   // return vmovn_s32(vcvt_s32_f32(result));
5759   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
5760   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
5761   return N0;
5762 }
5763
5764 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
5765   EVT VT = Op.getValueType();
5766   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
5767          "unexpected type for custom-lowering ISD::SDIV");
5768
5769   SDLoc dl(Op);
5770   SDValue N0 = Op.getOperand(0);
5771   SDValue N1 = Op.getOperand(1);
5772   SDValue N2, N3;
5773
5774   if (VT == MVT::v8i8) {
5775     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
5776     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
5777
5778     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5779                      DAG.getIntPtrConstant(4));
5780     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5781                      DAG.getIntPtrConstant(4));
5782     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5783                      DAG.getIntPtrConstant(0));
5784     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5785                      DAG.getIntPtrConstant(0));
5786
5787     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
5788     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
5789
5790     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
5791     N0 = LowerCONCAT_VECTORS(N0, DAG);
5792
5793     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
5794     return N0;
5795   }
5796   return LowerSDIV_v4i16(N0, N1, dl, DAG);
5797 }
5798
5799 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
5800   EVT VT = Op.getValueType();
5801   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
5802          "unexpected type for custom-lowering ISD::UDIV");
5803
5804   SDLoc dl(Op);
5805   SDValue N0 = Op.getOperand(0);
5806   SDValue N1 = Op.getOperand(1);
5807   SDValue N2, N3;
5808
5809   if (VT == MVT::v8i8) {
5810     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
5811     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
5812
5813     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5814                      DAG.getIntPtrConstant(4));
5815     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5816                      DAG.getIntPtrConstant(4));
5817     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5818                      DAG.getIntPtrConstant(0));
5819     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5820                      DAG.getIntPtrConstant(0));
5821
5822     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
5823     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
5824
5825     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
5826     N0 = LowerCONCAT_VECTORS(N0, DAG);
5827
5828     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
5829                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
5830                      N0);
5831     return N0;
5832   }
5833
5834   // v4i16 sdiv ... Convert to float.
5835   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
5836   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
5837   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
5838   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
5839   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
5840   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
5841
5842   // Use reciprocal estimate and two refinement steps.
5843   // float4 recip = vrecpeq_f32(yf);
5844   // recip *= vrecpsq_f32(yf, recip);
5845   // recip *= vrecpsq_f32(yf, recip);
5846   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5847                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
5848   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5849                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5850                    BN1, N2);
5851   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5852   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5853                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5854                    BN1, N2);
5855   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5856   // Simply multiplying by the reciprocal estimate can leave us a few ulps
5857   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
5858   // and that it will never cause us to return an answer too large).
5859   // float4 result = as_float4(as_int4(xf*recip) + 2);
5860   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
5861   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
5862   N1 = DAG.getConstant(2, MVT::i32);
5863   N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
5864   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
5865   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
5866   // Convert back to integer and return.
5867   // return vmovn_u32(vcvt_s32_f32(result));
5868   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
5869   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
5870   return N0;
5871 }
5872
5873 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
5874   EVT VT = Op.getNode()->getValueType(0);
5875   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
5876
5877   unsigned Opc;
5878   bool ExtraOp = false;
5879   switch (Op.getOpcode()) {
5880   default: llvm_unreachable("Invalid code");
5881   case ISD::ADDC: Opc = ARMISD::ADDC; break;
5882   case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
5883   case ISD::SUBC: Opc = ARMISD::SUBC; break;
5884   case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
5885   }
5886
5887   if (!ExtraOp)
5888     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
5889                        Op.getOperand(1));
5890   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
5891                      Op.getOperand(1), Op.getOperand(2));
5892 }
5893
5894 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
5895   // Monotonic load/store is legal for all targets
5896   if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
5897     return Op;
5898
5899   // Aquire/Release load/store is not legal for targets without a
5900   // dmb or equivalent available.
5901   return SDValue();
5902 }
5903
5904 static void
5905 ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
5906                     SelectionDAG &DAG, unsigned NewOp) {
5907   SDLoc dl(Node);
5908   assert (Node->getValueType(0) == MVT::i64 &&
5909           "Only know how to expand i64 atomics");
5910
5911   SmallVector<SDValue, 6> Ops;
5912   Ops.push_back(Node->getOperand(0)); // Chain
5913   Ops.push_back(Node->getOperand(1)); // Ptr
5914   // Low part of Val1
5915   Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5916                             Node->getOperand(2), DAG.getIntPtrConstant(0)));
5917   // High part of Val1
5918   Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5919                             Node->getOperand(2), DAG.getIntPtrConstant(1)));
5920   if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
5921     // High part of Val1
5922     Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5923                               Node->getOperand(3), DAG.getIntPtrConstant(0)));
5924     // High part of Val2
5925     Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5926                               Node->getOperand(3), DAG.getIntPtrConstant(1)));
5927   }
5928   SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
5929   SDValue Result =
5930     DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
5931                             cast<MemSDNode>(Node)->getMemOperand());
5932   SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
5933   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
5934   Results.push_back(Result.getValue(2));
5935 }
5936
5937 static void ReplaceREADCYCLECOUNTER(SDNode *N,
5938                                     SmallVectorImpl<SDValue> &Results,
5939                                     SelectionDAG &DAG,
5940                                     const ARMSubtarget *Subtarget) {
5941   SDLoc DL(N);
5942   SDValue Cycles32, OutChain;
5943
5944   if (Subtarget->hasPerfMon()) {
5945     // Under Power Management extensions, the cycle-count is:
5946     //    mrc p15, #0, <Rt>, c9, c13, #0
5947     SDValue Ops[] = { N->getOperand(0), // Chain
5948                       DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
5949                       DAG.getConstant(15, MVT::i32),
5950                       DAG.getConstant(0, MVT::i32),
5951                       DAG.getConstant(9, MVT::i32),
5952                       DAG.getConstant(13, MVT::i32),
5953                       DAG.getConstant(0, MVT::i32)
5954     };
5955
5956     Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
5957                            DAG.getVTList(MVT::i32, MVT::Other), &Ops[0],
5958                            array_lengthof(Ops));
5959     OutChain = Cycles32.getValue(1);
5960   } else {
5961     // Intrinsic is defined to return 0 on unsupported platforms. Technically
5962     // there are older ARM CPUs that have implementation-specific ways of
5963     // obtaining this information (FIXME!).
5964     Cycles32 = DAG.getConstant(0, MVT::i32);
5965     OutChain = DAG.getEntryNode();
5966   }
5967
5968
5969   SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
5970                                  Cycles32, DAG.getConstant(0, MVT::i32));
5971   Results.push_back(Cycles64);
5972   Results.push_back(OutChain);
5973 }
5974
5975 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5976   switch (Op.getOpcode()) {
5977   default: llvm_unreachable("Don't know how to custom lower this!");
5978   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
5979   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
5980   case ISD::GlobalAddress:
5981     return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
5982       LowerGlobalAddressELF(Op, DAG);
5983   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
5984   case ISD::SELECT:        return LowerSELECT(Op, DAG);
5985   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
5986   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
5987   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
5988   case ISD::VASTART:       return LowerVASTART(Op, DAG);
5989   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
5990   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
5991   case ISD::SINT_TO_FP:
5992   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
5993   case ISD::FP_TO_SINT:
5994   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
5995   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
5996   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
5997   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
5998   case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
5999   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
6000   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
6001   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
6002                                                                Subtarget);
6003   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
6004   case ISD::SHL:
6005   case ISD::SRL:
6006   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
6007   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
6008   case ISD::SRL_PARTS:
6009   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
6010   case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
6011   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
6012   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
6013   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
6014   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
6015   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
6016   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
6017   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6018   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
6019   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
6020   case ISD::MUL:           return LowerMUL(Op, DAG);
6021   case ISD::SDIV:          return LowerSDIV(Op, DAG);
6022   case ISD::UDIV:          return LowerUDIV(Op, DAG);
6023   case ISD::ADDC:
6024   case ISD::ADDE:
6025   case ISD::SUBC:
6026   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
6027   case ISD::ATOMIC_LOAD:
6028   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
6029   case ISD::SDIVREM:
6030   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
6031   }
6032 }
6033
6034 /// ReplaceNodeResults - Replace the results of node with an illegal result
6035 /// type with new values built out of custom code.
6036 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
6037                                            SmallVectorImpl<SDValue>&Results,
6038                                            SelectionDAG &DAG) const {
6039   SDValue Res;
6040   switch (N->getOpcode()) {
6041   default:
6042     llvm_unreachable("Don't know how to custom expand this!");
6043   case ISD::BITCAST:
6044     Res = ExpandBITCAST(N, DAG);
6045     break;
6046   case ISD::SIGN_EXTEND:
6047   case ISD::ZERO_EXTEND:
6048     Res = ExpandVectorExtension(N, DAG);
6049     break;
6050   case ISD::SRL:
6051   case ISD::SRA:
6052     Res = Expand64BitShift(N, DAG, Subtarget);
6053     break;
6054   case ISD::READCYCLECOUNTER:
6055     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
6056     return;
6057   case ISD::ATOMIC_LOAD_ADD:
6058     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
6059     return;
6060   case ISD::ATOMIC_LOAD_AND:
6061     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
6062     return;
6063   case ISD::ATOMIC_LOAD_NAND:
6064     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
6065     return;
6066   case ISD::ATOMIC_LOAD_OR:
6067     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
6068     return;
6069   case ISD::ATOMIC_LOAD_SUB:
6070     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
6071     return;
6072   case ISD::ATOMIC_LOAD_XOR:
6073     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
6074     return;
6075   case ISD::ATOMIC_SWAP:
6076     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
6077     return;
6078   case ISD::ATOMIC_CMP_SWAP:
6079     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
6080     return;
6081   case ISD::ATOMIC_LOAD_MIN:
6082     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
6083     return;
6084   case ISD::ATOMIC_LOAD_UMIN:
6085     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
6086     return;
6087   case ISD::ATOMIC_LOAD_MAX:
6088     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
6089     return;
6090   case ISD::ATOMIC_LOAD_UMAX:
6091     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
6092     return;
6093   }
6094   if (Res.getNode())
6095     Results.push_back(Res);
6096 }
6097
6098 //===----------------------------------------------------------------------===//
6099 //                           ARM Scheduler Hooks
6100 //===----------------------------------------------------------------------===//
6101
6102 MachineBasicBlock *
6103 ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
6104                                      MachineBasicBlock *BB,
6105                                      unsigned Size) const {
6106   unsigned dest    = MI->getOperand(0).getReg();
6107   unsigned ptr     = MI->getOperand(1).getReg();
6108   unsigned oldval  = MI->getOperand(2).getReg();
6109   unsigned newval  = MI->getOperand(3).getReg();
6110   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6111   DebugLoc dl = MI->getDebugLoc();
6112   bool isThumb2 = Subtarget->isThumb2();
6113
6114   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6115   unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
6116     (const TargetRegisterClass*)&ARM::rGPRRegClass :
6117     (const TargetRegisterClass*)&ARM::GPRRegClass);
6118
6119   if (isThumb2) {
6120     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
6121     MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
6122     MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
6123   }
6124
6125   unsigned ldrOpc, strOpc;
6126   switch (Size) {
6127   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
6128   case 1:
6129     ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
6130     strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
6131     break;
6132   case 2:
6133     ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
6134     strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
6135     break;
6136   case 4:
6137     ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
6138     strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
6139     break;
6140   }
6141
6142   MachineFunction *MF = BB->getParent();
6143   const BasicBlock *LLVM_BB = BB->getBasicBlock();
6144   MachineFunction::iterator It = BB;
6145   ++It; // insert the new blocks after the current block
6146
6147   MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
6148   MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
6149   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6150   MF->insert(It, loop1MBB);
6151   MF->insert(It, loop2MBB);
6152   MF->insert(It, exitMBB);
6153
6154   // Transfer the remainder of BB and its successor edges to exitMBB.
6155   exitMBB->splice(exitMBB->begin(), BB,
6156                   llvm::next(MachineBasicBlock::iterator(MI)),
6157                   BB->end());
6158   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6159
6160   //  thisMBB:
6161   //   ...
6162   //   fallthrough --> loop1MBB
6163   BB->addSuccessor(loop1MBB);
6164
6165   // loop1MBB:
6166   //   ldrex dest, [ptr]
6167   //   cmp dest, oldval
6168   //   bne exitMBB
6169   BB = loop1MBB;
6170   MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
6171   if (ldrOpc == ARM::t2LDREX)
6172     MIB.addImm(0);
6173   AddDefaultPred(MIB);
6174   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
6175                  .addReg(dest).addReg(oldval));
6176   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6177     .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6178   BB->addSuccessor(loop2MBB);
6179   BB->addSuccessor(exitMBB);
6180
6181   // loop2MBB:
6182   //   strex scratch, newval, [ptr]
6183   //   cmp scratch, #0
6184   //   bne loop1MBB
6185   BB = loop2MBB;
6186   MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
6187   if (strOpc == ARM::t2STREX)
6188     MIB.addImm(0);
6189   AddDefaultPred(MIB);
6190   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6191                  .addReg(scratch).addImm(0));
6192   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6193     .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6194   BB->addSuccessor(loop1MBB);
6195   BB->addSuccessor(exitMBB);
6196
6197   //  exitMBB:
6198   //   ...
6199   BB = exitMBB;
6200
6201   MI->eraseFromParent();   // The instruction is gone now.
6202
6203   return BB;
6204 }
6205
6206 MachineBasicBlock *
6207 ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
6208                                     unsigned Size, unsigned BinOpcode) const {
6209   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
6210   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6211
6212   const BasicBlock *LLVM_BB = BB->getBasicBlock();
6213   MachineFunction *MF = BB->getParent();
6214   MachineFunction::iterator It = BB;
6215   ++It;
6216
6217   unsigned dest = MI->getOperand(0).getReg();
6218   unsigned ptr = MI->getOperand(1).getReg();
6219   unsigned incr = MI->getOperand(2).getReg();
6220   DebugLoc dl = MI->getDebugLoc();
6221   bool isThumb2 = Subtarget->isThumb2();
6222
6223   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6224   if (isThumb2) {
6225     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
6226     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
6227   }
6228
6229   unsigned ldrOpc, strOpc;
6230   switch (Size) {
6231   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
6232   case 1:
6233     ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
6234     strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
6235     break;
6236   case 2:
6237     ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
6238     strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
6239     break;
6240   case 4:
6241     ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
6242     strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
6243     break;
6244   }
6245
6246   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6247   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6248   MF->insert(It, loopMBB);
6249   MF->insert(It, exitMBB);
6250
6251   // Transfer the remainder of BB and its successor edges to exitMBB.
6252   exitMBB->splice(exitMBB->begin(), BB,
6253                   llvm::next(MachineBasicBlock::iterator(MI)),
6254                   BB->end());
6255   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6256
6257   const TargetRegisterClass *TRC = isThumb2 ?
6258     (const TargetRegisterClass*)&ARM::rGPRRegClass :
6259     (const TargetRegisterClass*)&ARM::GPRRegClass;
6260   unsigned scratch = MRI.createVirtualRegister(TRC);
6261   unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
6262
6263   //  thisMBB:
6264   //   ...
6265   //   fallthrough --> loopMBB
6266   BB->addSuccessor(loopMBB);
6267
6268   //  loopMBB:
6269   //   ldrex dest, ptr
6270   //   <binop> scratch2, dest, incr
6271   //   strex scratch, scratch2, ptr
6272   //   cmp scratch, #0
6273   //   bne- loopMBB
6274   //   fallthrough --> exitMBB
6275   BB = loopMBB;
6276   MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
6277   if (ldrOpc == ARM::t2LDREX)
6278     MIB.addImm(0);
6279   AddDefaultPred(MIB);
6280   if (BinOpcode) {
6281     // operand order needs to go the other way for NAND
6282     if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
6283       AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
6284                      addReg(incr).addReg(dest)).addReg(0);
6285     else
6286       AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
6287                      addReg(dest).addReg(incr)).addReg(0);
6288   }
6289
6290   MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
6291   if (strOpc == ARM::t2STREX)
6292     MIB.addImm(0);
6293   AddDefaultPred(MIB);
6294   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6295                  .addReg(scratch).addImm(0));
6296   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6297     .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6298
6299   BB->addSuccessor(loopMBB);
6300   BB->addSuccessor(exitMBB);
6301
6302   //  exitMBB:
6303   //   ...
6304   BB = exitMBB;
6305
6306   MI->eraseFromParent();   // The instruction is gone now.
6307
6308   return BB;
6309 }
6310
6311 MachineBasicBlock *
6312 ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
6313                                           MachineBasicBlock *BB,
6314                                           unsigned Size,
6315                                           bool signExtend,
6316                                           ARMCC::CondCodes Cond) const {
6317   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6318
6319   const BasicBlock *LLVM_BB = BB->getBasicBlock();
6320   MachineFunction *MF = BB->getParent();
6321   MachineFunction::iterator It = BB;
6322   ++It;
6323
6324   unsigned dest = MI->getOperand(0).getReg();
6325   unsigned ptr = MI->getOperand(1).getReg();
6326   unsigned incr = MI->getOperand(2).getReg();
6327   unsigned oldval = dest;
6328   DebugLoc dl = MI->getDebugLoc();
6329   bool isThumb2 = Subtarget->isThumb2();
6330
6331   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6332   if (isThumb2) {
6333     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
6334     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
6335   }
6336
6337   unsigned ldrOpc, strOpc, extendOpc;
6338   switch (Size) {
6339   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
6340   case 1:
6341     ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
6342     strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
6343     extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
6344     break;
6345   case 2:
6346     ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
6347     strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
6348     extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
6349     break;
6350   case 4:
6351     ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
6352     strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
6353     extendOpc = 0;
6354     break;
6355   }
6356
6357   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6358   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6359   MF->insert(It, loopMBB);
6360   MF->insert(It, exitMBB);
6361
6362   // Transfer the remainder of BB and its successor edges to exitMBB.
6363   exitMBB->splice(exitMBB->begin(), BB,
6364                   llvm::next(MachineBasicBlock::iterator(MI)),
6365                   BB->end());
6366   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6367
6368   const TargetRegisterClass *TRC = isThumb2 ?
6369     (const TargetRegisterClass*)&ARM::rGPRRegClass :
6370     (const TargetRegisterClass*)&ARM::GPRRegClass;
6371   unsigned scratch = MRI.createVirtualRegister(TRC);
6372   unsigned scratch2 = MRI.createVirtualRegister(TRC);
6373
6374   //  thisMBB:
6375   //   ...
6376   //   fallthrough --> loopMBB
6377   BB->addSuccessor(loopMBB);
6378
6379   //  loopMBB:
6380   //   ldrex dest, ptr
6381   //   (sign extend dest, if required)
6382   //   cmp dest, incr
6383   //   cmov.cond scratch2, incr, dest
6384   //   strex scratch, scratch2, ptr
6385   //   cmp scratch, #0
6386   //   bne- loopMBB
6387   //   fallthrough --> exitMBB
6388   BB = loopMBB;
6389   MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
6390   if (ldrOpc == ARM::t2LDREX)
6391     MIB.addImm(0);
6392   AddDefaultPred(MIB);
6393
6394   // Sign extend the value, if necessary.
6395   if (signExtend && extendOpc) {
6396     oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
6397     AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
6398                      .addReg(dest)
6399                      .addImm(0));
6400   }
6401
6402   // Build compare and cmov instructions.
6403   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
6404                  .addReg(oldval).addReg(incr));
6405   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
6406          .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);
6407
6408   MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
6409   if (strOpc == ARM::t2STREX)
6410     MIB.addImm(0);
6411   AddDefaultPred(MIB);
6412   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6413                  .addReg(scratch).addImm(0));
6414   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6415     .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6416
6417   BB->addSuccessor(loopMBB);
6418   BB->addSuccessor(exitMBB);
6419
6420   //  exitMBB:
6421   //   ...
6422   BB = exitMBB;
6423
6424   MI->eraseFromParent();   // The instruction is gone now.
6425
6426   return BB;
6427 }
6428
6429 MachineBasicBlock *
6430 ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
6431                                       unsigned Op1, unsigned Op2,
6432                                       bool NeedsCarry, bool IsCmpxchg,
6433                                       bool IsMinMax, ARMCC::CondCodes CC) const {
6434   // This also handles ATOMIC_SWAP, indicated by Op1==0.
6435   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6436
6437   const BasicBlock *LLVM_BB = BB->getBasicBlock();
6438   MachineFunction *MF = BB->getParent();
6439   MachineFunction::iterator It = BB;
6440   ++It;
6441
6442   unsigned destlo = MI->getOperand(0).getReg();
6443   unsigned desthi = MI->getOperand(1).getReg();
6444   unsigned ptr = MI->getOperand(2).getReg();
6445   unsigned vallo = MI->getOperand(3).getReg();
6446   unsigned valhi = MI->getOperand(4).getReg();
6447   DebugLoc dl = MI->getDebugLoc();
6448   bool isThumb2 = Subtarget->isThumb2();
6449
6450   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6451   if (isThumb2) {
6452     MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
6453     MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
6454     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
6455     MRI.constrainRegClass(vallo, &ARM::rGPRRegClass);
6456     MRI.constrainRegClass(valhi, &ARM::rGPRRegClass);
6457   }
6458
6459   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6460   MachineBasicBlock *contBB = 0, *cont2BB = 0;
6461   if (IsCmpxchg || IsMinMax)
6462     contBB = MF->CreateMachineBasicBlock(LLVM_BB);
6463   if (IsCmpxchg)
6464     cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
6465   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6466
6467   MF->insert(It, loopMBB);
6468   if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
6469   if (IsCmpxchg) MF->insert(It, cont2BB);
6470   MF->insert(It, exitMBB);
6471
6472   // Transfer the remainder of BB and its successor edges to exitMBB.
6473   exitMBB->splice(exitMBB->begin(), BB,
6474                   llvm::next(MachineBasicBlock::iterator(MI)),
6475                   BB->end());
6476   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6477
6478   const TargetRegisterClass *TRC = isThumb2 ?
6479     (const TargetRegisterClass*)&ARM::tGPRRegClass :
6480     (const TargetRegisterClass*)&ARM::GPRRegClass;
6481   unsigned storesuccess = MRI.createVirtualRegister(TRC);
6482
6483   //  thisMBB:
6484   //   ...
6485   //   fallthrough --> loopMBB
6486   BB->addSuccessor(loopMBB);
6487
6488   //  loopMBB:
6489   //   ldrexd r2, r3, ptr
6490   //   <binopa> r0, r2, incr
6491   //   <binopb> r1, r3, incr
6492   //   strexd storesuccess, r0, r1, ptr
6493   //   cmp storesuccess, #0
6494   //   bne- loopMBB
6495   //   fallthrough --> exitMBB
6496   BB = loopMBB;
6497
6498   // Load
6499   if (isThumb2) {
6500     AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
6501                    .addReg(destlo, RegState::Define)
6502                    .addReg(desthi, RegState::Define)
6503                    .addReg(ptr));
6504   } else {
6505     unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6506     AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
6507                    .addReg(GPRPair0, RegState::Define).addReg(ptr));
6508     // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
6509     BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
6510       .addReg(GPRPair0, 0, ARM::gsub_0);
6511     BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
6512       .addReg(GPRPair0, 0, ARM::gsub_1);
6513   }
6514
6515   unsigned StoreLo, StoreHi;
6516   if (IsCmpxchg) {
6517     // Add early exit
6518     for (unsigned i = 0; i < 2; i++) {
6519       AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
6520                                                          ARM::CMPrr))
6521                      .addReg(i == 0 ? destlo : desthi)
6522                      .addReg(i == 0 ? vallo : valhi));
6523       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6524         .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6525       BB->addSuccessor(exitMBB);
6526       BB->addSuccessor(i == 0 ? contBB : cont2BB);
6527       BB = (i == 0 ? contBB : cont2BB);
6528     }
6529
6530     // Copy to physregs for strexd
6531     StoreLo = MI->getOperand(5).getReg();
6532     StoreHi = MI->getOperand(6).getReg();
6533   } else if (Op1) {
6534     // Perform binary operation
6535     unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
6536     AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
6537                    .addReg(destlo).addReg(vallo))
6538         .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
6539     unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
6540     AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
6541                    .addReg(desthi).addReg(valhi))
6542         .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
6543
6544     StoreLo = tmpRegLo;
6545     StoreHi = tmpRegHi;
6546   } else {
6547     // Copy to physregs for strexd
6548     StoreLo = vallo;
6549     StoreHi = valhi;
6550   }
6551   if (IsMinMax) {
6552     // Compare and branch to exit block.
6553     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6554       .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
6555     BB->addSuccessor(exitMBB);
6556     BB->addSuccessor(contBB);
6557     BB = contBB;
6558     StoreLo = vallo;
6559     StoreHi = valhi;
6560   }
6561
6562   // Store
6563   if (isThumb2) {
6564     MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass);
6565     MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass);
6566     AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
6567                    .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
6568   } else {
6569     // Marshal a pair...
6570     unsigned StorePair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6571     unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6572     unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6573     BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
6574     BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
6575       .addReg(UndefPair)
6576       .addReg(StoreLo)
6577       .addImm(ARM::gsub_0);
6578     BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), StorePair)
6579       .addReg(r1)
6580       .addReg(StoreHi)
6581       .addImm(ARM::gsub_1);
6582
6583     // ...and store it
6584     AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
6585                    .addReg(StorePair).addReg(ptr));
6586   }
6587   // Cmp+jump
6588   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6589                  .addReg(storesuccess).addImm(0));
6590   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6591     .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6592
6593   BB->addSuccessor(loopMBB);
6594   BB->addSuccessor(exitMBB);
6595
6596   //  exitMBB:
6597   //   ...
6598   BB = exitMBB;
6599
6600   MI->eraseFromParent();   // The instruction is gone now.
6601
6602   return BB;
6603 }
6604
6605 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
6606 /// registers the function context.
6607 void ARMTargetLowering::
6608 SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
6609                        MachineBasicBlock *DispatchBB, int FI) const {
6610   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6611   DebugLoc dl = MI->getDebugLoc();
6612   MachineFunction *MF = MBB->getParent();
6613   MachineRegisterInfo *MRI = &MF->getRegInfo();
6614   MachineConstantPool *MCP = MF->getConstantPool();
6615   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
6616   const Function *F = MF->getFunction();
6617
6618   bool isThumb = Subtarget->isThumb();
6619   bool isThumb2 = Subtarget->isThumb2();
6620
6621   unsigned PCLabelId = AFI->createPICLabelUId();
6622   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
6623   ARMConstantPoolValue *CPV =
6624     ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
6625   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
6626
6627   const TargetRegisterClass *TRC = isThumb ?
6628     (const TargetRegisterClass*)&ARM::tGPRRegClass :
6629     (const TargetRegisterClass*)&ARM::GPRRegClass;
6630
6631   // Grab constant pool and fixed stack memory operands.
6632   MachineMemOperand *CPMMO =
6633     MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
6634                              MachineMemOperand::MOLoad, 4, 4);
6635
6636   MachineMemOperand *FIMMOSt =
6637     MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
6638                              MachineMemOperand::MOStore, 4, 4);
6639
6640   // Load the address of the dispatch MBB into the jump buffer.
6641   if (isThumb2) {
6642     // Incoming value: jbuf
6643     //   ldr.n  r5, LCPI1_1
6644     //   orr    r5, r5, #1
6645     //   add    r5, pc
6646     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
6647     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6648     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
6649                    .addConstantPoolIndex(CPI)
6650                    .addMemOperand(CPMMO));
6651     // Set the low bit because of thumb mode.
6652     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6653     AddDefaultCC(
6654       AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
6655                      .addReg(NewVReg1, RegState::Kill)
6656                      .addImm(0x01)));
6657     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6658     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
6659       .addReg(NewVReg2, RegState::Kill)
6660       .addImm(PCLabelId);
6661     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
6662                    .addReg(NewVReg3, RegState::Kill)
6663                    .addFrameIndex(FI)
6664                    .addImm(36)  // &jbuf[1] :: pc
6665                    .addMemOperand(FIMMOSt));
6666   } else if (isThumb) {
6667     // Incoming value: jbuf
6668     //   ldr.n  r1, LCPI1_4
6669     //   add    r1, pc
6670     //   mov    r2, #1
6671     //   orrs   r1, r2
6672     //   add    r2, $jbuf, #+4 ; &jbuf[1]
6673     //   str    r1, [r2]
6674     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6675     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
6676                    .addConstantPoolIndex(CPI)
6677                    .addMemOperand(CPMMO));
6678     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6679     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
6680       .addReg(NewVReg1, RegState::Kill)
6681       .addImm(PCLabelId);
6682     // Set the low bit because of thumb mode.
6683     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6684     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
6685                    .addReg(ARM::CPSR, RegState::Define)
6686                    .addImm(1));
6687     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6688     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
6689                    .addReg(ARM::CPSR, RegState::Define)
6690                    .addReg(NewVReg2, RegState::Kill)
6691                    .addReg(NewVReg3, RegState::Kill));
6692     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6693     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
6694                    .addFrameIndex(FI)
6695                    .addImm(36)); // &jbuf[1] :: pc
6696     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
6697                    .addReg(NewVReg4, RegState::Kill)
6698                    .addReg(NewVReg5, RegState::Kill)
6699                    .addImm(0)
6700                    .addMemOperand(FIMMOSt));
6701   } else {
6702     // Incoming value: jbuf
6703     //   ldr  r1, LCPI1_1
6704     //   add  r1, pc, r1
6705     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
6706     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6707     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
6708                    .addConstantPoolIndex(CPI)
6709                    .addImm(0)
6710                    .addMemOperand(CPMMO));
6711     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6712     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
6713                    .addReg(NewVReg1, RegState::Kill)
6714                    .addImm(PCLabelId));
6715     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
6716                    .addReg(NewVReg2, RegState::Kill)
6717                    .addFrameIndex(FI)
6718                    .addImm(36)  // &jbuf[1] :: pc
6719                    .addMemOperand(FIMMOSt));
6720   }
6721 }
6722
6723 MachineBasicBlock *ARMTargetLowering::
6724 EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
6725   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6726   DebugLoc dl = MI->getDebugLoc();
6727   MachineFunction *MF = MBB->getParent();
6728   MachineRegisterInfo *MRI = &MF->getRegInfo();
6729   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
6730   MachineFrameInfo *MFI = MF->getFrameInfo();
6731   int FI = MFI->getFunctionContextIndex();
6732
6733   const TargetRegisterClass *TRC = Subtarget->isThumb() ?
6734     (const TargetRegisterClass*)&ARM::tGPRRegClass :
6735     (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
6736
6737   // Get a mapping of the call site numbers to all of the landing pads they're
6738   // associated with.
6739   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
6740   unsigned MaxCSNum = 0;
6741   MachineModuleInfo &MMI = MF->getMMI();
6742   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
6743        ++BB) {
6744     if (!BB->isLandingPad()) continue;
6745
6746     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
6747     // pad.
6748     for (MachineBasicBlock::iterator
6749            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
6750       if (!II->isEHLabel()) continue;
6751
6752       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
6753       if (!MMI.hasCallSiteLandingPad(Sym)) continue;
6754
6755       SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
6756       for (SmallVectorImpl<unsigned>::iterator
6757              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
6758            CSI != CSE; ++CSI) {
6759         CallSiteNumToLPad[*CSI].push_back(BB);
6760         MaxCSNum = std::max(MaxCSNum, *CSI);
6761       }
6762       break;
6763     }
6764   }
6765
6766   // Get an ordered list of the machine basic blocks for the jump table.
6767   std::vector<MachineBasicBlock*> LPadList;
6768   SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
6769   LPadList.reserve(CallSiteNumToLPad.size());
6770   for (unsigned I = 1; I <= MaxCSNum; ++I) {
6771     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
6772     for (SmallVectorImpl<MachineBasicBlock*>::iterator
6773            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
6774       LPadList.push_back(*II);
6775       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
6776     }
6777   }
6778
6779   assert(!LPadList.empty() &&
6780          "No landing pad destinations for the dispatch jump table!");
6781
6782   // Create the jump table and associated information.
6783   MachineJumpTableInfo *JTI =
6784     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
6785   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
6786   unsigned UId = AFI->createJumpTableUId();
6787   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
6788
6789   // Create the MBBs for the dispatch code.
6790
6791   // Shove the dispatch's address into the return slot in the function context.
6792   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
6793   DispatchBB->setIsLandingPad();
6794
6795   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6796   unsigned trap_opcode;
6797   if (Subtarget->isThumb())
6798     trap_opcode = ARM::tTRAP;
6799   else
6800     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
6801
6802   BuildMI(TrapBB, dl, TII->get(trap_opcode));
6803   DispatchBB->addSuccessor(TrapBB);
6804
6805   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
6806   DispatchBB->addSuccessor(DispContBB);
6807
6808   // Insert and MBBs.
6809   MF->insert(MF->end(), DispatchBB);
6810   MF->insert(MF->end(), DispContBB);
6811   MF->insert(MF->end(), TrapBB);
6812
6813   // Insert code into the entry block that creates and registers the function
6814   // context.
6815   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
6816
6817   MachineMemOperand *FIMMOLd =
6818     MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
6819                              MachineMemOperand::MOLoad |
6820                              MachineMemOperand::MOVolatile, 4, 4);
6821
6822   MachineInstrBuilder MIB;
6823   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
6824
6825   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
6826   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
6827
6828   // Add a register mask with no preserved registers.  This results in all
6829   // registers being marked as clobbered.
6830   MIB.addRegMask(RI.getNoPreservedMask());
6831
6832   unsigned NumLPads = LPadList.size();
6833   if (Subtarget->isThumb2()) {
6834     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6835     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
6836                    .addFrameIndex(FI)
6837                    .addImm(4)
6838                    .addMemOperand(FIMMOLd));
6839
6840     if (NumLPads < 256) {
6841       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
6842                      .addReg(NewVReg1)
6843                      .addImm(LPadList.size()));
6844     } else {
6845       unsigned VReg1 = MRI->createVirtualRegister(TRC);
6846       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
6847                      .addImm(NumLPads & 0xFFFF));
6848
6849       unsigned VReg2 = VReg1;
6850       if ((NumLPads & 0xFFFF0000) != 0) {
6851         VReg2 = MRI->createVirtualRegister(TRC);
6852         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
6853                        .addReg(VReg1)
6854                        .addImm(NumLPads >> 16));
6855       }
6856
6857       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
6858                      .addReg(NewVReg1)
6859                      .addReg(VReg2));
6860     }
6861
6862     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
6863       .addMBB(TrapBB)
6864       .addImm(ARMCC::HI)
6865       .addReg(ARM::CPSR);
6866
6867     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6868     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
6869                    .addJumpTableIndex(MJTI)
6870                    .addImm(UId));
6871
6872     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6873     AddDefaultCC(
6874       AddDefaultPred(
6875         BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
6876         .addReg(NewVReg3, RegState::Kill)
6877         .addReg(NewVReg1)
6878         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
6879
6880     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
6881       .addReg(NewVReg4, RegState::Kill)
6882       .addReg(NewVReg1)
6883       .addJumpTableIndex(MJTI)
6884       .addImm(UId);
6885   } else if (Subtarget->isThumb()) {
6886     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6887     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
6888                    .addFrameIndex(FI)
6889                    .addImm(1)
6890                    .addMemOperand(FIMMOLd));
6891
6892     if (NumLPads < 256) {
6893       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
6894                      .addReg(NewVReg1)
6895                      .addImm(NumLPads));
6896     } else {
6897       MachineConstantPool *ConstantPool = MF->getConstantPool();
6898       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6899       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
6900
6901       // MachineConstantPool wants an explicit alignment.
6902       unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
6903       if (Align == 0)
6904         Align = getDataLayout()->getTypeAllocSize(C->getType());
6905       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6906
6907       unsigned VReg1 = MRI->createVirtualRegister(TRC);
6908       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
6909                      .addReg(VReg1, RegState::Define)
6910                      .addConstantPoolIndex(Idx));
6911       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
6912                      .addReg(NewVReg1)
6913                      .addReg(VReg1));
6914     }
6915
6916     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
6917       .addMBB(TrapBB)
6918       .addImm(ARMCC::HI)
6919       .addReg(ARM::CPSR);
6920
6921     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6922     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
6923                    .addReg(ARM::CPSR, RegState::Define)
6924                    .addReg(NewVReg1)
6925                    .addImm(2));
6926
6927     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6928     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
6929                    .addJumpTableIndex(MJTI)
6930                    .addImm(UId));
6931
6932     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6933     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
6934                    .addReg(ARM::CPSR, RegState::Define)
6935                    .addReg(NewVReg2, RegState::Kill)
6936                    .addReg(NewVReg3));
6937
6938     MachineMemOperand *JTMMOLd =
6939       MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
6940                                MachineMemOperand::MOLoad, 4, 4);
6941
6942     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6943     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
6944                    .addReg(NewVReg4, RegState::Kill)
6945                    .addImm(0)
6946                    .addMemOperand(JTMMOLd));
6947
6948     unsigned NewVReg6 = NewVReg5;
6949     if (RelocM == Reloc::PIC_) {
6950       NewVReg6 = MRI->createVirtualRegister(TRC);
6951       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
6952                      .addReg(ARM::CPSR, RegState::Define)
6953                      .addReg(NewVReg5, RegState::Kill)
6954                      .addReg(NewVReg3));
6955     }
6956
6957     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
6958       .addReg(NewVReg6, RegState::Kill)
6959       .addJumpTableIndex(MJTI)
6960       .addImm(UId);
6961   } else {
6962     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6963     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
6964                    .addFrameIndex(FI)
6965                    .addImm(4)
6966                    .addMemOperand(FIMMOLd));
6967
6968     if (NumLPads < 256) {
6969       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
6970                      .addReg(NewVReg1)
6971                      .addImm(NumLPads));
6972     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
6973       unsigned VReg1 = MRI->createVirtualRegister(TRC);
6974       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
6975                      .addImm(NumLPads & 0xFFFF));
6976
6977       unsigned VReg2 = VReg1;
6978       if ((NumLPads & 0xFFFF0000) != 0) {
6979         VReg2 = MRI->createVirtualRegister(TRC);
6980         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
6981                        .addReg(VReg1)
6982                        .addImm(NumLPads >> 16));
6983       }
6984
6985       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
6986                      .addReg(NewVReg1)
6987                      .addReg(VReg2));
6988     } else {
6989       MachineConstantPool *ConstantPool = MF->getConstantPool();
6990       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6991       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
6992
6993       // MachineConstantPool wants an explicit alignment.
6994       unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
6995       if (Align == 0)
6996         Align = getDataLayout()->getTypeAllocSize(C->getType());
6997       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6998
6999       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7000       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
7001                      .addReg(VReg1, RegState::Define)
7002                      .addConstantPoolIndex(Idx)
7003                      .addImm(0));
7004       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
7005                      .addReg(NewVReg1)
7006                      .addReg(VReg1, RegState::Kill));
7007     }
7008
7009     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
7010       .addMBB(TrapBB)
7011       .addImm(ARMCC::HI)
7012       .addReg(ARM::CPSR);
7013
7014     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7015     AddDefaultCC(
7016       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
7017                      .addReg(NewVReg1)
7018                      .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
7019     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7020     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
7021                    .addJumpTableIndex(MJTI)
7022                    .addImm(UId));
7023
7024     MachineMemOperand *JTMMOLd =
7025       MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
7026                                MachineMemOperand::MOLoad, 4, 4);
7027     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7028     AddDefaultPred(
7029       BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
7030       .addReg(NewVReg3, RegState::Kill)
7031       .addReg(NewVReg4)
7032       .addImm(0)
7033       .addMemOperand(JTMMOLd));
7034
7035     if (RelocM == Reloc::PIC_) {
7036       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
7037         .addReg(NewVReg5, RegState::Kill)
7038         .addReg(NewVReg4)
7039         .addJumpTableIndex(MJTI)
7040         .addImm(UId);
7041     } else {
7042       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
7043         .addReg(NewVReg5, RegState::Kill)
7044         .addJumpTableIndex(MJTI)
7045         .addImm(UId);
7046     }
7047   }
7048
7049   // Add the jump table entries as successors to the MBB.
7050   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
7051   for (std::vector<MachineBasicBlock*>::iterator
7052          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
7053     MachineBasicBlock *CurMBB = *I;
7054     if (SeenMBBs.insert(CurMBB))
7055       DispContBB->addSuccessor(CurMBB);
7056   }
7057
7058   // N.B. the order the invoke BBs are processed in doesn't matter here.
7059   const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
7060   SmallVector<MachineBasicBlock*, 64> MBBLPads;
7061   for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
7062          I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
7063     MachineBasicBlock *BB = *I;
7064
7065     // Remove the landing pad successor from the invoke block and replace it
7066     // with the new dispatch block.
7067     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
7068                                                   BB->succ_end());
7069     while (!Successors.empty()) {
7070       MachineBasicBlock *SMBB = Successors.pop_back_val();
7071       if (SMBB->isLandingPad()) {
7072         BB->removeSuccessor(SMBB);
7073         MBBLPads.push_back(SMBB);
7074       }
7075     }
7076
7077     BB->addSuccessor(DispatchBB);
7078
7079     // Find the invoke call and mark all of the callee-saved registers as
7080     // 'implicit defined' so that they're spilled. This prevents code from
7081     // moving instructions to before the EH block, where they will never be
7082     // executed.
7083     for (MachineBasicBlock::reverse_iterator
7084            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
7085       if (!II->isCall()) continue;
7086
7087       DenseMap<unsigned, bool> DefRegs;
7088       for (MachineInstr::mop_iterator
7089              OI = II->operands_begin(), OE = II->operands_end();
7090            OI != OE; ++OI) {
7091         if (!OI->isReg()) continue;
7092         DefRegs[OI->getReg()] = true;
7093       }
7094
7095       MachineInstrBuilder MIB(*MF, &*II);
7096
7097       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
7098         unsigned Reg = SavedRegs[i];
7099         if (Subtarget->isThumb2() &&
7100             !ARM::tGPRRegClass.contains(Reg) &&
7101             !ARM::hGPRRegClass.contains(Reg))
7102           continue;
7103         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
7104           continue;
7105         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
7106           continue;
7107         if (!DefRegs[Reg])
7108           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
7109       }
7110
7111       break;
7112     }
7113   }
7114
7115   // Mark all former landing pads as non-landing pads. The dispatch is the only
7116   // landing pad now.
7117   for (SmallVectorImpl<MachineBasicBlock*>::iterator
7118          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
7119     (*I)->setIsLandingPad(false);
7120
7121   // The instruction is gone now.
7122   MI->eraseFromParent();
7123
7124   return MBB;
7125 }
7126
7127 static
7128 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
7129   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
7130        E = MBB->succ_end(); I != E; ++I)
7131     if (*I != Succ)
7132       return *I;
7133   llvm_unreachable("Expecting a BB with two successors!");
7134 }
7135
7136 MachineBasicBlock *ARMTargetLowering::
7137 EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
7138   // This pseudo instruction has 3 operands: dst, src, size
7139   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
7140   // Otherwise, we will generate unrolled scalar copies.
7141   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7142   const BasicBlock *LLVM_BB = BB->getBasicBlock();
7143   MachineFunction::iterator It = BB;
7144   ++It;
7145
7146   unsigned dest = MI->getOperand(0).getReg();
7147   unsigned src = MI->getOperand(1).getReg();
7148   unsigned SizeVal = MI->getOperand(2).getImm();
7149   unsigned Align = MI->getOperand(3).getImm();
7150   DebugLoc dl = MI->getDebugLoc();
7151
7152   bool isThumb2 = Subtarget->isThumb2();
7153   MachineFunction *MF = BB->getParent();
7154   MachineRegisterInfo &MRI = MF->getRegInfo();
7155   unsigned ldrOpc, strOpc, UnitSize = 0;
7156
7157   const TargetRegisterClass *TRC = isThumb2 ?
7158     (const TargetRegisterClass*)&ARM::tGPRRegClass :
7159     (const TargetRegisterClass*)&ARM::GPRRegClass;
7160   const TargetRegisterClass *TRC_Vec = 0;
7161
7162   if (Align & 1) {
7163     ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
7164     strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
7165     UnitSize = 1;
7166   } else if (Align & 2) {
7167     ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
7168     strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
7169     UnitSize = 2;
7170   } else {
7171     // Check whether we can use NEON instructions.
7172     if (!MF->getFunction()->getAttributes().
7173           hasAttribute(AttributeSet::FunctionIndex,
7174                        Attribute::NoImplicitFloat) &&
7175         Subtarget->hasNEON()) {
7176       if ((Align % 16 == 0) && SizeVal >= 16) {
7177         ldrOpc = ARM::VLD1q32wb_fixed;
7178         strOpc = ARM::VST1q32wb_fixed;
7179         UnitSize = 16;
7180         TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
7181       }
7182       else if ((Align % 8 == 0) && SizeVal >= 8) {
7183         ldrOpc = ARM::VLD1d32wb_fixed;
7184         strOpc = ARM::VST1d32wb_fixed;
7185         UnitSize = 8;
7186         TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
7187       }
7188     }
7189     // Can't use NEON instructions.
7190     if (UnitSize == 0) {
7191       ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
7192       strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
7193       UnitSize = 4;
7194     }
7195   }
7196
7197   unsigned BytesLeft = SizeVal % UnitSize;
7198   unsigned LoopSize = SizeVal - BytesLeft;
7199
7200   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
7201     // Use LDR and STR to copy.
7202     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
7203     // [destOut] = STR_POST(scratch, destIn, UnitSize)
7204     unsigned srcIn = src;
7205     unsigned destIn = dest;
7206     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
7207       unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
7208       unsigned srcOut = MRI.createVirtualRegister(TRC);
7209       unsigned destOut = MRI.createVirtualRegister(TRC);
7210       if (UnitSize >= 8) {
7211         AddDefaultPred(BuildMI(*BB, MI, dl,
7212           TII->get(ldrOpc), scratch)
7213           .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
7214
7215         AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7216           .addReg(destIn).addImm(0).addReg(scratch));
7217       } else if (isThumb2) {
7218         AddDefaultPred(BuildMI(*BB, MI, dl,
7219           TII->get(ldrOpc), scratch)
7220           .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
7221
7222         AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7223           .addReg(scratch).addReg(destIn)
7224           .addImm(UnitSize));
7225       } else {
7226         AddDefaultPred(BuildMI(*BB, MI, dl,
7227           TII->get(ldrOpc), scratch)
7228           .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
7229           .addImm(UnitSize));
7230
7231         AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7232           .addReg(scratch).addReg(destIn)
7233           .addReg(0).addImm(UnitSize));
7234       }
7235       srcIn = srcOut;
7236       destIn = destOut;
7237     }
7238
7239     // Handle the leftover bytes with LDRB and STRB.
7240     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
7241     // [destOut] = STRB_POST(scratch, destIn, 1)
7242     ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
7243     strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
7244     for (unsigned i = 0; i < BytesLeft; i++) {
7245       unsigned scratch = MRI.createVirtualRegister(TRC);
7246       unsigned srcOut = MRI.createVirtualRegister(TRC);
7247       unsigned destOut = MRI.createVirtualRegister(TRC);
7248       if (isThumb2) {
7249         AddDefaultPred(BuildMI(*BB, MI, dl,
7250           TII->get(ldrOpc),scratch)
7251           .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
7252
7253         AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7254           .addReg(scratch).addReg(destIn)
7255           .addReg(0).addImm(1));
7256       } else {
7257         AddDefaultPred(BuildMI(*BB, MI, dl,
7258           TII->get(ldrOpc),scratch)
7259           .addReg(srcOut, RegState::Define).addReg(srcIn)
7260           .addReg(0).addImm(1));
7261
7262         AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7263           .addReg(scratch).addReg(destIn)
7264           .addReg(0).addImm(1));
7265       }
7266       srcIn = srcOut;
7267       destIn = destOut;
7268     }
7269     MI->eraseFromParent();   // The instruction is gone now.
7270     return BB;
7271   }
7272
7273   // Expand the pseudo op to a loop.
7274   // thisMBB:
7275   //   ...
7276   //   movw varEnd, # --> with thumb2
7277   //   movt varEnd, #
7278   //   ldrcp varEnd, idx --> without thumb2
7279   //   fallthrough --> loopMBB
7280   // loopMBB:
7281   //   PHI varPhi, varEnd, varLoop
7282   //   PHI srcPhi, src, srcLoop
7283   //   PHI destPhi, dst, destLoop
7284   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7285   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
7286   //   subs varLoop, varPhi, #UnitSize
7287   //   bne loopMBB
7288   //   fallthrough --> exitMBB
7289   // exitMBB:
7290   //   epilogue to handle left-over bytes
7291   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7292   //   [destOut] = STRB_POST(scratch, destLoop, 1)
7293   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
7294   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
7295   MF->insert(It, loopMBB);
7296   MF->insert(It, exitMBB);
7297
7298   // Transfer the remainder of BB and its successor edges to exitMBB.
7299   exitMBB->splice(exitMBB->begin(), BB,
7300                   llvm::next(MachineBasicBlock::iterator(MI)),
7301                   BB->end());
7302   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
7303
7304   // Load an immediate to varEnd.
7305   unsigned varEnd = MRI.createVirtualRegister(TRC);
7306   if (isThumb2) {
7307     unsigned VReg1 = varEnd;
7308     if ((LoopSize & 0xFFFF0000) != 0)
7309       VReg1 = MRI.createVirtualRegister(TRC);
7310     AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
7311                    .addImm(LoopSize & 0xFFFF));
7312
7313     if ((LoopSize & 0xFFFF0000) != 0)
7314       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
7315                      .addReg(VReg1)
7316                      .addImm(LoopSize >> 16));
7317   } else {
7318     MachineConstantPool *ConstantPool = MF->getConstantPool();
7319     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
7320     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
7321
7322     // MachineConstantPool wants an explicit alignment.
7323     unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
7324     if (Align == 0)
7325       Align = getDataLayout()->getTypeAllocSize(C->getType());
7326     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7327
7328     AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
7329                    .addReg(varEnd, RegState::Define)
7330                    .addConstantPoolIndex(Idx)
7331                    .addImm(0));
7332   }
7333   BB->addSuccessor(loopMBB);
7334
7335   // Generate the loop body:
7336   //   varPhi = PHI(varLoop, varEnd)
7337   //   srcPhi = PHI(srcLoop, src)
7338   //   destPhi = PHI(destLoop, dst)
7339   MachineBasicBlock *entryBB = BB;
7340   BB = loopMBB;
7341   unsigned varLoop = MRI.createVirtualRegister(TRC);
7342   unsigned varPhi = MRI.createVirtualRegister(TRC);
7343   unsigned srcLoop = MRI.createVirtualRegister(TRC);
7344   unsigned srcPhi = MRI.createVirtualRegister(TRC);
7345   unsigned destLoop = MRI.createVirtualRegister(TRC);
7346   unsigned destPhi = MRI.createVirtualRegister(TRC);
7347
7348   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
7349     .addReg(varLoop).addMBB(loopMBB)
7350     .addReg(varEnd).addMBB(entryBB);
7351   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
7352     .addReg(srcLoop).addMBB(loopMBB)
7353     .addReg(src).addMBB(entryBB);
7354   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
7355     .addReg(destLoop).addMBB(loopMBB)
7356     .addReg(dest).addMBB(entryBB);
7357
7358   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7359   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
7360   unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
7361   if (UnitSize >= 8) {
7362     AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
7363       .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
7364
7365     AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
7366       .addReg(destPhi).addImm(0).addReg(scratch));
7367   } else if (isThumb2) {
7368     AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
7369       .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
7370
7371     AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
7372       .addReg(scratch).addReg(destPhi)
7373       .addImm(UnitSize));
7374   } else {
7375     AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
7376       .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
7377       .addImm(UnitSize));
7378
7379     AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
7380       .addReg(scratch).addReg(destPhi)
7381       .addReg(0).addImm(UnitSize));
7382   }
7383
7384   // Decrement loop variable by UnitSize.
7385   MachineInstrBuilder MIB = BuildMI(BB, dl,
7386     TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
7387   AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
7388   MIB->getOperand(5).setReg(ARM::CPSR);
7389   MIB->getOperand(5).setIsDef(true);
7390
7391   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
7392     .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
7393
7394   // loopMBB can loop back to loopMBB or fall through to exitMBB.
7395   BB->addSuccessor(loopMBB);
7396   BB->addSuccessor(exitMBB);
7397
7398   // Add epilogue to handle BytesLeft.
7399   BB = exitMBB;
7400   MachineInstr *StartOfExit = exitMBB->begin();
7401   ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
7402   strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
7403
7404   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7405   //   [destOut] = STRB_POST(scratch, destLoop, 1)
7406   unsigned srcIn = srcLoop;
7407   unsigned destIn = destLoop;
7408   for (unsigned i = 0; i < BytesLeft; i++) {
7409     unsigned scratch = MRI.createVirtualRegister(TRC);
7410     unsigned srcOut = MRI.createVirtualRegister(TRC);
7411     unsigned destOut = MRI.createVirtualRegister(TRC);
7412     if (isThumb2) {
7413       AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
7414         TII->get(ldrOpc),scratch)
7415         .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
7416
7417       AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
7418         .addReg(scratch).addReg(destIn)
7419         .addImm(1));
7420     } else {
7421       AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
7422         TII->get(ldrOpc),scratch)
7423         .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
7424
7425       AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
7426         .addReg(scratch).addReg(destIn)
7427         .addReg(0).addImm(1));
7428     }
7429     srcIn = srcOut;
7430     destIn = destOut;
7431   }
7432
7433   MI->eraseFromParent();   // The instruction is gone now.
7434   return BB;
7435 }
7436
7437 MachineBasicBlock *
7438 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7439                                                MachineBasicBlock *BB) const {
7440   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7441   DebugLoc dl = MI->getDebugLoc();
7442   bool isThumb2 = Subtarget->isThumb2();
7443   switch (MI->getOpcode()) {
7444   default: {
7445     MI->dump();
7446     llvm_unreachable("Unexpected instr type to insert");
7447   }
7448   // The Thumb2 pre-indexed stores have the same MI operands, they just
7449   // define them differently in the .td files from the isel patterns, so
7450   // they need pseudos.
7451   case ARM::t2STR_preidx:
7452     MI->setDesc(TII->get(ARM::t2STR_PRE));
7453     return BB;
7454   case ARM::t2STRB_preidx:
7455     MI->setDesc(TII->get(ARM::t2STRB_PRE));
7456     return BB;
7457   case ARM::t2STRH_preidx:
7458     MI->setDesc(TII->get(ARM::t2STRH_PRE));
7459     return BB;
7460
7461   case ARM::STRi_preidx:
7462   case ARM::STRBi_preidx: {
7463     unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
7464       ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
7465     // Decode the offset.
7466     unsigned Offset = MI->getOperand(4).getImm();
7467     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
7468     Offset = ARM_AM::getAM2Offset(Offset);
7469     if (isSub)
7470       Offset = -Offset;
7471
7472     MachineMemOperand *MMO = *MI->memoperands_begin();
7473     BuildMI(*BB, MI, dl, TII->get(NewOpc))
7474       .addOperand(MI->getOperand(0))  // Rn_wb
7475       .addOperand(MI->getOperand(1))  // Rt
7476       .addOperand(MI->getOperand(2))  // Rn
7477       .addImm(Offset)                 // offset (skip GPR==zero_reg)
7478       .addOperand(MI->getOperand(5))  // pred
7479       .addOperand(MI->getOperand(6))
7480       .addMemOperand(MMO);
7481     MI->eraseFromParent();
7482     return BB;
7483   }
7484   case ARM::STRr_preidx:
7485   case ARM::STRBr_preidx:
7486   case ARM::STRH_preidx: {
7487     unsigned NewOpc;
7488     switch (MI->getOpcode()) {
7489     default: llvm_unreachable("unexpected opcode!");
7490     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
7491     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
7492     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
7493     }
7494     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
7495     for (unsigned i = 0; i < MI->getNumOperands(); ++i)
7496       MIB.addOperand(MI->getOperand(i));
7497     MI->eraseFromParent();
7498     return BB;
7499   }
7500   case ARM::ATOMIC_LOAD_ADD_I8:
7501      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
7502   case ARM::ATOMIC_LOAD_ADD_I16:
7503      return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
7504   case ARM::ATOMIC_LOAD_ADD_I32:
7505      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
7506
7507   case ARM::ATOMIC_LOAD_AND_I8:
7508      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7509   case ARM::ATOMIC_LOAD_AND_I16:
7510      return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7511   case ARM::ATOMIC_LOAD_AND_I32:
7512      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7513
7514   case ARM::ATOMIC_LOAD_OR_I8:
7515      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7516   case ARM::ATOMIC_LOAD_OR_I16:
7517      return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7518   case ARM::ATOMIC_LOAD_OR_I32:
7519      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7520
7521   case ARM::ATOMIC_LOAD_XOR_I8:
7522      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7523   case ARM::ATOMIC_LOAD_XOR_I16:
7524      return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7525   case ARM::ATOMIC_LOAD_XOR_I32:
7526      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7527
7528   case ARM::ATOMIC_LOAD_NAND_I8:
7529      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
7530   case ARM::ATOMIC_LOAD_NAND_I16:
7531      return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
7532   case ARM::ATOMIC_LOAD_NAND_I32:
7533      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
7534
7535   case ARM::ATOMIC_LOAD_SUB_I8:
7536      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
7537   case ARM::ATOMIC_LOAD_SUB_I16:
7538      return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
7539   case ARM::ATOMIC_LOAD_SUB_I32:
7540      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
7541
7542   case ARM::ATOMIC_LOAD_MIN_I8:
7543      return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT);
7544   case ARM::ATOMIC_LOAD_MIN_I16:
7545      return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT);
7546   case ARM::ATOMIC_LOAD_MIN_I32:
7547      return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT);
7548
7549   case ARM::ATOMIC_LOAD_MAX_I8:
7550      return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT);
7551   case ARM::ATOMIC_LOAD_MAX_I16:
7552      return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT);
7553   case ARM::ATOMIC_LOAD_MAX_I32:
7554      return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT);
7555
7556   case ARM::ATOMIC_LOAD_UMIN_I8:
7557      return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO);
7558   case ARM::ATOMIC_LOAD_UMIN_I16:
7559      return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO);
7560   case ARM::ATOMIC_LOAD_UMIN_I32:
7561      return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO);
7562
7563   case ARM::ATOMIC_LOAD_UMAX_I8:
7564      return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI);
7565   case ARM::ATOMIC_LOAD_UMAX_I16:
7566      return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI);
7567   case ARM::ATOMIC_LOAD_UMAX_I32:
7568      return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI);
7569
7570   case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
7571   case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
7572   case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
7573
7574   case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
7575   case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
7576   case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
7577
7578
7579   case ARM::ATOMADD6432:
7580     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
7581                               isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
7582                               /*NeedsCarry*/ true);
7583   case ARM::ATOMSUB6432:
7584     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7585                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7586                               /*NeedsCarry*/ true);
7587   case ARM::ATOMOR6432:
7588     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
7589                               isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7590   case ARM::ATOMXOR6432:
7591     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
7592                               isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7593   case ARM::ATOMAND6432:
7594     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
7595                               isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7596   case ARM::ATOMSWAP6432:
7597     return EmitAtomicBinary64(MI, BB, 0, 0, false);
7598   case ARM::ATOMCMPXCHG6432:
7599     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7600                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7601                               /*NeedsCarry*/ false, /*IsCmpxchg*/true);
7602   case ARM::ATOMMIN6432:
7603     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7604                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7605                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7606                               /*IsMinMax*/ true, ARMCC::LT);
7607   case ARM::ATOMMAX6432:
7608     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7609                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7610                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7611                               /*IsMinMax*/ true, ARMCC::GE);
7612   case ARM::ATOMUMIN6432:
7613     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7614                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7615                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7616                               /*IsMinMax*/ true, ARMCC::LO);
7617   case ARM::ATOMUMAX6432:
7618     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7619                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7620                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7621                               /*IsMinMax*/ true, ARMCC::HS);
7622
7623   case ARM::tMOVCCr_pseudo: {
7624     // To "insert" a SELECT_CC instruction, we actually have to insert the
7625     // diamond control-flow pattern.  The incoming instruction knows the
7626     // destination vreg to set, the condition code register to branch on, the
7627     // true/false values to select between, and a branch opcode to use.
7628     const BasicBlock *LLVM_BB = BB->getBasicBlock();
7629     MachineFunction::iterator It = BB;
7630     ++It;
7631
7632     //  thisMBB:
7633     //  ...
7634     //   TrueVal = ...
7635     //   cmpTY ccX, r1, r2
7636     //   bCC copy1MBB
7637     //   fallthrough --> copy0MBB
7638     MachineBasicBlock *thisMBB  = BB;
7639     MachineFunction *F = BB->getParent();
7640     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7641     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
7642     F->insert(It, copy0MBB);
7643     F->insert(It, sinkMBB);
7644
7645     // Transfer the remainder of BB and its successor edges to sinkMBB.
7646     sinkMBB->splice(sinkMBB->begin(), BB,
7647                     llvm::next(MachineBasicBlock::iterator(MI)),
7648                     BB->end());
7649     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
7650
7651     BB->addSuccessor(copy0MBB);
7652     BB->addSuccessor(sinkMBB);
7653
7654     BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
7655       .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
7656
7657     //  copy0MBB:
7658     //   %FalseValue = ...
7659     //   # fallthrough to sinkMBB
7660     BB = copy0MBB;
7661
7662     // Update machine-CFG edges
7663     BB->addSuccessor(sinkMBB);
7664
7665     //  sinkMBB:
7666     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7667     //  ...
7668     BB = sinkMBB;
7669     BuildMI(*BB, BB->begin(), dl,
7670             TII->get(ARM::PHI), MI->getOperand(0).getReg())
7671       .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7672       .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7673
7674     MI->eraseFromParent();   // The pseudo instruction is gone now.
7675     return BB;
7676   }
7677
7678   case ARM::BCCi64:
7679   case ARM::BCCZi64: {
7680     // If there is an unconditional branch to the other successor, remove it.
7681     BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
7682
7683     // Compare both parts that make up the double comparison separately for
7684     // equality.
7685     bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
7686
7687     unsigned LHS1 = MI->getOperand(1).getReg();
7688     unsigned LHS2 = MI->getOperand(2).getReg();
7689     if (RHSisZero) {
7690       AddDefaultPred(BuildMI(BB, dl,
7691                              TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7692                      .addReg(LHS1).addImm(0));
7693       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7694         .addReg(LHS2).addImm(0)
7695         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
7696     } else {
7697       unsigned RHS1 = MI->getOperand(3).getReg();
7698       unsigned RHS2 = MI->getOperand(4).getReg();
7699       AddDefaultPred(BuildMI(BB, dl,
7700                              TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
7701                      .addReg(LHS1).addReg(RHS1));
7702       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
7703         .addReg(LHS2).addReg(RHS2)
7704         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
7705     }
7706
7707     MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
7708     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
7709     if (MI->getOperand(0).getImm() == ARMCC::NE)
7710       std::swap(destMBB, exitMBB);
7711
7712     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
7713       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
7714     if (isThumb2)
7715       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
7716     else
7717       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
7718
7719     MI->eraseFromParent();   // The pseudo instruction is gone now.
7720     return BB;
7721   }
7722
7723   case ARM::Int_eh_sjlj_setjmp:
7724   case ARM::Int_eh_sjlj_setjmp_nofp:
7725   case ARM::tInt_eh_sjlj_setjmp:
7726   case ARM::t2Int_eh_sjlj_setjmp:
7727   case ARM::t2Int_eh_sjlj_setjmp_nofp:
7728     EmitSjLjDispatchBlock(MI, BB);
7729     return BB;
7730
7731   case ARM::ABS:
7732   case ARM::t2ABS: {
7733     // To insert an ABS instruction, we have to insert the
7734     // diamond control-flow pattern.  The incoming instruction knows the
7735     // source vreg to test against 0, the destination vreg to set,
7736     // the condition code register to branch on, the
7737     // true/false values to select between, and a branch opcode to use.
7738     // It transforms
7739     //     V1 = ABS V0
7740     // into
7741     //     V2 = MOVS V0
7742     //     BCC                      (branch to SinkBB if V0 >= 0)
7743     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
7744     //     SinkBB: V1 = PHI(V2, V3)
7745     const BasicBlock *LLVM_BB = BB->getBasicBlock();
7746     MachineFunction::iterator BBI = BB;
7747     ++BBI;
7748     MachineFunction *Fn = BB->getParent();
7749     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
7750     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
7751     Fn->insert(BBI, RSBBB);
7752     Fn->insert(BBI, SinkBB);
7753
7754     unsigned int ABSSrcReg = MI->getOperand(1).getReg();
7755     unsigned int ABSDstReg = MI->getOperand(0).getReg();
7756     bool isThumb2 = Subtarget->isThumb2();
7757     MachineRegisterInfo &MRI = Fn->getRegInfo();
7758     // In Thumb mode S must not be specified if source register is the SP or
7759     // PC and if destination register is the SP, so restrict register class
7760     unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
7761       (const TargetRegisterClass*)&ARM::rGPRRegClass :
7762       (const TargetRegisterClass*)&ARM::GPRRegClass);
7763
7764     // Transfer the remainder of BB and its successor edges to sinkMBB.
7765     SinkBB->splice(SinkBB->begin(), BB,
7766       llvm::next(MachineBasicBlock::iterator(MI)),
7767       BB->end());
7768     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
7769
7770     BB->addSuccessor(RSBBB);
7771     BB->addSuccessor(SinkBB);
7772
7773     // fall through to SinkMBB
7774     RSBBB->addSuccessor(SinkBB);
7775
7776     // insert a cmp at the end of BB
7777     AddDefaultPred(BuildMI(BB, dl,
7778                            TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7779                    .addReg(ABSSrcReg).addImm(0));
7780
7781     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
7782     BuildMI(BB, dl,
7783       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
7784       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
7785
7786     // insert rsbri in RSBBB
7787     // Note: BCC and rsbri will be converted into predicated rsbmi
7788     // by if-conversion pass
7789     BuildMI(*RSBBB, RSBBB->begin(), dl,
7790       TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
7791       .addReg(ABSSrcReg, RegState::Kill)
7792       .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
7793
7794     // insert PHI in SinkBB,
7795     // reuse ABSDstReg to not change uses of ABS instruction
7796     BuildMI(*SinkBB, SinkBB->begin(), dl,
7797       TII->get(ARM::PHI), ABSDstReg)
7798       .addReg(NewRsbDstReg).addMBB(RSBBB)
7799       .addReg(ABSSrcReg).addMBB(BB);
7800
7801     // remove ABS instruction
7802     MI->eraseFromParent();
7803
7804     // return last added BB
7805     return SinkBB;
7806   }
7807   case ARM::COPY_STRUCT_BYVAL_I32:
7808     ++NumLoopByVals;
7809     return EmitStructByval(MI, BB);
7810   }
7811 }
7812
7813 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
7814                                                       SDNode *Node) const {
       // Post-isel fixup for the optional flag-setting ('s') operand of MI.
       // Flag-setting pseudo opcodes are renamed to their real opcode and given a
       // trailing cc_out operand. The implicit CPSR def is then removed and, if
       // that def was live, cc_out is set to CPSR. MI is modified in place; Node
       // is only consulted by an assertion.
7815   if (!MI->hasPostISelHook()) {
7816     assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
7817            "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
7818     return;
7819   }
7820
7821   const MCInstrDesc *MCID = &MI->getDesc();
7822   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
7823   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
7824   // operand is still set to noreg. If needed, set the optional operand's
7825   // register to CPSR, and remove the redundant implicit def.
7826   //
7827   // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
7828
7829   // Rename pseudo opcodes. A zero result means MI is not one of the pseudos
7830   unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
7831   if (NewOpc) {
7832     const ARMBaseInstrInfo *TII =
7833       static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
7834     MCID = &TII->get(NewOpc);
7835
7836     assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
7837            "converted opcode should be the same except for cc_out");
7838
7839     MI->setDesc(*MCID);
7840
7841     // Add the optional cc_out operand (register 0, i.e. not yet activated).
7842     MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
7843   }
       // From here on MCID describes the (possibly renamed) instruction.
7844   unsigned ccOutIdx = MCID->getNumOperands() - 1;
7845
7846   // Any ARM instruction that sets the 's' bit should specify an optional
7847   // "cc_out" operand in the last operand position.
7848   if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
7849     assert(!NewOpc && "Optional cc_out operand required");
7850     return;
7851   }
7852   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
7853   // since we already have an optional CPSR def.
       // Only operands past the ones declared by the descriptor are scanned.
7854   bool definesCPSR = false;
7855   bool deadCPSR = false;
7856   for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
7857        i != e; ++i) {
7858     const MachineOperand &MO = MI->getOperand(i);
7859     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
7860       definesCPSR = true;
7861       if (MO.isDead())
7862         deadCPSR = true;
           // Removing the operand invalidates MO and the cached bound 'e', so
           // leave the loop immediately.
7863       MI->RemoveOperand(i);
7864       break;
7865     }
7866   }
7867   if (!definesCPSR) {
7868     assert(!NewOpc && "Optional cc_out operand required");
7869     return;
7870   }
       // Cross-check against the DAG node: result #1 (presumably the flags
       // value) must be unused exactly when the CPSR def was marked dead.
7871   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
7872   if (deadCPSR) {
         // Nobody reads the flags: cc_out stays at register 0.
7873     assert(!MI->getOperand(ccOutIdx).getReg() &&
7874            "expect uninitialized optional cc_out operand");
7875     return;
7876   }
7877
7878   // If this instruction was defined with an optional CPSR def and its dag node
7879   // had a live implicit CPSR def, then activate the optional CPSR def.
7880   MachineOperand &MO = MI->getOperand(ccOutIdx);
7881   MO.setReg(ARM::CPSR);
7882   MO.setIsDef(true);
7883 }
7884
7885 //===----------------------------------------------------------------------===//
7886 //                           ARM Optimization Hooks
7887 //===----------------------------------------------------------------------===//
7888
7889 // Helper function that checks if N is a null or all ones constant.
     // Returns true only when N is a ConstantSDNode whose value is all ones
     // (AllOnes == true) or zero (AllOnes == false). Any node that is not a
     // ConstantSDNode yields false.
7890 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
7891   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
7892   if (!C)
7893     return false;
7894   return AllOnes ? C->isAllOnesValue() : C->isNullValue();
7895 }
7896
7897 // Return true if N is conditionally 0 or all ones.
7898 // Detects these expressions where cc is an i1 value:
7899 //
7900 //   (select cc 0, y)   [AllOnes=0]
7901 //   (select cc y, 0)   [AllOnes=0]
7902 //   (zext cc)          [AllOnes=0]
7903 //   (sext cc)          [AllOnes=0/1]
7904 //   (select cc -1, y)  [AllOnes=1]
7905 //   (select cc y, -1)  [AllOnes=1]
7906 //
7907 // Invert is set when N is the null/all ones constant when CC is false.
7908 // OtherOp is set to the alternative value of N.
     //
     // @param N       The node to inspect (SELECT, ZERO_EXTEND or SIGN_EXTEND).
     // @param AllOnes Look for the all ones constant instead of zero.
     // @param CC      [out] The condition operand of N.
     // @param Invert  [out] True when N takes the constant value for CC == false.
     // @param OtherOp [out] The value N takes when it is not the constant.
     // @param DAG     Used to materialize OtherOp for the zext/sext forms.
     //
     // Invert and OtherOp are only written on paths that return true. CC may be
     // overwritten even when the function returns false.
7909 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
7910                                        SDValue &CC, bool &Invert,
7911                                        SDValue &OtherOp,
7912                                        SelectionDAG &DAG) {
7913   switch (N->getOpcode()) {
7914   default: return false;
7915   case ISD::SELECT: {
7916     CC = N->getOperand(0);
7917     SDValue N1 = N->getOperand(1);
7918     SDValue N2 = N->getOperand(2);
         // Constant in the true arm: N is the constant when CC is true.
7919     if (isZeroOrAllOnes(N1, AllOnes)) {
7920       Invert = false;
7921       OtherOp = N2;
7922       return true;
7923     }
         // Constant in the false arm: N is the constant when CC is false.
7924     if (isZeroOrAllOnes(N2, AllOnes)) {
7925       Invert = true;
7926       OtherOp = N1;
7927       return true;
7928     }
7929     return false;
7930   }
7931   case ISD::ZERO_EXTEND:
7932     // (zext cc) can never be the all ones value.
7933     if (AllOnes)
7934       return false;
7935     // Fall through.
7936   case ISD::SIGN_EXTEND: {
7937     EVT VT = N->getValueType(0);
7938     CC = N->getOperand(0);
         // Only extensions of an i1 condition are of interest.
7939     if (CC.getValueType() != MVT::i1)
7940       return false;
         // An extended i1 is zero when CC is false (so Invert is set when looking
         // for zero); a sext is all ones when CC is true (Invert is clear).
7941     Invert = !AllOnes;
7942     if (AllOnes)
7943       // When looking for an AllOnes constant, N is an sext, and the 'other'
7944       // value is 0.
7945       OtherOp = DAG.getConstant(0, VT);
7946     else if (N->getOpcode() == ISD::ZERO_EXTEND)
7947       // When looking for a 0 constant, N can be zext or sext.
           // For zext the non-zero value is 1.
7948       OtherOp = DAG.getConstant(1, VT);
7949     else
           // For sext the non-zero value is all ones.
7950       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
7951     return true;
7952   }
7953   }
7954 }
7955
7956 // Combine a constant select operand into its use:
7957 //
7958 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
7959 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
7960 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
7961 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
7962 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
7963 //
7964 // The transform is rejected if the select doesn't have a constant operand that
7965 // is null, or all ones when AllOnes is set.
7966 //
7967 // Also recognize sext/zext from i1:
7968 //
7969 //   (add (zext cc), x) -> (select cc (add x, 1), x)
7970 //   (add (sext cc), x) -> (select cc (add x, -1), x)
7971 //
7972 // These transformations eventually create predicated instructions.
7973 //
7974 // @param N       The node to transform.
7975 // @param Slct    The N operand that is a select.
7976 // @param OtherOp The other N operand (x above).
7977 // @param DCI     Context.
7978 // @param AllOnes Require the select constant to be all ones instead of null.
7979 // @returns The new node, or SDValue() on failure.
     //
     // This function does not check how many uses Slct has; the callers visible
     // in this file test hasOneUse() before calling.
7980 static
7981 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
7982                             TargetLowering::DAGCombinerInfo &DCI,
7983                             bool AllOnes = false) {
7984   SelectionDAG &DAG = DCI.DAG;
7985   EVT VT = N->getValueType(0);
7986   SDValue NonConstantVal;
7987   SDValue CCOp;
       // Written by isConditionalZeroOrAllOnes on every path that returns true.
7988   bool SwapSelectOps;
7989   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
7990                                   NonConstantVal, DAG))
7991     return SDValue();
7992
7993   // Slct is now known to be the desired identity constant when CC is true.
       // In that case N simply evaluates to OtherOp. Otherwise N's operation is
       // applied with OtherOp as the left operand, which keeps the operand order
       // shown for SUB in the header.
7994   SDValue TrueVal = OtherOp;
7995   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
7996                                  OtherOp, NonConstantVal);
7997   // Unless SwapSelectOps says CC should be false.
7998   if (SwapSelectOps)
7999     std::swap(TrueVal, FalseVal);
8000
8001   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
8002                      CCOp, TrueVal, FalseVal);
8003 }
8004
8005 // Attempt combineSelectAndUse on each operand of a commutative operator N.
     // Operand 0 is tried as the select first, then operand 1; an operand is only
     // tried when its node has a single use. Returns the first successful
     // result, or SDValue() if neither operand matches.
8006 static
8007 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
8008                                        TargetLowering::DAGCombinerInfo &DCI) {
8009   SDValue N0 = N->getOperand(0);
8010   SDValue N1 = N->getOperand(1);
8011   if (N0.getNode()->hasOneUse()) {
8012     SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
8013     if (Result.getNode())
8014       return Result;
8015   }
8016   if (N1.getNode()->hasOneUse()) {
8017     SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
8018     if (Result.getNode())
8019       return Result;
8020   }
8021   return SDValue();
8022 }
8023
8024 // AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
8025 // (only after legalization).
8026 static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
8027                                  TargetLowering::DAGCombinerInfo &DCI,
8028                                  const ARMSubtarget *Subtarget) {
8029
8030   // Only perform optimization if after legalize, and if NEON is available. We
8031   // also expected both operands to be BUILD_VECTORs.
8032   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
8033       || N0.getOpcode() != ISD::BUILD_VECTOR
8034       || N1.getOpcode() != ISD::BUILD_VECTOR)
8035     return SDValue();
8036
8037   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
8038   EVT VT = N->getValueType(0);
8039   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
8040     return SDValue();
8041
8042   // Check that the vector operands are of the right form.
8043   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
8044   // operands, where N is the size of the formed vector.
8045   // Each EXTRACT_VECTOR should have the same input vector and odd or even
8046   // index such that we have a pair wise add pattern.
8047
8048   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
8049   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8050     return SDValue();
8051   SDValue Vec = N0->getOperand(0)->getOperand(0);
8052   SDNode *V = Vec.getNode();
8053   unsigned nextIndex = 0;
8054
8055   // For each operands to the ADD which are BUILD_VECTORs,
8056   // check to see if each of their operands are an EXTRACT_VECTOR with
8057   // the same vector and appropriate index.
8058   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
8059     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
8060         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
8061
8062       SDValue ExtVec0 = N0->getOperand(i);
8063       SDValue ExtVec1 = N1->getOperand(i);
8064
8065       // First operand is the vector, verify its the same.
8066       if (V != ExtVec0->getOperand(0).getNode() ||
8067           V != ExtVec1->getOperand(0).getNode())
8068         return SDValue();
8069
8070       // Second is the constant, verify its correct.
8071       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
8072       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
8073
8074       // For the constant, we want to see all the even or all the odd.
8075       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
8076           || C1->getZExtValue() != nextIndex+1)
8077         return SDValue();
8078
8079       // Increment index.
8080       nextIndex+=2;
8081     } else
8082       return SDValue();
8083   }
8084
8085   // Create VPADDL node.
8086   SelectionDAG &DAG = DCI.DAG;
8087   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8088
8089   // Build operand list.
8090   SmallVector<SDValue, 8> Ops;
8091   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
8092                                 TLI.getPointerTy()));
8093
8094   // Input is the vector.
8095   Ops.push_back(Vec);
8096
8097   // Get widened type and narrowed type.
8098   MVT widenType;
8099   unsigned numElem = VT.getVectorNumElements();
8100   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8101     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
8102     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
8103     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
8104     default:
8105       llvm_unreachable("Invalid vector element type for padd optimization.");
8106   }
8107
8108   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
8109                             widenType, &Ops[0], Ops.size());
8110   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp);
8111 }
8112
8113 static SDValue findMUL_LOHI(SDValue V) {
       // Returns V unchanged when it is produced by a UMUL_LOHI or SMUL_LOHI
       // node, and an empty SDValue otherwise. Only V itself is examined; its
       // operands are not searched and its result number is not checked.
8114   if (V->getOpcode() == ISD::UMUL_LOHI ||
8115       V->getOpcode() == ISD::SMUL_LOHI)
8116     return V;
8117   return SDValue();
8118 }
8119
8120 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
8121                                      TargetLowering::DAGCombinerInfo &DCI,
8122                                      const ARMSubtarget *Subtarget) {
8123
8124   if (Subtarget->isThumb1Only()) return SDValue();
8125
8126   // Only perform the checks after legalize when the pattern is available.
8127   if (DCI.isBeforeLegalize()) return SDValue();
8128
8129   // Look for multiply add opportunities.
8130   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
8131   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
8132   // a glue link from the first add to the second add.
8133   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
8134   // a S/UMLAL instruction.
8135   //          loAdd   UMUL_LOHI
8136   //            \    / :lo    \ :hi
8137   //             \  /          \          [no multiline comment]
8138   //              ADDC         |  hiAdd
8139   //                 \ :glue  /  /
8140   //                  \      /  /
8141   //                    ADDE
8142   //
8143   assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
8144   SDValue AddcOp0 = AddcNode->getOperand(0);
8145   SDValue AddcOp1 = AddcNode->getOperand(1);
8146
8147   // Check if the two operands are from the same mul_lohi node.
8148   if (AddcOp0.getNode() == AddcOp1.getNode())
8149     return SDValue();
8150
8151   assert(AddcNode->getNumValues() == 2 &&
8152          AddcNode->getValueType(0) == MVT::i32 &&
8153          "Expect ADDC with two result values. First: i32");
8154
8155   // Check that we have a glued ADDC node.
8156   if (AddcNode->getValueType(1) != MVT::Glue)
8157     return SDValue();
8158
8159   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
8160   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
8161       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
8162       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
8163       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
8164     return SDValue();
8165
8166   // Look for the glued ADDE.
8167   SDNode* AddeNode = AddcNode->getGluedUser();
8168   if (AddeNode == NULL)
8169     return SDValue();
8170
8171   // Make sure it is really an ADDE.
8172   if (AddeNode->getOpcode() != ISD::ADDE)
8173     return SDValue();
8174
8175   assert(AddeNode->getNumOperands() == 3 &&
8176          AddeNode->getOperand(2).getValueType() == MVT::Glue &&
8177          "ADDE node has the wrong inputs");
8178
8179   // Check for the triangle shape.
8180   SDValue AddeOp0 = AddeNode->getOperand(0);
8181   SDValue AddeOp1 = AddeNode->getOperand(1);
8182
8183   // Make sure that the ADDE operands are not coming from the same node.
8184   if (AddeOp0.getNode() == AddeOp1.getNode())
8185     return SDValue();
8186
8187   // Find the MUL_LOHI node walking up ADDE's operands.
8188   bool IsLeftOperandMUL = false;
8189   SDValue MULOp = findMUL_LOHI(AddeOp0);
8190   if (MULOp == SDValue())
8191    MULOp = findMUL_LOHI(AddeOp1);
8192   else
8193     IsLeftOperandMUL = true;
8194   if (MULOp == SDValue())
8195      return SDValue();
8196
8197   // Figure out the right opcode.
8198   unsigned Opc = MULOp->getOpcode();
8199   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
8200
8201   // Figure out the high and low input values to the MLAL node.
8202   SDValue* HiMul = &MULOp;
8203   SDValue* HiAdd = NULL;
8204   SDValue* LoMul = NULL;
8205   SDValue* LowAdd = NULL;
8206
8207   if (IsLeftOperandMUL)
8208     HiAdd = &AddeOp1;
8209   else
8210     HiAdd = &AddeOp0;
8211
8212
8213   if (AddcOp0->getOpcode() == Opc) {
8214     LoMul = &AddcOp0;
8215     LowAdd = &AddcOp1;
8216   }
8217   if (AddcOp1->getOpcode() == Opc) {
8218     LoMul = &AddcOp1;
8219     LowAdd = &AddcOp0;
8220   }
8221
8222   if (LoMul == NULL)
8223     return SDValue();
8224
8225   if (LoMul->getNode() != HiMul->getNode())
8226     return SDValue();
8227
8228   // Create the merged node.
8229   SelectionDAG &DAG = DCI.DAG;
8230
8231   // Build operand list.
8232   SmallVector<SDValue, 8> Ops;
8233   Ops.push_back(LoMul->getOperand(0));
8234   Ops.push_back(LoMul->getOperand(1));
8235   Ops.push_back(*LowAdd);
8236   Ops.push_back(*HiAdd);
8237
8238   SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
8239                                  DAG.getVTList(MVT::i32, MVT::i32),
8240                                  &Ops[0], Ops.size());
8241
8242   // Replace the ADDs' nodes uses by the MLA node's values.
8243   SDValue HiMLALResult(MLALNode.getNode(), 1);
8244   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
8245
8246   SDValue LoMLALResult(MLALNode.getNode(), 0);
8247   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
8248
8249   // Return original node to notify the driver to stop replacing.
8250   SDValue resNode(AddcNode, 0);
8251   return resNode;
8252 }
8253
8254 /// PerformADDCCombine - Target-specific dag combine transform from
8255 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
     /// Thin wrapper: forwards directly to AddCombineTo64bitMLAL and returns its
     /// result unchanged.
8256 static SDValue PerformADDCCombine(SDNode *N,
8257                                  TargetLowering::DAGCombinerInfo &DCI,
8258                                  const ARMSubtarget *Subtarget) {
8259
8260   return AddCombineTo64bitMLAL(N, DCI, Subtarget);
8261
8262 }
8263
8264 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
8265 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
8266 /// called with the default operands, and if that fails, with commuted
8267 /// operands.
     /// Tries the VPADDL combine first, then the select fold with N0 as the
     /// select. Returns the replacement value, or SDValue() if neither applies.
8268 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
8269                                           TargetLowering::DAGCombinerInfo &DCI,
8270                                           const ARMSubtarget *Subtarget){
8271
8272   // Attempt to create vpaddl for this add.
8273   SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
8274   if (Result.getNode())
8275     return Result;
8276
8277   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
       // Only attempted when N0 has a single use.
8278   if (N0.getNode()->hasOneUse()) {
         // Note: this Result shadows the outer variable of the same name.
8279     SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
8280     if (Result.getNode()) return Result;
8281   }
8282   return SDValue();
8283 }
8284
8285 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
8286 ///
     /// Runs PerformADDCombineWithOperands on (op0, op1) and, if that yields
     /// nothing, on (op1, op0). Returns the replacement value or SDValue().
8287 static SDValue PerformADDCombine(SDNode *N,
8288                                  TargetLowering::DAGCombinerInfo &DCI,
8289                                  const ARMSubtarget *Subtarget) {
8290   SDValue N0 = N->getOperand(0);
8291   SDValue N1 = N->getOperand(1);
8292
8293   // First try with the default operand order.
8294   SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
8295   if (Result.getNode())
8296     return Result;
8297
8298   // If that didn't work, try again with the operands commuted.
8299   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
8300 }
8301
8302 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
8303 ///
     /// Only the right-hand operand is considered as the select; the left-hand
     /// operand is never tried. Returns the replacement value or SDValue().
8304 static SDValue PerformSUBCombine(SDNode *N,
8305                                  TargetLowering::DAGCombinerInfo &DCI) {
8306   SDValue N0 = N->getOperand(0);
8307   SDValue N1 = N->getOperand(1);
8308
8309   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
       // Only attempted when the select has a single use.
8310   if (N1.getNode()->hasOneUse()) {
8311     SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
8312     if (Result.getNode()) return Result;
8313   }
8314
8315   return SDValue();
8316 }
8317
8318 /// PerformVMULCombine
8319 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
8320 /// special multiplier accumulator forwarding.
8321 ///   vmul d3, d0, d2
8322 ///   vmla d3, d1, d2
8323 /// is faster than
8324 ///   vadd d3, d0, d1
8325 ///   vmul d3, d3, d2
8326 static SDValue PerformVMULCombine(SDNode *N,
8327                                   TargetLowering::DAGCombinerInfo &DCI,
8328                                   const ARMSubtarget *Subtarget) {
8329   if (!Subtarget->hasVMLxForwarding())
8330     return SDValue();
8331
8332   SelectionDAG &DAG = DCI.DAG;
8333   SDValue N0 = N->getOperand(0);
8334   SDValue N1 = N->getOperand(1);
8335   unsigned Opcode = N0.getOpcode();
8336   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8337       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
8338     Opcode = N1.getOpcode();
8339     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8340         Opcode != ISD::FADD && Opcode != ISD::FSUB)
8341       return SDValue();
8342     std::swap(N0, N1);
8343   }
8344
8345   EVT VT = N->getValueType(0);
8346   SDLoc DL(N);
8347   SDValue N00 = N0->getOperand(0);
8348   SDValue N01 = N0->getOperand(1);
8349   return DAG.getNode(Opcode, DL, VT,
8350                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
8351                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
8352 }
8353
8354 static SDValue PerformMULCombine(SDNode *N,
8355                                  TargetLowering::DAGCombinerInfo &DCI,
8356                                  const ARMSubtarget *Subtarget) {
       // Target-specific dag combine for ISD::MUL.
       //  - 64/128-bit vector multiplies are forwarded to PerformVMULCombine.
       //  - An i32 multiply by a constant of the form +/-(2^N +/- 1) << S is
       //    rewritten as shift/add/sub; the result is installed with
       //    DCI.CombineTo and SDValue() is returned.
       // Nothing is done on Thumb1, before legalization, or when called by the
       // legalizer.
8357   SelectionDAG &DAG = DCI.DAG;
8358
8359   if (Subtarget->isThumb1Only())
8360     return SDValue();
8361
8362   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8363     return SDValue();
8364
8365   EVT VT = N->getValueType(0);
8366   if (VT.is64BitVector() || VT.is128BitVector())
8367     return PerformVMULCombine(N, DCI, Subtarget);
8368   if (VT != MVT::i32)
8369     return SDValue();
8370
       // Only a constant right-hand operand is handled.
8371   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8372   if (!C)
8373     return SDValue();
8374
       // Split the multiplier into (MulAmt >> ShiftAmt) << ShiftAmt; the shift is
       // re-applied at the end as a final SHL.
8375   int64_t MulAmt = C->getSExtValue();
8376   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
8377
       // Keep the shift amount below 32 (presumably guards the MulAmt == 0 case,
       // where the trailing-zero count can exceed 31 -- TODO confirm).
8378   ShiftAmt = ShiftAmt & (32 - 1);
8379   SDValue V = N->getOperand(0);
8380   SDLoc DL(N);
8381
8382   SDValue Res;
8383   MulAmt >>= ShiftAmt;
8384
8385   if (MulAmt >= 0) {
8386     if (isPowerOf2_32(MulAmt - 1)) {
8387       // (mul x, 2^N + 1) => (add (shl x, N), x)
8388       Res = DAG.getNode(ISD::ADD, DL, VT,
8389                         V,
8390                         DAG.getNode(ISD::SHL, DL, VT,
8391                                     V,
8392                                     DAG.getConstant(Log2_32(MulAmt - 1),
8393                                                     MVT::i32)));
8394     } else if (isPowerOf2_32(MulAmt + 1)) {
8395       // (mul x, 2^N - 1) => (sub (shl x, N), x)
8396       Res = DAG.getNode(ISD::SUB, DL, VT,
8397                         DAG.getNode(ISD::SHL, DL, VT,
8398                                     V,
8399                                     DAG.getConstant(Log2_32(MulAmt + 1),
8400                                                     MVT::i32)),
8401                         V);
8402     } else
8403       return SDValue();
8404   } else {
         // Negative multiplier: work with its magnitude.
8405     uint64_t MulAmtAbs = -MulAmt;
8406     if (isPowerOf2_32(MulAmtAbs + 1)) {
8407       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
8408       Res = DAG.getNode(ISD::SUB, DL, VT,
8409                         V,
8410                         DAG.getNode(ISD::SHL, DL, VT,
8411                                     V,
8412                                     DAG.getConstant(Log2_32(MulAmtAbs + 1),
8413                                                     MVT::i32)));
8414     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
8415       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
8416       Res = DAG.getNode(ISD::ADD, DL, VT,
8417                         V,
8418                         DAG.getNode(ISD::SHL, DL, VT,
8419                                     V,
8420                                     DAG.getConstant(Log2_32(MulAmtAbs-1),
8421                                                     MVT::i32)));
           // Negate the sum by subtracting it from zero.
8422       Res = DAG.getNode(ISD::SUB, DL, VT,
8423                         DAG.getConstant(0, MVT::i32),Res);
8424
8425     } else
8426       return SDValue();
8427   }
8428
       // Re-apply the power-of-two factor stripped from the multiplier above.
8429   if (ShiftAmt != 0)
8430     Res = DAG.getNode(ISD::SHL, DL, VT,
8431                       Res, DAG.getConstant(ShiftAmt, MVT::i32));
8432
8433   // Do not add new nodes to DAG combiner worklist.
       // The replacement is installed here, so an empty SDValue is returned.
8434   DCI.CombineTo(N, Res, false);
8435   return SDValue();
8436 }
8437
8438 static SDValue PerformANDCombine(SDNode *N,
8439                                  TargetLowering::DAGCombinerInfo &DCI,
8440                                  const ARMSubtarget *Subtarget) {
       // Target-specific dag combine for ISD::AND. Two rewrites are tried in
       // order:
       //  1. (and x, splat-constant) -> ARMISD::VBICIMM when the complemented
       //     splat is accepted by isNEONModifiedImm.
       //  2. (and (select cc, -1, c), x) -> (select cc, x, (and x, c)), except
       //     on Thumb1.
       // Returns the replacement value, or SDValue() when neither applies or
       // the result type is not legal.
8441
8442   // Attempt to use immediate-form VBIC
8443   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
8444   SDLoc dl(N);
8445   EVT VT = N->getValueType(0);
8446   SelectionDAG &DAG = DCI.DAG;
8447
8448   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8449     return SDValue();
8450
8451   APInt SplatBits, SplatUndef;
8452   unsigned SplatBitSize;
8453   bool HasAnyUndefs;
8454   if (BVN &&
8455       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
8456     if (SplatBitSize <= 64) {
8457       EVT VbicVT;
           // The splat is complemented before encoding, since the node being
           // built is a bit-clear rather than an and.
8458       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
8459                                       SplatUndef.getZExtValue(), SplatBitSize,
8460                                       DAG, VbicVT, VT.is128BitVector(),
8461                                       OtherModImm);
8462       if (Val.getNode()) {
             // Operate in the type chosen by isNEONModifiedImm, then cast back.
8463         SDValue Input =
8464           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
8465         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
8466         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
8467       }
8468     }
8469   }
8470
8471   if (!Subtarget->isThumb1Only()) {
8472     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
8473     SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
8474     if (Result.getNode())
8475       return Result;
8476   }
8477
8478   return SDValue();
8479 }
8480
8481 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
8482 static SDValue PerformORCombine(SDNode *N,
8483                                 TargetLowering::DAGCombinerInfo &DCI,
8484                                 const ARMSubtarget *Subtarget) {
8485   // Attempt to use immediate-form VORR
8486   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
8487   SDLoc dl(N);
8488   EVT VT = N->getValueType(0);
8489   SelectionDAG &DAG = DCI.DAG;
8490
8491   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8492     return SDValue();
8493
8494   APInt SplatBits, SplatUndef;
8495   unsigned SplatBitSize;
8496   bool HasAnyUndefs;
8497   if (BVN && Subtarget->hasNEON() &&
8498       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
8499     if (SplatBitSize <= 64) {
8500       EVT VorrVT;
8501       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
8502                                       SplatUndef.getZExtValue(), SplatBitSize,
8503                                       DAG, VorrVT, VT.is128BitVector(),
8504                                       OtherModImm);
8505       if (Val.getNode()) {
8506         SDValue Input =
8507           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
8508         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
8509         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
8510       }
8511     }
8512   }
8513
8514   if (!Subtarget->isThumb1Only()) {
8515     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
8516     SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
8517     if (Result.getNode())
8518       return Result;
8519   }
8520
8521   // The code below optimizes (or (and X, Y), Z).
8522   // The AND operand needs to have a single user to make these optimizations
8523   // profitable.
8524   SDValue N0 = N->getOperand(0);
8525   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
8526     return SDValue();
8527   SDValue N1 = N->getOperand(1);
8528
8529   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
8530   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
8531       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
8532     APInt SplatUndef;
8533     unsigned SplatBitSize;
8534     bool HasAnyUndefs;
8535
8536     APInt SplatBits0, SplatBits1;
8537     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
8538     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
8539     // Ensure that the second operand of both ands are constants
8540     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
8541                                       HasAnyUndefs) && !HasAnyUndefs) {
8542         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
8543                                           HasAnyUndefs) && !HasAnyUndefs) {
8544             // Ensure that the bit width of the constants are the same and that
8545             // the splat arguments are logical inverses as per the pattern we
8546             // are trying to simplify.
8547             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
8548                 SplatBits0 == ~SplatBits1) {
8549                 // Canonicalize the vector type to make instruction selection
8550                 // simpler.
8551                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
8552                 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
8553                                              N0->getOperand(1),
8554                                              N0->getOperand(0),
8555                                              N1->getOperand(0));
8556                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
8557             }
8558         }
8559     }
8560   }
8561
8562   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
8563   // reasonable.
8564
8565   // BFI is only available on V6T2+
8566   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
8567     return SDValue();
8568
8569   SDLoc DL(N);
8570   // 1) or (and A, mask), val => ARMbfi A, val, mask
8571   //      iff (val & mask) == val
8572   //
8573   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
8574   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
8575   //          && mask == ~mask2
8576   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
8577   //          && ~mask == mask2
8578   //  (i.e., copy a bitfield value into another bitfield of the same width)
8579
8580   if (VT != MVT::i32)
8581     return SDValue();
8582
8583   SDValue N00 = N0.getOperand(0);
8584
8585   // The value and the mask need to be constants so we can verify this is
8586   // actually a bitfield set. If the mask is 0xffff, we can do better
8587   // via a movt instruction, so don't use BFI in that case.
8588   SDValue MaskOp = N0.getOperand(1);
8589   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
8590   if (!MaskC)
8591     return SDValue();
8592   unsigned Mask = MaskC->getZExtValue();
8593   if (Mask == 0xffff)
8594     return SDValue();
8595   SDValue Res;
8596   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
8597   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
8598   if (N1C) {
8599     unsigned Val = N1C->getZExtValue();
8600     if ((Val & ~Mask) != Val)
8601       return SDValue();
8602
8603     if (ARM::isBitFieldInvertedMask(Mask)) {
8604       Val >>= countTrailingZeros(~Mask);
8605
8606       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
8607                         DAG.getConstant(Val, MVT::i32),
8608                         DAG.getConstant(Mask, MVT::i32));
8609
8610       // Do not add new nodes to DAG combiner worklist.
8611       DCI.CombineTo(N, Res, false);
8612       return SDValue();
8613     }
8614   } else if (N1.getOpcode() == ISD::AND) {
8615     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
8616     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
8617     if (!N11C)
8618       return SDValue();
8619     unsigned Mask2 = N11C->getZExtValue();
8620
8621     // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
8622     // as is to match.
8623     if (ARM::isBitFieldInvertedMask(Mask) &&
8624         (Mask == ~Mask2)) {
8625       // The pack halfword instruction works better for masks that fit it,
8626       // so use that when it's available.
8627       if (Subtarget->hasT2ExtractPack() &&
8628           (Mask == 0xffff || Mask == 0xffff0000))
8629         return SDValue();
8630       // 2a
8631       unsigned amt = countTrailingZeros(Mask2);
8632       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
8633                         DAG.getConstant(amt, MVT::i32));
8634       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
8635                         DAG.getConstant(Mask, MVT::i32));
8636       // Do not add new nodes to DAG combiner worklist.
8637       DCI.CombineTo(N, Res, false);
8638       return SDValue();
8639     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
8640                (~Mask == Mask2)) {
8641       // The pack halfword instruction works better for masks that fit it,
8642       // so use that when it's available.
8643       if (Subtarget->hasT2ExtractPack() &&
8644           (Mask2 == 0xffff || Mask2 == 0xffff0000))
8645         return SDValue();
8646       // 2b
8647       unsigned lsb = countTrailingZeros(Mask);
8648       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
8649                         DAG.getConstant(lsb, MVT::i32));
8650       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
8651                         DAG.getConstant(Mask2, MVT::i32));
8652       // Do not add new nodes to DAG combiner worklist.
8653       DCI.CombineTo(N, Res, false);
8654       return SDValue();
8655     }
8656   }
8657
8658   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
8659       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
8660       ARM::isBitFieldInvertedMask(~Mask)) {
8661     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
8662     // where lsb(mask) == #shamt and masked bits of B are known zero.
8663     SDValue ShAmt = N00.getOperand(1);
8664     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
8665     unsigned LSB = countTrailingZeros(Mask);
8666     if (ShAmtC != LSB)
8667       return SDValue();
8668
8669     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
8670                       DAG.getConstant(~Mask, MVT::i32));
8671
8672     // Do not add new nodes to DAG combiner worklist.
8673     DCI.CombineTo(N, Res, false);
8674   }
8675
8676   return SDValue();
8677 }
8678
8679 static SDValue PerformXORCombine(SDNode *N,
8680                                  TargetLowering::DAGCombinerInfo &DCI,
8681                                  const ARMSubtarget *Subtarget) {
8682   EVT VT = N->getValueType(0);
8683   SelectionDAG &DAG = DCI.DAG;
8684
8685   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8686     return SDValue();
8687
8688   if (!Subtarget->isThumb1Only()) {
8689     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
8690     SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
8691     if (Result.getNode())
8692       return Result;
8693   }
8694
8695   return SDValue();
8696 }
8697
8698 /// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
8699 /// the bits being cleared by the AND are not demanded by the BFI.
8700 static SDValue PerformBFICombine(SDNode *N,
8701                                  TargetLowering::DAGCombinerInfo &DCI) {
8702   SDValue N1 = N->getOperand(1);
8703   if (N1.getOpcode() == ISD::AND) {
8704     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
8705     if (!N11C)
8706       return SDValue();
8707     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
8708     unsigned LSB = countTrailingZeros(~InvMask);
8709     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
8710     unsigned Mask = (1 << Width)-1;
8711     unsigned Mask2 = N11C->getZExtValue();
8712     if ((Mask & (~Mask2)) == 0)
8713       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
8714                              N->getOperand(0), N1.getOperand(0),
8715                              N->getOperand(2));
8716   }
8717   return SDValue();
8718 }
8719
8720 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
8721 /// ARMISD::VMOVRRD.
8722 static SDValue PerformVMOVRRDCombine(SDNode *N,
8723                                      TargetLowering::DAGCombinerInfo &DCI) {
8724   // vmovrrd(vmovdrr x, y) -> x,y
8725   SDValue InDouble = N->getOperand(0);
8726   if (InDouble.getOpcode() == ARMISD::VMOVDRR)
8727     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
8728
8729   // vmovrrd(load f64) -> (load i32), (load i32)
8730   SDNode *InNode = InDouble.getNode();
8731   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
8732       InNode->getValueType(0) == MVT::f64 &&
8733       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
8734       !cast<LoadSDNode>(InNode)->isVolatile()) {
8735     // TODO: Should this be done for non-FrameIndex operands?
8736     LoadSDNode *LD = cast<LoadSDNode>(InNode);
8737
8738     SelectionDAG &DAG = DCI.DAG;
8739     SDLoc DL(LD);
8740     SDValue BasePtr = LD->getBasePtr();
8741     SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
8742                                  LD->getPointerInfo(), LD->isVolatile(),
8743                                  LD->isNonTemporal(), LD->isInvariant(),
8744                                  LD->getAlignment());
8745
8746     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
8747                                     DAG.getConstant(4, MVT::i32));
8748     SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
8749                                  LD->getPointerInfo(), LD->isVolatile(),
8750                                  LD->isNonTemporal(), LD->isInvariant(),
8751                                  std::min(4U, LD->getAlignment() / 2));
8752
8753     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
8754     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
8755     DCI.RemoveFromWorklist(LD);
8756     DAG.DeleteNode(LD);
8757     return Result;
8758   }
8759
8760   return SDValue();
8761 }
8762
8763 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
8764 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
8765 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
8766   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
8767   SDValue Op0 = N->getOperand(0);
8768   SDValue Op1 = N->getOperand(1);
8769   if (Op0.getOpcode() == ISD::BITCAST)
8770     Op0 = Op0.getOperand(0);
8771   if (Op1.getOpcode() == ISD::BITCAST)
8772     Op1 = Op1.getOperand(0);
8773   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
8774       Op0.getNode() == Op1.getNode() &&
8775       Op0.getResNo() == 0 && Op1.getResNo() == 1)
8776     return DAG.getNode(ISD::BITCAST, SDLoc(N),
8777                        N->getValueType(0), Op0.getOperand(0));
8778   return SDValue();
8779 }
8780
8781 /// PerformSTORECombine - Target-specific dag combine xforms for
8782 /// ISD::STORE.
8783 static SDValue PerformSTORECombine(SDNode *N,
8784                                    TargetLowering::DAGCombinerInfo &DCI) {
8785   StoreSDNode *St = cast<StoreSDNode>(N);
8786   if (St->isVolatile())
8787     return SDValue();
8788
8789   // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
8790   // pack all of the elements in one place.  Next, store to memory in fewer
8791   // chunks.
8792   SDValue StVal = St->getValue();
8793   EVT VT = StVal.getValueType();
8794   if (St->isTruncatingStore() && VT.isVector()) {
8795     SelectionDAG &DAG = DCI.DAG;
8796     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8797     EVT StVT = St->getMemoryVT();
8798     unsigned NumElems = VT.getVectorNumElements();
8799     assert(StVT != VT && "Cannot truncate to the same type");
8800     unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
8801     unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
8802
8803     // From, To sizes and ElemCount must be pow of two
8804     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
8805
8806     // We are going to use the original vector elt for storing.
8807     // Accumulated smaller vector elements must be a multiple of the store size.
8808     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
8809
8810     unsigned SizeRatio  = FromEltSz / ToEltSz;
8811     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
8812
8813     // Create a type on which we perform the shuffle.
8814     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
8815                                      NumElems*SizeRatio);
8816     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
8817
8818     SDLoc DL(St);
8819     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
8820     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
8821     for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
8822
8823     // Can't shuffle using an illegal type.
8824     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
8825
8826     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
8827                                 DAG.getUNDEF(WideVec.getValueType()),
8828                                 ShuffleVec.data());
8829     // At this point all of the data is stored at the bottom of the
8830     // register. We now need to save it to mem.
8831
8832     // Find the largest store unit
8833     MVT StoreType = MVT::i8;
8834     for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
8835          tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
8836       MVT Tp = (MVT::SimpleValueType)tp;
8837       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
8838         StoreType = Tp;
8839     }
8840     // Didn't find a legal store type.
8841     if (!TLI.isTypeLegal(StoreType))
8842       return SDValue();
8843
8844     // Bitcast the original vector into a vector of store-size units
8845     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
8846             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
8847     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
8848     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
8849     SmallVector<SDValue, 8> Chains;
8850     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
8851                                         TLI.getPointerTy());
8852     SDValue BasePtr = St->getBasePtr();
8853
8854     // Perform one or more big stores into memory.
8855     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
8856     for (unsigned I = 0; I < E; I++) {
8857       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
8858                                    StoreType, ShuffWide,
8859                                    DAG.getIntPtrConstant(I));
8860       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
8861                                 St->getPointerInfo(), St->isVolatile(),
8862                                 St->isNonTemporal(), St->getAlignment());
8863       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
8864                             Increment);
8865       Chains.push_back(Ch);
8866     }
8867     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
8868                        Chains.size());
8869   }
8870
8871   if (!ISD::isNormalStore(St))
8872     return SDValue();
8873
8874   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
8875   // ARM stores of arguments in the same cache line.
8876   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
8877       StVal.getNode()->hasOneUse()) {
8878     SelectionDAG  &DAG = DCI.DAG;
8879     SDLoc DL(St);
8880     SDValue BasePtr = St->getBasePtr();
8881     SDValue NewST1 = DAG.getStore(St->getChain(), DL,
8882                                   StVal.getNode()->getOperand(0), BasePtr,
8883                                   St->getPointerInfo(), St->isVolatile(),
8884                                   St->isNonTemporal(), St->getAlignment());
8885
8886     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
8887                                     DAG.getConstant(4, MVT::i32));
8888     return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
8889                         OffsetPtr, St->getPointerInfo(), St->isVolatile(),
8890                         St->isNonTemporal(),
8891                         std::min(4U, St->getAlignment() / 2));
8892   }
8893
8894   if (StVal.getValueType() != MVT::i64 ||
8895       StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8896     return SDValue();
8897
8898   // Bitcast an i64 store extracted from a vector to f64.
8899   // Otherwise, the i64 value will be legalized to a pair of i32 values.
8900   SelectionDAG &DAG = DCI.DAG;
8901   SDLoc dl(StVal);
8902   SDValue IntVec = StVal.getOperand(0);
8903   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
8904                                  IntVec.getValueType().getVectorNumElements());
8905   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
8906   SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8907                                Vec, StVal.getOperand(1));
8908   dl = SDLoc(N);
8909   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
8910   // Make the DAGCombiner fold the bitcasts.
8911   DCI.AddToWorklist(Vec.getNode());
8912   DCI.AddToWorklist(ExtElt.getNode());
8913   DCI.AddToWorklist(V.getNode());
8914   return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
8915                       St->getPointerInfo(), St->isVolatile(),
8916                       St->isNonTemporal(), St->getAlignment(),
8917                       St->getTBAAInfo());
8918 }
8919
8920 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
8921 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
8922 /// i64 vector to have f64 elements, since the value can then be loaded
8923 /// directly into a VFP register.
8924 static bool hasNormalLoadOperand(SDNode *N) {
8925   unsigned NumElts = N->getValueType(0).getVectorNumElements();
8926   for (unsigned i = 0; i < NumElts; ++i) {
8927     SDNode *Elt = N->getOperand(i).getNode();
8928     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
8929       return true;
8930   }
8931   return false;
8932 }
8933
8934 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
8935 /// ISD::BUILD_VECTOR.
8936 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
8937                                           TargetLowering::DAGCombinerInfo &DCI){
8938   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
8939   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
8940   // into a pair of GPRs, which is fine when the value is used as a scalar,
8941   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
8942   SelectionDAG &DAG = DCI.DAG;
8943   if (N->getNumOperands() == 2) {
8944     SDValue RV = PerformVMOVDRRCombine(N, DAG);
8945     if (RV.getNode())
8946       return RV;
8947   }
8948
8949   // Load i64 elements as f64 values so that type legalization does not split
8950   // them up into i32 values.
8951   EVT VT = N->getValueType(0);
8952   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
8953     return SDValue();
8954   SDLoc dl(N);
8955   SmallVector<SDValue, 8> Ops;
8956   unsigned NumElts = VT.getVectorNumElements();
8957   for (unsigned i = 0; i < NumElts; ++i) {
8958     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
8959     Ops.push_back(V);
8960     // Make the DAGCombiner fold the bitcast.
8961     DCI.AddToWorklist(V.getNode());
8962   }
8963   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
8964   SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
8965   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
8966 }
8967
8968 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
8969 static SDValue
8970 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
8971   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
8972   // At that time, we may have inserted bitcasts from integer to float.
8973   // If these bitcasts have survived DAGCombine, change the lowering of this
8974   // BUILD_VECTOR in something more vector friendly, i.e., that does not
8975   // force to use floating point types.
8976
8977   // Make sure we can change the type of the vector.
8978   // This is possible iff:
8979   // 1. The vector is only used in a bitcast to a integer type. I.e.,
8980   //    1.1. Vector is used only once.
8981   //    1.2. Use is a bit convert to an integer type.
8982   // 2. The size of its operands are 32-bits (64-bits are not legal).
8983   EVT VT = N->getValueType(0);
8984   EVT EltVT = VT.getVectorElementType();
8985
8986   // Check 1.1. and 2.
8987   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
8988     return SDValue();
8989
8990   // By construction, the input type must be float.
8991   assert(EltVT == MVT::f32 && "Unexpected type!");
8992
8993   // Check 1.2.
8994   SDNode *Use = *N->use_begin();
8995   if (Use->getOpcode() != ISD::BITCAST ||
8996       Use->getValueType(0).isFloatingPoint())
8997     return SDValue();
8998
8999   // Check profitability.
9000   // Model is, if more than half of the relevant operands are bitcast from
9001   // i32, turn the build_vector into a sequence of insert_vector_elt.
9002   // Relevant operands are everything that is not statically
9003   // (i.e., at compile time) bitcasted.
9004   unsigned NumOfBitCastedElts = 0;
9005   unsigned NumElts = VT.getVectorNumElements();
9006   unsigned NumOfRelevantElts = NumElts;
9007   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
9008     SDValue Elt = N->getOperand(Idx);
9009     if (Elt->getOpcode() == ISD::BITCAST) {
9010       // Assume only bit cast to i32 will go away.
9011       if (Elt->getOperand(0).getValueType() == MVT::i32)
9012         ++NumOfBitCastedElts;
9013     } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
9014       // Constants are statically casted, thus do not count them as
9015       // relevant operands.
9016       --NumOfRelevantElts;
9017   }
9018
9019   // Check if more than half of the elements require a non-free bitcast.
9020   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
9021     return SDValue();
9022
9023   SelectionDAG &DAG = DCI.DAG;
9024   // Create the new vector type.
9025   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
9026   // Check if the type is legal.
9027   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9028   if (!TLI.isTypeLegal(VecVT))
9029     return SDValue();
9030
9031   // Combine:
9032   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
9033   // => BITCAST INSERT_VECTOR_ELT
9034   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
9035   //                      (BITCAST EN), N.
9036   SDValue Vec = DAG.getUNDEF(VecVT);
9037   SDLoc dl(N);
9038   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
9039     SDValue V = N->getOperand(Idx);
9040     if (V.getOpcode() == ISD::UNDEF)
9041       continue;
9042     if (V.getOpcode() == ISD::BITCAST &&
9043         V->getOperand(0).getValueType() == MVT::i32)
9044       // Fold obvious case.
9045       V = V.getOperand(0);
9046     else {
9047       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
9048       // Make the DAGCombiner fold the bitcasts.
9049       DCI.AddToWorklist(V.getNode());
9050     }
9051     SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
9052     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
9053   }
9054   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
9055   // Make the DAGCombiner fold the bitcasts.
9056   DCI.AddToWorklist(Vec.getNode());
9057   return Vec;
9058 }
9059
9060 /// PerformInsertEltCombine - Target-specific dag combine xforms for
9061 /// ISD::INSERT_VECTOR_ELT.
9062 static SDValue PerformInsertEltCombine(SDNode *N,
9063                                        TargetLowering::DAGCombinerInfo &DCI) {
9064   // Bitcast an i64 load inserted into a vector to f64.
9065   // Otherwise, the i64 value will be legalized to a pair of i32 values.
9066   EVT VT = N->getValueType(0);
9067   SDNode *Elt = N->getOperand(1).getNode();
9068   if (VT.getVectorElementType() != MVT::i64 ||
9069       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
9070     return SDValue();
9071
9072   SelectionDAG &DAG = DCI.DAG;
9073   SDLoc dl(N);
9074   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
9075                                  VT.getVectorNumElements());
9076   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
9077   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
9078   // Make the DAGCombiner fold the bitcasts.
9079   DCI.AddToWorklist(Vec.getNode());
9080   DCI.AddToWorklist(V.getNode());
9081   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
9082                                Vec, V, N->getOperand(2));
9083   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
9084 }
9085
9086 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
9087 /// ISD::VECTOR_SHUFFLE.
9088 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
9089   // The LLVM shufflevector instruction does not require the shuffle mask
9090   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
9091   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
9092   // operands do not match the mask length, they are extended by concatenating
9093   // them with undef vectors.  That is probably the right thing for other
9094   // targets, but for NEON it is better to concatenate two double-register
9095   // size vector operands into a single quad-register size vector.  Do that
9096   // transformation here:
9097   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
9098   //   shuffle(concat(v1, v2), undef)
9099   SDValue Op0 = N->getOperand(0);
9100   SDValue Op1 = N->getOperand(1);
9101   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
9102       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
9103       Op0.getNumOperands() != 2 ||
9104       Op1.getNumOperands() != 2)
9105     return SDValue();
9106   SDValue Concat0Op1 = Op0.getOperand(1);
9107   SDValue Concat1Op1 = Op1.getOperand(1);
9108   if (Concat0Op1.getOpcode() != ISD::UNDEF ||
9109       Concat1Op1.getOpcode() != ISD::UNDEF)
9110     return SDValue();
9111   // Skip the transformation if any of the types are illegal.
9112   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9113   EVT VT = N->getValueType(0);
9114   if (!TLI.isTypeLegal(VT) ||
9115       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
9116       !TLI.isTypeLegal(Concat1Op1.getValueType()))
9117     return SDValue();
9118
9119   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
9120                                   Op0.getOperand(0), Op1.getOperand(0));
9121   // Translate the shuffle mask.
9122   SmallVector<int, 16> NewMask;
9123   unsigned NumElts = VT.getVectorNumElements();
9124   unsigned HalfElts = NumElts/2;
9125   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
9126   for (unsigned n = 0; n < NumElts; ++n) {
9127     int MaskElt = SVN->getMaskElt(n);
9128     int NewElt = -1;
9129     if (MaskElt < (int)HalfElts)
9130       NewElt = MaskElt;
9131     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
9132       NewElt = HalfElts + MaskElt - NumElts;
9133     NewMask.push_back(NewElt);
9134   }
9135   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
9136                               DAG.getUNDEF(VT), NewMask.data());
9137 }
9138
9139 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
9140 /// NEON load/store intrinsics to merge base address updates.
9141 static SDValue CombineBaseUpdate(SDNode *N,
9142                                  TargetLowering::DAGCombinerInfo &DCI) {
9143   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9144     return SDValue();
9145
9146   SelectionDAG &DAG = DCI.DAG;
9147   bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
9148                       N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
9149   unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
9150   SDValue Addr = N->getOperand(AddrOpIdx);
9151
9152   // Search for a use of the address operand that is an increment.
9153   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
9154          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
9155     SDNode *User = *UI;
9156     if (User->getOpcode() != ISD::ADD ||
9157         UI.getUse().getResNo() != Addr.getResNo())
9158       continue;
9159
9160     // Check that the add is independent of the load/store.  Otherwise, folding
9161     // it would create a cycle.
9162     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
9163       continue;
9164
9165     // Find the new opcode for the updating load/store.
9166     bool isLoad = true;
9167     bool isLaneOp = false;
9168     unsigned NewOpc = 0;
9169     unsigned NumVecs = 0;
9170     if (isIntrinsic) {
9171       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
9172       switch (IntNo) {
9173       default: llvm_unreachable("unexpected intrinsic for Neon base update");
9174       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
9175         NumVecs = 1; break;
9176       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
9177         NumVecs = 2; break;
9178       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
9179         NumVecs = 3; break;
9180       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
9181         NumVecs = 4; break;
9182       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
9183         NumVecs = 2; isLaneOp = true; break;
9184       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
9185         NumVecs = 3; isLaneOp = true; break;
9186       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
9187         NumVecs = 4; isLaneOp = true; break;
9188       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
9189         NumVecs = 1; isLoad = false; break;
9190       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
9191         NumVecs = 2; isLoad = false; break;
9192       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
9193         NumVecs = 3; isLoad = false; break;
9194       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
9195         NumVecs = 4; isLoad = false; break;
9196       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
9197         NumVecs = 2; isLoad = false; isLaneOp = true; break;
9198       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
9199         NumVecs = 3; isLoad = false; isLaneOp = true; break;
9200       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
9201         NumVecs = 4; isLoad = false; isLaneOp = true; break;
9202       }
9203     } else {
9204       isLaneOp = true;
9205       switch (N->getOpcode()) {
9206       default: llvm_unreachable("unexpected opcode for Neon base update");
9207       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
9208       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
9209       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
9210       }
9211     }
9212
9213     // Find the size of memory referenced by the load/store.
9214     EVT VecTy;
9215     if (isLoad)
9216       VecTy = N->getValueType(0);
9217     else
9218       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
9219     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
9220     if (isLaneOp)
9221       NumBytes /= VecTy.getVectorNumElements();
9222
9223     // If the increment is a constant, it must match the memory ref size.
9224     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
9225     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
9226       uint64_t IncVal = CInc->getZExtValue();
9227       if (IncVal != NumBytes)
9228         continue;
9229     } else if (NumBytes >= 3 * 16) {
9230       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
9231       // separate instructions that make it harder to use a non-constant update.
9232       continue;
9233     }
9234
9235     // Create the new updating load/store node.
9236     EVT Tys[6];
9237     unsigned NumResultVecs = (isLoad ? NumVecs : 0);
9238     unsigned n;
9239     for (n = 0; n < NumResultVecs; ++n)
9240       Tys[n] = VecTy;
9241     Tys[n++] = MVT::i32;
9242     Tys[n] = MVT::Other;
9243     SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
9244     SmallVector<SDValue, 8> Ops;
9245     Ops.push_back(N->getOperand(0)); // incoming chain
9246     Ops.push_back(N->getOperand(AddrOpIdx));
9247     Ops.push_back(Inc);
9248     for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
9249       Ops.push_back(N->getOperand(i));
9250     }
9251     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
9252     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
9253                                            Ops.data(), Ops.size(),
9254                                            MemInt->getMemoryVT(),
9255                                            MemInt->getMemOperand());
9256
9257     // Update the uses.
9258     std::vector<SDValue> NewResults;
9259     for (unsigned i = 0; i < NumResultVecs; ++i) {
9260       NewResults.push_back(SDValue(UpdN.getNode(), i));
9261     }
9262     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
9263     DCI.CombineTo(N, NewResults);
9264     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
9265
9266     break;
9267   }
9268   return SDValue();
9269 }
9270
9271 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
9272 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
9273 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
9274 /// return true.
9275 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
9276   SelectionDAG &DAG = DCI.DAG;
9277   EVT VT = N->getValueType(0);
9278   // vldN-dup instructions only support 64-bit vectors for N > 1.
9279   if (!VT.is64BitVector())
9280     return false;
9281
9282   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
9283   SDNode *VLD = N->getOperand(0).getNode();
9284   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
9285     return false;
9286   unsigned NumVecs = 0;
9287   unsigned NewOpc = 0;
9288   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
9289   if (IntNo == Intrinsic::arm_neon_vld2lane) {
9290     NumVecs = 2;
9291     NewOpc = ARMISD::VLD2DUP;
9292   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
9293     NumVecs = 3;
9294     NewOpc = ARMISD::VLD3DUP;
9295   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
9296     NumVecs = 4;
9297     NewOpc = ARMISD::VLD4DUP;
9298   } else {
9299     return false;
9300   }
9301
9302   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
9303   // numbers match the load.
9304   unsigned VLDLaneNo =
9305     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
9306   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
9307        UI != UE; ++UI) {
9308     // Ignore uses of the chain result.
9309     if (UI.getUse().getResNo() == NumVecs)
9310       continue;
9311     SDNode *User = *UI;
9312     if (User->getOpcode() != ARMISD::VDUPLANE ||
9313         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
9314       return false;
9315   }
9316
9317   // Create the vldN-dup node.
9318   EVT Tys[5];
9319   unsigned n;
9320   for (n = 0; n < NumVecs; ++n)
9321     Tys[n] = VT;
9322   Tys[n] = MVT::Other;
9323   SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
9324   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
9325   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
9326   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
9327                                            Ops, 2, VLDMemInt->getMemoryVT(),
9328                                            VLDMemInt->getMemOperand());
9329
9330   // Update the uses.
9331   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
9332        UI != UE; ++UI) {
9333     unsigned ResNo = UI.getUse().getResNo();
9334     // Ignore uses of the chain result.
9335     if (ResNo == NumVecs)
9336       continue;
9337     SDNode *User = *UI;
9338     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
9339   }
9340
9341   // Now the vldN-lane intrinsic is dead except for its chain result.
9342   // Update uses of the chain.
9343   std::vector<SDValue> VLDDupResults;
9344   for (unsigned n = 0; n < NumVecs; ++n)
9345     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
9346   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
9347   DCI.CombineTo(VLD, VLDDupResults);
9348
9349   return true;
9350 }
9351
9352 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
9353 /// ARMISD::VDUPLANE.
9354 static SDValue PerformVDUPLANECombine(SDNode *N,
9355                                       TargetLowering::DAGCombinerInfo &DCI) {
9356   SDValue Op = N->getOperand(0);
9357
9358   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
9359   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
9360   if (CombineVLDDUP(N, DCI))
9361     return SDValue(N, 0);
9362
9363   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
9364   // redundant.  Ignore bit_converts for now; element sizes are checked below.
9365   while (Op.getOpcode() == ISD::BITCAST)
9366     Op = Op.getOperand(0);
9367   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
9368     return SDValue();
9369
9370   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
9371   unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
9372   // The canonical VMOV for a zero vector uses a 32-bit element size.
9373   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9374   unsigned EltBits;
9375   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
9376     EltSize = 8;
9377   EVT VT = N->getValueType(0);
9378   if (EltSize > VT.getVectorElementType().getSizeInBits())
9379     return SDValue();
9380
9381   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
9382 }
9383
9384 // isConstVecPow2 - Return true if each vector element is a power of 2, all
9385 // elements are the same constant, C, and Log2(C) ranges from 1 to 32.
9386 static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
9387 {
9388   integerPart cN;
9389   integerPart c0 = 0;
9390   for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
9391        I != E; I++) {
9392     ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
9393     if (!C)
9394       return false;
9395
9396     bool isExact;
9397     APFloat APF = C->getValueAPF();
9398     if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
9399         != APFloat::opOK || !isExact)
9400       return false;
9401
9402     c0 = (I == 0) ? cN : c0;
9403     if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
9404       return false;
9405   }
9406   C = c0;
9407   return true;
9408 }
9409
9410 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
9411 /// can replace combinations of VMUL and VCVT (floating-point to integer)
9412 /// when the VMUL has a constant operand that is a power of 2.
9413 ///
9414 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
9415 ///  vmul.f32        d16, d17, d16
9416 ///  vcvt.s32.f32    d16, d16
9417 /// becomes:
9418 ///  vcvt.s32.f32    d16, d16, #3
9419 static SDValue PerformVCVTCombine(SDNode *N,
9420                                   TargetLowering::DAGCombinerInfo &DCI,
9421                                   const ARMSubtarget *Subtarget) {
9422   SelectionDAG &DAG = DCI.DAG;
9423   SDValue Op = N->getOperand(0);
9424
9425   if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
9426       Op.getOpcode() != ISD::FMUL)
9427     return SDValue();
9428
9429   uint64_t C;
9430   SDValue N0 = Op->getOperand(0);
9431   SDValue ConstVec = Op->getOperand(1);
9432   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
9433
9434   if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
9435       !isConstVecPow2(ConstVec, isSigned, C))
9436     return SDValue();
9437
9438   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
9439   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
9440   if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
9441     // These instructions only exist converting from f32 to i32. We can handle
9442     // smaller integers by generating an extra truncate, but larger ones would
9443     // be lossy.
9444     return SDValue();
9445   }
9446
9447   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
9448     Intrinsic::arm_neon_vcvtfp2fxu;
9449   unsigned NumLanes = Op.getValueType().getVectorNumElements();
9450   SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
9451                                  NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
9452                                  DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
9453                                  DAG.getConstant(Log2_64(C), MVT::i32));
9454
9455   if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
9456     FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
9457
9458   return FixConv;
9459 }
9460
9461 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
9462 /// can replace combinations of VCVT (integer to floating-point) and VDIV
9463 /// when the VDIV has a constant operand that is a power of 2.
9464 ///
9465 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
9466 ///  vcvt.f32.s32    d16, d16
9467 ///  vdiv.f32        d16, d17, d16
9468 /// becomes:
9469 ///  vcvt.f32.s32    d16, d16, #3
9470 static SDValue PerformVDIVCombine(SDNode *N,
9471                                   TargetLowering::DAGCombinerInfo &DCI,
9472                                   const ARMSubtarget *Subtarget) {
9473   SelectionDAG &DAG = DCI.DAG;
9474   SDValue Op = N->getOperand(0);
9475   unsigned OpOpcode = Op.getNode()->getOpcode();
9476
9477   if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
9478       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
9479     return SDValue();
9480
9481   uint64_t C;
9482   SDValue ConstVec = N->getOperand(1);
9483   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
9484
9485   if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
9486       !isConstVecPow2(ConstVec, isSigned, C))
9487     return SDValue();
9488
9489   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
9490   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
9491   if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
9492     // These instructions only exist converting from i32 to f32. We can handle
9493     // smaller integers by generating an extra extend, but larger ones would
9494     // be lossy.
9495     return SDValue();
9496   }
9497
9498   SDValue ConvInput = Op.getOperand(0);
9499   unsigned NumLanes = Op.getValueType().getVectorNumElements();
9500   if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
9501     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
9502                             SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
9503                             ConvInput);
9504
9505   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
9506     Intrinsic::arm_neon_vcvtfxu2fp;
9507   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
9508                      Op.getValueType(),
9509                      DAG.getConstant(IntrinsicOpcode, MVT::i32),
9510                      ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
9511 }
9512
9513 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
9514 /// operand of a vector shift operation, where all the elements of the
9515 /// build_vector must have the same constant integer value.
9516 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
9517   // Ignore bit_converts.
9518   while (Op.getOpcode() == ISD::BITCAST)
9519     Op = Op.getOperand(0);
9520   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
9521   APInt SplatBits, SplatUndef;
9522   unsigned SplatBitSize;
9523   bool HasAnyUndefs;
9524   if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
9525                                       HasAnyUndefs, ElementBits) ||
9526       SplatBitSize > ElementBits)
9527     return false;
9528   Cnt = SplatBits.getSExtValue();
9529   return true;
9530 }
9531
9532 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
9533 /// operand of a vector shift left operation.  That value must be in the range:
9534 ///   0 <= Value < ElementBits for a left shift; or
9535 ///   0 <= Value <= ElementBits for a long left shift.
9536 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
9537   assert(VT.isVector() && "vector shift count is not a vector type");
9538   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
9539   if (! getVShiftImm(Op, ElementBits, Cnt))
9540     return false;
9541   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
9542 }
9543
9544 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
9545 /// operand of a vector shift right operation.  For a shift opcode, the value
9546 /// is positive, but for an intrinsic the value count must be negative. The
9547 /// absolute value must be in the range:
9548 ///   1 <= |Value| <= ElementBits for a right shift; or
9549 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
9550 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
9551                          int64_t &Cnt) {
9552   assert(VT.isVector() && "vector shift count is not a vector type");
9553   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
9554   if (! getVShiftImm(Op, ElementBits, Cnt))
9555     return false;
9556   if (isIntrinsic)
9557     Cnt = -Cnt;
9558   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
9559 }
9560
9561 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
9562 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
9563   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9564   switch (IntNo) {
9565   default:
9566     // Don't do anything for most intrinsics.
9567     break;
9568
9569   // Vector shifts: check for immediate versions and lower them.
9570   // Note: This is done during DAG combining instead of DAG legalizing because
9571   // the build_vectors for 64-bit vector element shift counts are generally
9572   // not legal, and it is hard to see their values after they get legalized to
9573   // loads from a constant pool.
9574   case Intrinsic::arm_neon_vshifts:
9575   case Intrinsic::arm_neon_vshiftu:
9576   case Intrinsic::arm_neon_vshiftls:
9577   case Intrinsic::arm_neon_vshiftlu:
9578   case Intrinsic::arm_neon_vshiftn:
9579   case Intrinsic::arm_neon_vrshifts:
9580   case Intrinsic::arm_neon_vrshiftu:
9581   case Intrinsic::arm_neon_vrshiftn:
9582   case Intrinsic::arm_neon_vqshifts:
9583   case Intrinsic::arm_neon_vqshiftu:
9584   case Intrinsic::arm_neon_vqshiftsu:
9585   case Intrinsic::arm_neon_vqshiftns:
9586   case Intrinsic::arm_neon_vqshiftnu:
9587   case Intrinsic::arm_neon_vqshiftnsu:
9588   case Intrinsic::arm_neon_vqrshiftns:
9589   case Intrinsic::arm_neon_vqrshiftnu:
9590   case Intrinsic::arm_neon_vqrshiftnsu: {
9591     EVT VT = N->getOperand(1).getValueType();
9592     int64_t Cnt;
9593     unsigned VShiftOpc = 0;
9594
9595     switch (IntNo) {
9596     case Intrinsic::arm_neon_vshifts:
9597     case Intrinsic::arm_neon_vshiftu:
9598       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
9599         VShiftOpc = ARMISD::VSHL;
9600         break;
9601       }
9602       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
9603         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
9604                      ARMISD::VSHRs : ARMISD::VSHRu);
9605         break;
9606       }
9607       return SDValue();
9608
9609     case Intrinsic::arm_neon_vshiftls:
9610     case Intrinsic::arm_neon_vshiftlu:
9611       if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
9612         break;
9613       llvm_unreachable("invalid shift count for vshll intrinsic");
9614
9615     case Intrinsic::arm_neon_vrshifts:
9616     case Intrinsic::arm_neon_vrshiftu:
9617       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
9618         break;
9619       return SDValue();
9620
9621     case Intrinsic::arm_neon_vqshifts:
9622     case Intrinsic::arm_neon_vqshiftu:
9623       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
9624         break;
9625       return SDValue();
9626
9627     case Intrinsic::arm_neon_vqshiftsu:
9628       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
9629         break;
9630       llvm_unreachable("invalid shift count for vqshlu intrinsic");
9631
9632     case Intrinsic::arm_neon_vshiftn:
9633     case Intrinsic::arm_neon_vrshiftn:
9634     case Intrinsic::arm_neon_vqshiftns:
9635     case Intrinsic::arm_neon_vqshiftnu:
9636     case Intrinsic::arm_neon_vqshiftnsu:
9637     case Intrinsic::arm_neon_vqrshiftns:
9638     case Intrinsic::arm_neon_vqrshiftnu:
9639     case Intrinsic::arm_neon_vqrshiftnsu:
9640       // Narrowing shifts require an immediate right shift.
9641       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
9642         break;
9643       llvm_unreachable("invalid shift count for narrowing vector shift "
9644                        "intrinsic");
9645
9646     default:
9647       llvm_unreachable("unhandled vector shift");
9648     }
9649
9650     switch (IntNo) {
9651     case Intrinsic::arm_neon_vshifts:
9652     case Intrinsic::arm_neon_vshiftu:
9653       // Opcode already set above.
9654       break;
9655     case Intrinsic::arm_neon_vshiftls:
9656     case Intrinsic::arm_neon_vshiftlu:
9657       if (Cnt == VT.getVectorElementType().getSizeInBits())
9658         VShiftOpc = ARMISD::VSHLLi;
9659       else
9660         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
9661                      ARMISD::VSHLLs : ARMISD::VSHLLu);
9662       break;
9663     case Intrinsic::arm_neon_vshiftn:
9664       VShiftOpc = ARMISD::VSHRN; break;
9665     case Intrinsic::arm_neon_vrshifts:
9666       VShiftOpc = ARMISD::VRSHRs; break;
9667     case Intrinsic::arm_neon_vrshiftu:
9668       VShiftOpc = ARMISD::VRSHRu; break;
9669     case Intrinsic::arm_neon_vrshiftn:
9670       VShiftOpc = ARMISD::VRSHRN; break;
9671     case Intrinsic::arm_neon_vqshifts:
9672       VShiftOpc = ARMISD::VQSHLs; break;
9673     case Intrinsic::arm_neon_vqshiftu:
9674       VShiftOpc = ARMISD::VQSHLu; break;
9675     case Intrinsic::arm_neon_vqshiftsu:
9676       VShiftOpc = ARMISD::VQSHLsu; break;
9677     case Intrinsic::arm_neon_vqshiftns:
9678       VShiftOpc = ARMISD::VQSHRNs; break;
9679     case Intrinsic::arm_neon_vqshiftnu:
9680       VShiftOpc = ARMISD::VQSHRNu; break;
9681     case Intrinsic::arm_neon_vqshiftnsu:
9682       VShiftOpc = ARMISD::VQSHRNsu; break;
9683     case Intrinsic::arm_neon_vqrshiftns:
9684       VShiftOpc = ARMISD::VQRSHRNs; break;
9685     case Intrinsic::arm_neon_vqrshiftnu:
9686       VShiftOpc = ARMISD::VQRSHRNu; break;
9687     case Intrinsic::arm_neon_vqrshiftnsu:
9688       VShiftOpc = ARMISD::VQRSHRNsu; break;
9689     }
9690
9691     return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
9692                        N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
9693   }
9694
9695   case Intrinsic::arm_neon_vshiftins: {
9696     EVT VT = N->getOperand(1).getValueType();
9697     int64_t Cnt;
9698     unsigned VShiftOpc = 0;
9699
9700     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
9701       VShiftOpc = ARMISD::VSLI;
9702     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
9703       VShiftOpc = ARMISD::VSRI;
9704     else {
9705       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
9706     }
9707
9708     return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
9709                        N->getOperand(1), N->getOperand(2),
9710                        DAG.getConstant(Cnt, MVT::i32));
9711   }
9712
9713   case Intrinsic::arm_neon_vqrshifts:
9714   case Intrinsic::arm_neon_vqrshiftu:
9715     // No immediate versions of these to check for.
9716     break;
9717   }
9718
9719   return SDValue();
9720 }
9721
9722 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
9723 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
9724 /// combining instead of DAG legalizing because the build_vectors for 64-bit
9725 /// vector element shift counts are generally not legal, and it is hard to see
9726 /// their values after they get legalized to loads from a constant pool.
9727 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
9728                                    const ARMSubtarget *ST) {
9729   EVT VT = N->getValueType(0);
9730   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
9731     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
9732     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
9733     SDValue N1 = N->getOperand(1);
9734     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
9735       SDValue N0 = N->getOperand(0);
9736       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
9737           DAG.MaskedValueIsZero(N0.getOperand(0),
9738                                 APInt::getHighBitsSet(32, 16)))
9739         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
9740     }
9741   }
9742
9743   // Nothing to be done for scalar shifts.
9744   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9745   if (!VT.isVector() || !TLI.isTypeLegal(VT))
9746     return SDValue();
9747
9748   assert(ST->hasNEON() && "unexpected vector shift");
9749   int64_t Cnt;
9750
9751   switch (N->getOpcode()) {
9752   default: llvm_unreachable("unexpected shift opcode");
9753
9754   case ISD::SHL:
9755     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
9756       return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
9757                          DAG.getConstant(Cnt, MVT::i32));
9758     break;
9759
9760   case ISD::SRA:
9761   case ISD::SRL:
9762     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
9763       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
9764                             ARMISD::VSHRs : ARMISD::VSHRu);
9765       return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
9766                          DAG.getConstant(Cnt, MVT::i32));
9767     }
9768   }
9769   return SDValue();
9770 }
9771
9772 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
9773 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
9774 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
9775                                     const ARMSubtarget *ST) {
9776   SDValue N0 = N->getOperand(0);
9777
9778   // Check for sign- and zero-extensions of vector extract operations of 8-
9779   // and 16-bit vector elements.  NEON supports these directly.  They are
9780   // handled during DAG combining because type legalization will promote them
9781   // to 32-bit types and it is messy to recognize the operations after that.
9782   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9783     SDValue Vec = N0.getOperand(0);
9784     SDValue Lane = N0.getOperand(1);
9785     EVT VT = N->getValueType(0);
9786     EVT EltVT = N0.getValueType();
9787     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9788
9789     if (VT == MVT::i32 &&
9790         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
9791         TLI.isTypeLegal(Vec.getValueType()) &&
9792         isa<ConstantSDNode>(Lane)) {
9793
9794       unsigned Opc = 0;
9795       switch (N->getOpcode()) {
9796       default: llvm_unreachable("unexpected opcode");
9797       case ISD::SIGN_EXTEND:
9798         Opc = ARMISD::VGETLANEs;
9799         break;
9800       case ISD::ZERO_EXTEND:
9801       case ISD::ANY_EXTEND:
9802         Opc = ARMISD::VGETLANEu;
9803         break;
9804       }
9805       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
9806     }
9807   }
9808
9809   return SDValue();
9810 }
9811
9812 /// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
9813 /// to match f32 max/min patterns to use NEON vmax/vmin instructions.
9814 static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
9815                                        const ARMSubtarget *ST) {
9816   // If the target supports NEON, try to use vmax/vmin instructions for f32
9817   // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
9818   // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
9819   // a NaN; only do the transformation when it matches that behavior.
9820
9821   // For now only do this when using NEON for FP operations; if using VFP, it
9822   // is not obvious that the benefit outweighs the cost of switching to the
9823   // NEON pipeline.
9824   if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
9825       N->getValueType(0) != MVT::f32)
9826     return SDValue();
9827
9828   SDValue CondLHS = N->getOperand(0);
9829   SDValue CondRHS = N->getOperand(1);
9830   SDValue LHS = N->getOperand(2);
9831   SDValue RHS = N->getOperand(3);
9832   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
9833
9834   unsigned Opcode = 0;
9835   bool IsReversed;
9836   if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
9837     IsReversed = false; // x CC y ? x : y
9838   } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
9839     IsReversed = true ; // x CC y ? y : x
9840   } else {
9841     return SDValue();
9842   }
9843
9844   bool IsUnordered;
9845   switch (CC) {
9846   default: break;
9847   case ISD::SETOLT:
9848   case ISD::SETOLE:
9849   case ISD::SETLT:
9850   case ISD::SETLE:
9851   case ISD::SETULT:
9852   case ISD::SETULE:
9853     // If LHS is NaN, an ordered comparison will be false and the result will
9854     // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
9855     // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
9856     IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
9857     if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
9858       break;
9859     // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
9860     // will return -0, so vmin can only be used for unsafe math or if one of
9861     // the operands is known to be nonzero.
9862     if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
9863         !DAG.getTarget().Options.UnsafeFPMath &&
9864         !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9865       break;
9866     Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
9867     break;
9868
9869   case ISD::SETOGT:
9870   case ISD::SETOGE:
9871   case ISD::SETGT:
9872   case ISD::SETGE:
9873   case ISD::SETUGT:
9874   case ISD::SETUGE:
9875     // If LHS is NaN, an ordered comparison will be false and the result will
9876     // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
9877     // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
9878     IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
9879     if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
9880       break;
9881     // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
9882     // will return +0, so vmax can only be used for unsafe math or if one of
9883     // the operands is known to be nonzero.
9884     if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
9885         !DAG.getTarget().Options.UnsafeFPMath &&
9886         !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9887       break;
9888     Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
9889     break;
9890   }
9891
9892   if (!Opcode)
9893     return SDValue();
9894   return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
9895 }
9896
9897 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
9898 SDValue
9899 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
9900   SDValue Cmp = N->getOperand(4);
9901   if (Cmp.getOpcode() != ARMISD::CMPZ)
9902     // Only looking at EQ and NE cases.
9903     return SDValue();
9904
9905   EVT VT = N->getValueType(0);
9906   SDLoc dl(N);
9907   SDValue LHS = Cmp.getOperand(0);
9908   SDValue RHS = Cmp.getOperand(1);
9909   SDValue FalseVal = N->getOperand(0);
9910   SDValue TrueVal = N->getOperand(1);
9911   SDValue ARMcc = N->getOperand(2);
9912   ARMCC::CondCodes CC =
9913     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
9914
9915   // Simplify
9916   //   mov     r1, r0
9917   //   cmp     r1, x
9918   //   mov     r0, y
9919   //   moveq   r0, x
9920   // to
9921   //   cmp     r0, x
9922   //   movne   r0, y
9923   //
9924   //   mov     r1, r0
9925   //   cmp     r1, x
9926   //   mov     r0, x
9927   //   movne   r0, y
9928   // to
9929   //   cmp     r0, x
9930   //   movne   r0, y
9931   /// FIXME: Turn this into a target neutral optimization?
9932   SDValue Res;
9933   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
9934     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
9935                       N->getOperand(3), Cmp);
9936   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
9937     SDValue ARMcc;
9938     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
9939     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
9940                       N->getOperand(3), NewCmp);
9941   }
9942
9943   if (Res.getNode()) {
9944     APInt KnownZero, KnownOne;
9945     DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
9946     // Capture demanded bits information that would be otherwise lost.
9947     if (KnownZero == 0xfffffffe)
9948       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
9949                         DAG.getValueType(MVT::i1));
9950     else if (KnownZero == 0xffffff00)
9951       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
9952                         DAG.getValueType(MVT::i8));
9953     else if (KnownZero == 0xffff0000)
9954       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
9955                         DAG.getValueType(MVT::i16));
9956   }
9957
9958   return Res;
9959 }
9960
9961 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
9962                                              DAGCombinerInfo &DCI) const {
9963   switch (N->getOpcode()) {
9964   default: break;
9965   case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
9966   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
9967   case ISD::SUB:        return PerformSUBCombine(N, DCI);
9968   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
9969   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
9970   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
9971   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
9972   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
9973   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
9974   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
9975   case ISD::STORE:      return PerformSTORECombine(N, DCI);
9976   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
9977   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
9978   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
9979   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
9980   case ISD::FP_TO_SINT:
9981   case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
9982   case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
9983   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
9984   case ISD::SHL:
9985   case ISD::SRA:
9986   case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
9987   case ISD::SIGN_EXTEND:
9988   case ISD::ZERO_EXTEND:
9989   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
9990   case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
9991   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
9992   case ARMISD::VLD2DUP:
9993   case ARMISD::VLD3DUP:
9994   case ARMISD::VLD4DUP:
9995     return CombineBaseUpdate(N, DCI);
9996   case ARMISD::BUILD_VECTOR:
9997     return PerformARMBUILD_VECTORCombine(N, DCI);
9998   case ISD::INTRINSIC_VOID:
9999   case ISD::INTRINSIC_W_CHAIN:
10000     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
10001     case Intrinsic::arm_neon_vld1:
10002     case Intrinsic::arm_neon_vld2:
10003     case Intrinsic::arm_neon_vld3:
10004     case Intrinsic::arm_neon_vld4:
10005     case Intrinsic::arm_neon_vld2lane:
10006     case Intrinsic::arm_neon_vld3lane:
10007     case Intrinsic::arm_neon_vld4lane:
10008     case Intrinsic::arm_neon_vst1:
10009     case Intrinsic::arm_neon_vst2:
10010     case Intrinsic::arm_neon_vst3:
10011     case Intrinsic::arm_neon_vst4:
10012     case Intrinsic::arm_neon_vst2lane:
10013     case Intrinsic::arm_neon_vst3lane:
10014     case Intrinsic::arm_neon_vst4lane:
10015       return CombineBaseUpdate(N, DCI);
10016     default: break;
10017     }
10018     break;
10019   }
10020   return SDValue();
10021 }
10022
10023 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
10024                                                           EVT VT) const {
10025   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
10026 }
10027
10028 bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
10029   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
10030   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
10031
10032   switch (VT.getSimpleVT().SimpleTy) {
10033   default:
10034     return false;
10035   case MVT::i8:
10036   case MVT::i16:
10037   case MVT::i32: {
10038     // Unaligned access can use (for example) LRDB, LRDH, LDR
10039     if (AllowsUnaligned) {
10040       if (Fast)
10041         *Fast = Subtarget->hasV7Ops();
10042       return true;
10043     }
10044     return false;
10045   }
10046   case MVT::f64:
10047   case MVT::v2f64: {
10048     // For any little-endian targets with neon, we can support unaligned ld/st
10049     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
10050     // A big-endian target may also explictly support unaligned accesses
10051     if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
10052       if (Fast)
10053         *Fast = true;
10054       return true;
10055     }
10056     return false;
10057   }
10058   }
10059 }
10060
/// memOpAlign - Return true when both the source and the destination
/// alignment are acceptable for AlignCheck.  An alignment is acceptable if it
/// is zero or a multiple of AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  if (SrcAlign != 0 && SrcAlign % AlignCheck != 0)
    return false;
  return DstAlign == 0 || DstAlign % AlignCheck == 0;
}
10066
10067 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
10068                                            unsigned DstAlign, unsigned SrcAlign,
10069                                            bool IsMemset, bool ZeroMemset,
10070                                            bool MemcpyStrSrc,
10071                                            MachineFunction &MF) const {
10072   const Function *F = MF.getFunction();
10073
10074   // See if we can use NEON instructions for this...
10075   if ((!IsMemset || ZeroMemset) &&
10076       Subtarget->hasNEON() &&
10077       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
10078                                        Attribute::NoImplicitFloat)) {
10079     bool Fast;
10080     if (Size >= 16 &&
10081         (memOpAlign(SrcAlign, DstAlign, 16) ||
10082          (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) {
10083       return MVT::v2f64;
10084     } else if (Size >= 8 &&
10085                (memOpAlign(SrcAlign, DstAlign, 8) ||
10086                 (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) {
10087       return MVT::f64;
10088     }
10089   }
10090
10091   // Lowering to i32/i16 if the size permits.
10092   if (Size >= 4)
10093     return MVT::i32;
10094   else if (Size >= 2)
10095     return MVT::i16;
10096
10097   // Let the target-independent logic figure it out.
10098   return MVT::Other;
10099 }
10100
10101 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
10102   if (Val.getOpcode() != ISD::LOAD)
10103     return false;
10104
10105   EVT VT1 = Val.getValueType();
10106   if (!VT1.isSimple() || !VT1.isInteger() ||
10107       !VT2.isSimple() || !VT2.isInteger())
10108     return false;
10109
10110   switch (VT1.getSimpleVT().SimpleTy) {
10111   default: break;
10112   case MVT::i1:
10113   case MVT::i8:
10114   case MVT::i16:
10115     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
10116     return true;
10117   }
10118
10119   return false;
10120 }
10121
10122 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
10123   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10124     return false;
10125
10126   if (!isTypeLegal(EVT::getEVT(Ty1)))
10127     return false;
10128
10129   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
10130
10131   // Assuming the caller doesn't have a zeroext or signext return parameter,
10132   // truncation all the way down to i1 is valid.
10133   return true;
10134 }
10135
10136
10137 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
10138   if (V < 0)
10139     return false;
10140
10141   unsigned Scale = 1;
10142   switch (VT.getSimpleVT().SimpleTy) {
10143   default: return false;
10144   case MVT::i1:
10145   case MVT::i8:
10146     // Scale == 1;
10147     break;
10148   case MVT::i16:
10149     // Scale == 2;
10150     Scale = 2;
10151     break;
10152   case MVT::i32:
10153     // Scale == 4;
10154     Scale = 4;
10155     break;
10156   }
10157
10158   if ((V & (Scale - 1)) != 0)
10159     return false;
10160   V /= Scale;
10161   return V == (V & ((1LL << 5) - 1));
10162 }
10163
10164 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
10165                                       const ARMSubtarget *Subtarget) {
10166   bool isNeg = false;
10167   if (V < 0) {
10168     isNeg = true;
10169     V = - V;
10170   }
10171
10172   switch (VT.getSimpleVT().SimpleTy) {
10173   default: return false;
10174   case MVT::i1:
10175   case MVT::i8:
10176   case MVT::i16:
10177   case MVT::i32:
10178     // + imm12 or - imm8
10179     if (isNeg)
10180       return V == (V & ((1LL << 8) - 1));
10181     return V == (V & ((1LL << 12) - 1));
10182   case MVT::f32:
10183   case MVT::f64:
10184     // Same as ARM mode. FIXME: NEON?
10185     if (!Subtarget->hasVFP2())
10186       return false;
10187     if ((V & 3) != 0)
10188       return false;
10189     V >>= 2;
10190     return V == (V & ((1LL << 8) - 1));
10191   }
10192 }
10193
10194 /// isLegalAddressImmediate - Return true if the integer value can be used
10195 /// as the offset of the target addressing mode for load / store of the
10196 /// given type.
10197 static bool isLegalAddressImmediate(int64_t V, EVT VT,
10198                                     const ARMSubtarget *Subtarget) {
10199   if (V == 0)
10200     return true;
10201
10202   if (!VT.isSimple())
10203     return false;
10204
10205   if (Subtarget->isThumb1Only())
10206     return isLegalT1AddressImmediate(V, VT);
10207   else if (Subtarget->isThumb2())
10208     return isLegalT2AddressImmediate(V, VT, Subtarget);
10209
10210   // ARM mode.
10211   if (V < 0)
10212     V = - V;
10213   switch (VT.getSimpleVT().SimpleTy) {
10214   default: return false;
10215   case MVT::i1:
10216   case MVT::i8:
10217   case MVT::i32:
10218     // +- imm12
10219     return V == (V & ((1LL << 12) - 1));
10220   case MVT::i16:
10221     // +- imm8
10222     return V == (V & ((1LL << 8) - 1));
10223   case MVT::f32:
10224   case MVT::f64:
10225     if (!Subtarget->hasVFP2()) // FIXME: NEON?
10226       return false;
10227     if ((V & 3) != 0)
10228       return false;
10229     V >>= 2;
10230     return V == (V & ((1LL << 8) - 1));
10231   }
10232 }
10233
10234 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
10235                                                       EVT VT) const {
10236   int Scale = AM.Scale;
10237   if (Scale < 0)
10238     return false;
10239
10240   switch (VT.getSimpleVT().SimpleTy) {
10241   default: return false;
10242   case MVT::i1:
10243   case MVT::i8:
10244   case MVT::i16:
10245   case MVT::i32:
10246     if (Scale == 1)
10247       return true;
10248     // r + r << imm
10249     Scale = Scale & ~1;
10250     return Scale == 2 || Scale == 4 || Scale == 8;
10251   case MVT::i64:
10252     // r + r
10253     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
10254       return true;
10255     return false;
10256   case MVT::isVoid:
10257     // Note, we allow "void" uses (basically, uses that aren't loads or
10258     // stores), because arm allows folding a scale into many arithmetic
10259     // operations.  This should be made more precise and revisited later.
10260
10261     // Allow r << imm, but the imm has to be a multiple of two.
10262     if (Scale & 1) return false;
10263     return isPowerOf2_32(Scale);
10264   }
10265 }
10266
10267 /// isLegalAddressingMode - Return true if the addressing mode represented
10268 /// by AM is legal for this target, for a load/store of the specified type.
10269 bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
10270                                               Type *Ty) const {
10271   EVT VT = getValueType(Ty, true);
10272   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
10273     return false;
10274
10275   // Can never fold addr of global into load/store.
10276   if (AM.BaseGV)
10277     return false;
10278
10279   switch (AM.Scale) {
10280   case 0:  // no scale reg, must be "r+i" or "r", or "i".
10281     break;
10282   case 1:
10283     if (Subtarget->isThumb1Only())
10284       return false;
10285     // FALL THROUGH.
10286   default:
10287     // ARM doesn't support any R+R*scale+imm addr modes.
10288     if (AM.BaseOffs)
10289       return false;
10290
10291     if (!VT.isSimple())
10292       return false;
10293
10294     if (Subtarget->isThumb2())
10295       return isLegalT2ScaledAddressingMode(AM, VT);
10296
10297     int Scale = AM.Scale;
10298     switch (VT.getSimpleVT().SimpleTy) {
10299     default: return false;
10300     case MVT::i1:
10301     case MVT::i8:
10302     case MVT::i32:
10303       if (Scale < 0) Scale = -Scale;
10304       if (Scale == 1)
10305         return true;
10306       // r + r << imm
10307       return isPowerOf2_32(Scale & ~1);
10308     case MVT::i16:
10309     case MVT::i64:
10310       // r + r
10311       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
10312         return true;
10313       return false;
10314
10315     case MVT::isVoid:
10316       // Note, we allow "void" uses (basically, uses that aren't loads or
10317       // stores), because arm allows folding a scale into many arithmetic
10318       // operations.  This should be made more precise and revisited later.
10319
10320       // Allow r << imm, but the imm has to be a multiple of two.
10321       if (Scale & 1) return false;
10322       return isPowerOf2_32(Scale);
10323     }
10324   }
10325   return true;
10326 }
10327
10328 /// isLegalICmpImmediate - Return true if the specified immediate is legal
10329 /// icmp immediate, that is the target has icmp instructions which can compare
10330 /// a register against the immediate without having to materialize the
10331 /// immediate into a register.
10332 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
10333   // Thumb2 and ARM modes can use cmn for negative immediates.
10334   if (!Subtarget->isThumb())
10335     return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
10336   if (Subtarget->isThumb2())
10337     return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
10338   // Thumb1 doesn't have cmn, and only 8-bit immediates.
10339   return Imm >= 0 && Imm <= 255;
10340 }
10341
10342 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
10343 /// *or sub* immediate, that is the target has add or sub instructions which can
10344 /// add a register with the immediate without having to materialize the
10345 /// immediate into a register.
10346 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
10347   // Same encoding for add/sub, just flip the sign.
10348   int64_t AbsImm = llvm::abs64(Imm);
10349   if (!Subtarget->isThumb())
10350     return ARM_AM::getSOImmVal(AbsImm) != -1;
10351   if (Subtarget->isThumb2())
10352     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
10353   // Thumb1 only has 8-bit unsigned immediate.
10354   return AbsImm >= 0 && AbsImm <= 255;
10355 }
10356
10357 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
10358                                       bool isSEXTLoad, SDValue &Base,
10359                                       SDValue &Offset, bool &isInc,
10360                                       SelectionDAG &DAG) {
10361   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
10362     return false;
10363
10364   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
10365     // AddressingMode 3
10366     Base = Ptr->getOperand(0);
10367     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10368       int RHSC = (int)RHS->getZExtValue();
10369       if (RHSC < 0 && RHSC > -256) {
10370         assert(Ptr->getOpcode() == ISD::ADD);
10371         isInc = false;
10372         Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
10373         return true;
10374       }
10375     }
10376     isInc = (Ptr->getOpcode() == ISD::ADD);
10377     Offset = Ptr->getOperand(1);
10378     return true;
10379   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
10380     // AddressingMode 2
10381     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10382       int RHSC = (int)RHS->getZExtValue();
10383       if (RHSC < 0 && RHSC > -0x1000) {
10384         assert(Ptr->getOpcode() == ISD::ADD);
10385         isInc = false;
10386         Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
10387         Base = Ptr->getOperand(0);
10388         return true;
10389       }
10390     }
10391
10392     if (Ptr->getOpcode() == ISD::ADD) {
10393       isInc = true;
10394       ARM_AM::ShiftOpc ShOpcVal=
10395         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
10396       if (ShOpcVal != ARM_AM::no_shift) {
10397         Base = Ptr->getOperand(1);
10398         Offset = Ptr->getOperand(0);
10399       } else {
10400         Base = Ptr->getOperand(0);
10401         Offset = Ptr->getOperand(1);
10402       }
10403       return true;
10404     }
10405
10406     isInc = (Ptr->getOpcode() == ISD::ADD);
10407     Base = Ptr->getOperand(0);
10408     Offset = Ptr->getOperand(1);
10409     return true;
10410   }
10411
10412   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
10413   return false;
10414 }
10415
10416 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
10417                                      bool isSEXTLoad, SDValue &Base,
10418                                      SDValue &Offset, bool &isInc,
10419                                      SelectionDAG &DAG) {
10420   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
10421     return false;
10422
10423   Base = Ptr->getOperand(0);
10424   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10425     int RHSC = (int)RHS->getZExtValue();
10426     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
10427       assert(Ptr->getOpcode() == ISD::ADD);
10428       isInc = false;
10429       Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
10430       return true;
10431     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
10432       isInc = Ptr->getOpcode() == ISD::ADD;
10433       Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
10434       return true;
10435     }
10436   }
10437
10438   return false;
10439 }
10440
10441 /// getPreIndexedAddressParts - returns true by value, base pointer and
10442 /// offset pointer and addressing mode by reference if the node's address
10443 /// can be legally represented as pre-indexed load / store address.
10444 bool
10445 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
10446                                              SDValue &Offset,
10447                                              ISD::MemIndexedMode &AM,
10448                                              SelectionDAG &DAG) const {
10449   if (Subtarget->isThumb1Only())
10450     return false;
10451
10452   EVT VT;
10453   SDValue Ptr;
10454   bool isSEXTLoad = false;
10455   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10456     Ptr = LD->getBasePtr();
10457     VT  = LD->getMemoryVT();
10458     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
10459   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10460     Ptr = ST->getBasePtr();
10461     VT  = ST->getMemoryVT();
10462   } else
10463     return false;
10464
10465   bool isInc;
10466   bool isLegal = false;
10467   if (Subtarget->isThumb2())
10468     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
10469                                        Offset, isInc, DAG);
10470   else
10471     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
10472                                         Offset, isInc, DAG);
10473   if (!isLegal)
10474     return false;
10475
10476   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
10477   return true;
10478 }
10479
10480 /// getPostIndexedAddressParts - returns true by value, base pointer and
10481 /// offset pointer and addressing mode by reference if this node can be
10482 /// combined with a load / store to form a post-indexed load / store.
10483 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
10484                                                    SDValue &Base,
10485                                                    SDValue &Offset,
10486                                                    ISD::MemIndexedMode &AM,
10487                                                    SelectionDAG &DAG) const {
10488   if (Subtarget->isThumb1Only())
10489     return false;
10490
10491   EVT VT;
10492   SDValue Ptr;
10493   bool isSEXTLoad = false;
10494   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10495     VT  = LD->getMemoryVT();
10496     Ptr = LD->getBasePtr();
10497     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
10498   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10499     VT  = ST->getMemoryVT();
10500     Ptr = ST->getBasePtr();
10501   } else
10502     return false;
10503
10504   bool isInc;
10505   bool isLegal = false;
10506   if (Subtarget->isThumb2())
10507     isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
10508                                        isInc, DAG);
10509   else
10510     isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
10511                                         isInc, DAG);
10512   if (!isLegal)
10513     return false;
10514
10515   if (Ptr != Base) {
10516     // Swap base ptr and offset to catch more post-index load / store when
10517     // it's legal. In Thumb2 mode, offset must be an immediate.
10518     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
10519         !Subtarget->isThumb2())
10520       std::swap(Base, Offset);
10521
10522     // Post-indexed load / store update the base pointer.
10523     if (Ptr != Base)
10524       return false;
10525   }
10526
10527   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
10528   return true;
10529 }
10530
10531 void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
10532                                                        APInt &KnownZero,
10533                                                        APInt &KnownOne,
10534                                                        const SelectionDAG &DAG,
10535                                                        unsigned Depth) const {
10536   unsigned BitWidth = KnownOne.getBitWidth();
10537   KnownZero = KnownOne = APInt(BitWidth, 0);
10538   switch (Op.getOpcode()) {
10539   default: break;
10540   case ARMISD::ADDC:
10541   case ARMISD::ADDE:
10542   case ARMISD::SUBC:
10543   case ARMISD::SUBE:
10544     // These nodes' second result is a boolean
10545     if (Op.getResNo() == 0)
10546       break;
10547     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
10548     break;
10549   case ARMISD::CMOV: {
10550     // Bits are known zero/one if known on the LHS and RHS.
10551     DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
10552     if (KnownZero == 0 && KnownOne == 0) return;
10553
10554     APInt KnownZeroRHS, KnownOneRHS;
10555     DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
10556     KnownZero &= KnownZeroRHS;
10557     KnownOne  &= KnownOneRHS;
10558     return;
10559   }
10560   }
10561 }
10562
10563 //===----------------------------------------------------------------------===//
10564 //                           ARM Inline Assembly Support
10565 //===----------------------------------------------------------------------===//
10566
10567 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
10568   // Looking for "rev" which is V6+.
10569   if (!Subtarget->hasV6Ops())
10570     return false;
10571
10572   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
10573   std::string AsmStr = IA->getAsmString();
10574   SmallVector<StringRef, 4> AsmPieces;
10575   SplitString(AsmStr, AsmPieces, ";\n");
10576
10577   switch (AsmPieces.size()) {
10578   default: return false;
10579   case 1:
10580     AsmStr = AsmPieces[0];
10581     AsmPieces.clear();
10582     SplitString(AsmStr, AsmPieces, " \t,");
10583
10584     // rev $0, $1
10585     if (AsmPieces.size() == 3 &&
10586         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
10587         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
10588       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
10589       if (Ty && Ty->getBitWidth() == 32)
10590         return IntrinsicLowering::LowerToByteSwap(CI);
10591     }
10592     break;
10593   }
10594
10595   return false;
10596 }
10597
10598 /// getConstraintType - Given a constraint letter, return the type of
10599 /// constraint it is for this target.
10600 ARMTargetLowering::ConstraintType
10601 ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
10602   if (Constraint.size() == 1) {
10603     switch (Constraint[0]) {
10604     default:  break;
10605     case 'l': return C_RegisterClass;
10606     case 'w': return C_RegisterClass;
10607     case 'h': return C_RegisterClass;
10608     case 'x': return C_RegisterClass;
10609     case 't': return C_RegisterClass;
10610     case 'j': return C_Other; // Constant for movw.
10611       // An address with a single base register. Due to the way we
10612       // currently handle addresses it is the same as an 'r' memory constraint.
10613     case 'Q': return C_Memory;
10614     }
10615   } else if (Constraint.size() == 2) {
10616     switch (Constraint[0]) {
10617     default: break;
10618     // All 'U+' constraints are addresses.
10619     case 'U': return C_Memory;
10620     }
10621   }
10622   return TargetLowering::getConstraintType(Constraint);
10623 }
10624
10625 /// Examine constraint type and operand type and determine a weight value.
10626 /// This object must already have been set up with the operand type
10627 /// and the current alternative constraint selected.
10628 TargetLowering::ConstraintWeight
10629 ARMTargetLowering::getSingleConstraintMatchWeight(
10630     AsmOperandInfo &info, const char *constraint) const {
10631   ConstraintWeight weight = CW_Invalid;
10632   Value *CallOperandVal = info.CallOperandVal;
10633     // If we don't have a value, we can't do a match,
10634     // but allow it at the lowest weight.
10635   if (CallOperandVal == NULL)
10636     return CW_Default;
10637   Type *type = CallOperandVal->getType();
10638   // Look at the constraint type.
10639   switch (*constraint) {
10640   default:
10641     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10642     break;
10643   case 'l':
10644     if (type->isIntegerTy()) {
10645       if (Subtarget->isThumb())
10646         weight = CW_SpecificReg;
10647       else
10648         weight = CW_Register;
10649     }
10650     break;
10651   case 'w':
10652     if (type->isFloatingPointTy())
10653       weight = CW_Register;
10654     break;
10655   }
10656   return weight;
10657 }
10658
10659 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
10660 RCPair
10661 ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
10662                                                 MVT VT) const {
10663   if (Constraint.size() == 1) {
10664     // GCC ARM Constraint Letters
10665     switch (Constraint[0]) {
10666     case 'l': // Low regs or general regs.
10667       if (Subtarget->isThumb())
10668         return RCPair(0U, &ARM::tGPRRegClass);
10669       return RCPair(0U, &ARM::GPRRegClass);
10670     case 'h': // High regs or no regs.
10671       if (Subtarget->isThumb())
10672         return RCPair(0U, &ARM::hGPRRegClass);
10673       break;
10674     case 'r':
10675       return RCPair(0U, &ARM::GPRRegClass);
10676     case 'w':
10677       if (VT == MVT::f32)
10678         return RCPair(0U, &ARM::SPRRegClass);
10679       if (VT.getSizeInBits() == 64)
10680         return RCPair(0U, &ARM::DPRRegClass);
10681       if (VT.getSizeInBits() == 128)
10682         return RCPair(0U, &ARM::QPRRegClass);
10683       break;
10684     case 'x':
10685       if (VT == MVT::f32)
10686         return RCPair(0U, &ARM::SPR_8RegClass);
10687       if (VT.getSizeInBits() == 64)
10688         return RCPair(0U, &ARM::DPR_8RegClass);
10689       if (VT.getSizeInBits() == 128)
10690         return RCPair(0U, &ARM::QPR_8RegClass);
10691       break;
10692     case 't':
10693       if (VT == MVT::f32)
10694         return RCPair(0U, &ARM::SPRRegClass);
10695       break;
10696     }
10697   }
10698   if (StringRef("{cc}").equals_lower(Constraint))
10699     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
10700
10701   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
10702 }
10703
10704 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10705 /// vector.  If it is invalid, don't add anything to Ops.
10706 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
10707                                                      std::string &Constraint,
10708                                                      std::vector<SDValue>&Ops,
10709                                                      SelectionDAG &DAG) const {
10710   SDValue Result(0, 0);
10711
10712   // Currently only support length 1 constraints.
10713   if (Constraint.length() != 1) return;
10714
10715   char ConstraintLetter = Constraint[0];
10716   switch (ConstraintLetter) {
10717   default: break;
10718   case 'j':
10719   case 'I': case 'J': case 'K': case 'L':
10720   case 'M': case 'N': case 'O':
10721     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
10722     if (!C)
10723       return;
10724
10725     int64_t CVal64 = C->getSExtValue();
10726     int CVal = (int) CVal64;
10727     // None of these constraints allow values larger than 32 bits.  Check
10728     // that the value fits in an int.
10729     if (CVal != CVal64)
10730       return;
10731
10732     switch (ConstraintLetter) {
10733       case 'j':
10734         // Constant suitable for movw, must be between 0 and
10735         // 65535.
10736         if (Subtarget->hasV6T2Ops())
10737           if (CVal >= 0 && CVal <= 65535)
10738             break;
10739         return;
10740       case 'I':
10741         if (Subtarget->isThumb1Only()) {
10742           // This must be a constant between 0 and 255, for ADD
10743           // immediates.
10744           if (CVal >= 0 && CVal <= 255)
10745             break;
10746         } else if (Subtarget->isThumb2()) {
10747           // A constant that can be used as an immediate value in a
10748           // data-processing instruction.
10749           if (ARM_AM::getT2SOImmVal(CVal) != -1)
10750             break;
10751         } else {
10752           // A constant that can be used as an immediate value in a
10753           // data-processing instruction.
10754           if (ARM_AM::getSOImmVal(CVal) != -1)
10755             break;
10756         }
10757         return;
10758
10759       case 'J':
10760         if (Subtarget->isThumb()) {  // FIXME thumb2
10761           // This must be a constant between -255 and -1, for negated ADD
10762           // immediates. This can be used in GCC with an "n" modifier that
10763           // prints the negated value, for use with SUB instructions. It is
10764           // not useful otherwise but is implemented for compatibility.
10765           if (CVal >= -255 && CVal <= -1)
10766             break;
10767         } else {
10768           // This must be a constant between -4095 and 4095. It is not clear
10769           // what this constraint is intended for. Implemented for
10770           // compatibility with GCC.
10771           if (CVal >= -4095 && CVal <= 4095)
10772             break;
10773         }
10774         return;
10775
10776       case 'K':
10777         if (Subtarget->isThumb1Only()) {
10778           // A 32-bit value where only one byte has a nonzero value. Exclude
10779           // zero to match GCC. This constraint is used by GCC internally for
10780           // constants that can be loaded with a move/shift combination.
10781           // It is not useful otherwise but is implemented for compatibility.
10782           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
10783             break;
10784         } else if (Subtarget->isThumb2()) {
10785           // A constant whose bitwise inverse can be used as an immediate
10786           // value in a data-processing instruction. This can be used in GCC
10787           // with a "B" modifier that prints the inverted value, for use with
10788           // BIC and MVN instructions. It is not useful otherwise but is
10789           // implemented for compatibility.
10790           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
10791             break;
10792         } else {
10793           // A constant whose bitwise inverse can be used as an immediate
10794           // value in a data-processing instruction. This can be used in GCC
10795           // with a "B" modifier that prints the inverted value, for use with
10796           // BIC and MVN instructions. It is not useful otherwise but is
10797           // implemented for compatibility.
10798           if (ARM_AM::getSOImmVal(~CVal) != -1)
10799             break;
10800         }
10801         return;
10802
10803       case 'L':
10804         if (Subtarget->isThumb1Only()) {
10805           // This must be a constant between -7 and 7,
10806           // for 3-operand ADD/SUB immediate instructions.
10807           if (CVal >= -7 && CVal < 7)
10808             break;
10809         } else if (Subtarget->isThumb2()) {
10810           // A constant whose negation can be used as an immediate value in a
10811           // data-processing instruction. This can be used in GCC with an "n"
10812           // modifier that prints the negated value, for use with SUB
10813           // instructions. It is not useful otherwise but is implemented for
10814           // compatibility.
10815           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
10816             break;
10817         } else {
10818           // A constant whose negation can be used as an immediate value in a
10819           // data-processing instruction. This can be used in GCC with an "n"
10820           // modifier that prints the negated value, for use with SUB
10821           // instructions. It is not useful otherwise but is implemented for
10822           // compatibility.
10823           if (ARM_AM::getSOImmVal(-CVal) != -1)
10824             break;
10825         }
10826         return;
10827
10828       case 'M':
10829         if (Subtarget->isThumb()) { // FIXME thumb2
10830           // This must be a multiple of 4 between 0 and 1020, for
10831           // ADD sp + immediate.
10832           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
10833             break;
10834         } else {
10835           // A power of two or a constant between 0 and 32.  This is used in
10836           // GCC for the shift amount on shifted register operands, but it is
10837           // useful in general for any shift amounts.
10838           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
10839             break;
10840         }
10841         return;
10842
10843       case 'N':
10844         if (Subtarget->isThumb()) {  // FIXME thumb2
10845           // This must be a constant between 0 and 31, for shift amounts.
10846           if (CVal >= 0 && CVal <= 31)
10847             break;
10848         }
10849         return;
10850
10851       case 'O':
10852         if (Subtarget->isThumb()) {  // FIXME thumb2
10853           // This must be a multiple of 4 between -508 and 508, for
10854           // ADD/SUB sp = sp + immediate.
10855           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
10856             break;
10857         }
10858         return;
10859     }
10860     Result = DAG.getTargetConstant(CVal, Op.getValueType());
10861     break;
10862   }
10863
10864   if (Result.getNode()) {
10865     Ops.push_back(Result);
10866     return;
10867   }
10868   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
10869 }
10870
10871 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
10872   assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
10873   unsigned Opcode = Op->getOpcode();
10874   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
10875       "Invalid opcode for Div/Rem lowering");
10876   bool isSigned = (Opcode == ISD::SDIVREM);
10877   EVT VT = Op->getValueType(0);
10878   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
10879
10880   RTLIB::Libcall LC;
10881   switch (VT.getSimpleVT().SimpleTy) {
10882   default: llvm_unreachable("Unexpected request for libcall!");
10883   case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
10884   case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
10885   case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
10886   case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
10887   }
10888
10889   SDValue InChain = DAG.getEntryNode();
10890
10891   TargetLowering::ArgListTy Args;
10892   TargetLowering::ArgListEntry Entry;
10893   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
10894     EVT ArgVT = Op->getOperand(i).getValueType();
10895     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
10896     Entry.Node = Op->getOperand(i);
10897     Entry.Ty = ArgTy;
10898     Entry.isSExt = isSigned;
10899     Entry.isZExt = !isSigned;
10900     Args.push_back(Entry);
10901   }
10902
10903   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
10904                                          getPointerTy());
10905
10906   Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
10907
10908   SDLoc dl(Op);
10909   TargetLowering::
10910   CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true,
10911                     0, getLibcallCallingConv(LC), /*isTailCall=*/false,
10912                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
10913                     Callee, Args, DAG, dl);
10914   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
10915
10916   return CallInfo.first;
10917 }
10918
10919 bool
10920 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
10921   // The ARM target isn't yet aware of offsets.
10922   return false;
10923 }
10924
10925 bool ARM::isBitFieldInvertedMask(unsigned v) {
10926   if (v == 0xffffffff)
10927     return false;
10928
10929   // there can be 1's on either or both "outsides", all the "inside"
10930   // bits must be 0's
10931   unsigned TO = CountTrailingOnes_32(v);
10932   unsigned LO = CountLeadingOnes_32(v);
10933   v = (v >> TO) << TO;
10934   v = (v << LO) >> LO;
10935   return v == 0;
10936 }
10937
10938 /// isFPImmLegal - Returns true if the target can instruction select the
10939 /// specified FP immediate natively. If false, the legalizer will
10940 /// materialize the FP immediate as a load from a constant pool.
10941 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
10942   if (!Subtarget->hasVFP3())
10943     return false;
10944   if (VT == MVT::f32)
10945     return ARM_AM::getFP32Imm(Imm) != -1;
10946   if (VT == MVT::f64)
10947     return ARM_AM::getFP64Imm(Imm) != -1;
10948   return false;
10949 }
10950
10951 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
10952 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
10953 /// specified in the intrinsic calls.
10954 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
10955                                            const CallInst &I,
10956                                            unsigned Intrinsic) const {
10957   switch (Intrinsic) {
10958   case Intrinsic::arm_neon_vld1:
10959   case Intrinsic::arm_neon_vld2:
10960   case Intrinsic::arm_neon_vld3:
10961   case Intrinsic::arm_neon_vld4:
10962   case Intrinsic::arm_neon_vld2lane:
10963   case Intrinsic::arm_neon_vld3lane:
10964   case Intrinsic::arm_neon_vld4lane: {
10965     Info.opc = ISD::INTRINSIC_W_CHAIN;
10966     // Conservatively set memVT to the entire set of vectors loaded.
10967     uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
10968     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10969     Info.ptrVal = I.getArgOperand(0);
10970     Info.offset = 0;
10971     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
10972     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
10973     Info.vol = false; // volatile loads with NEON intrinsics not supported
10974     Info.readMem = true;
10975     Info.writeMem = false;
10976     return true;
10977   }
10978   case Intrinsic::arm_neon_vst1:
10979   case Intrinsic::arm_neon_vst2:
10980   case Intrinsic::arm_neon_vst3:
10981   case Intrinsic::arm_neon_vst4:
10982   case Intrinsic::arm_neon_vst2lane:
10983   case Intrinsic::arm_neon_vst3lane:
10984   case Intrinsic::arm_neon_vst4lane: {
10985     Info.opc = ISD::INTRINSIC_VOID;
10986     // Conservatively set memVT to the entire set of vectors stored.
10987     unsigned NumElts = 0;
10988     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
10989       Type *ArgTy = I.getArgOperand(ArgI)->getType();
10990       if (!ArgTy->isVectorTy())
10991         break;
10992       NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
10993     }
10994     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10995     Info.ptrVal = I.getArgOperand(0);
10996     Info.offset = 0;
10997     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
10998     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
10999     Info.vol = false; // volatile stores with NEON intrinsics not supported
11000     Info.readMem = false;
11001     Info.writeMem = true;
11002     return true;
11003   }
11004   case Intrinsic::arm_ldrex: {
11005     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
11006     Info.opc = ISD::INTRINSIC_W_CHAIN;
11007     Info.memVT = MVT::getVT(PtrTy->getElementType());
11008     Info.ptrVal = I.getArgOperand(0);
11009     Info.offset = 0;
11010     Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
11011     Info.vol = true;
11012     Info.readMem = true;
11013     Info.writeMem = false;
11014     return true;
11015   }
11016   case Intrinsic::arm_strex: {
11017     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11018     Info.opc = ISD::INTRINSIC_W_CHAIN;
11019     Info.memVT = MVT::getVT(PtrTy->getElementType());
11020     Info.ptrVal = I.getArgOperand(1);
11021     Info.offset = 0;
11022     Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
11023     Info.vol = true;
11024     Info.readMem = false;
11025     Info.writeMem = true;
11026     return true;
11027   }
11028   case Intrinsic::arm_strexd: {
11029     Info.opc = ISD::INTRINSIC_W_CHAIN;
11030     Info.memVT = MVT::i64;
11031     Info.ptrVal = I.getArgOperand(2);
11032     Info.offset = 0;
11033     Info.align = 8;
11034     Info.vol = true;
11035     Info.readMem = false;
11036     Info.writeMem = true;
11037     return true;
11038   }
11039   case Intrinsic::arm_ldrexd: {
11040     Info.opc = ISD::INTRINSIC_W_CHAIN;
11041     Info.memVT = MVT::i64;
11042     Info.ptrVal = I.getArgOperand(0);
11043     Info.offset = 0;
11044     Info.align = 8;
11045     Info.vol = true;
11046     Info.readMem = true;
11047     Info.writeMem = false;
11048     return true;
11049   }
11050   default:
11051     break;
11052   }
11053
11054   return false;
11055 }