lib/Target/ARM/ARMISelLowering.cpp

   1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that ARM uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "ARMISelLowering.h"
  16 #include "ARMCallingConv.h"
  17 #include "ARMConstantPoolValue.h"
  18 #include "ARMMachineFunctionInfo.h"
  19 #include "ARMPerfectShuffle.h"
  20 #include "ARMSubtarget.h"
  21 #include "ARMTargetMachine.h"
  22 #include "ARMTargetObjectFile.h"
  23 #include "MCTargetDesc/ARMAddressingModes.h"
  24 #include "llvm/ADT/Statistic.h"
  25 #include "llvm/ADT/StringExtras.h"
  26 #include "llvm/ADT/StringSwitch.h"
  27 #include "llvm/CodeGen/CallingConvLower.h"
  28 #include "llvm/CodeGen/IntrinsicLowering.h"
  29 #include "llvm/CodeGen/MachineBasicBlock.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/CodeGen/SelectionDAG.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/Function.h"
  40 #include "llvm/IR/GlobalValue.h"
  41 #include "llvm/IR/IRBuilder.h"
  42 #include "llvm/IR/Instruction.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/IntrinsicInst.h"
  45 #include "llvm/IR/Intrinsics.h"
  46 #include "llvm/IR/Type.h"
  47 #include "llvm/MC/MCSectionMachO.h"
  48 #include "llvm/Support/CommandLine.h"
  49 #include "llvm/Support/Debug.h"
  50 #include "llvm/Support/ErrorHandling.h"
  51 #include "llvm/Support/MathExtras.h"
  52 #include "llvm/Support/raw_ostream.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include <utility>
  55 using namespace llvm;
  56
  57 #define DEBUG_TYPE "arm-isel"
  58
  59 STATISTIC(NumTailCalls, "Number of tail calls");
  60 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
  61 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
  62
  63 static cl::opt<bool>
  64 ARMInterworking("arm-interworking", cl::Hidden,
  65   cl::desc("Enable / disable ARM interworking (for debugging only)"),
  66   cl::init(true));
  67
  68 namespace {
  69   class ARMCCState : public CCState {
  70   public:
  71     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
  72                SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
  73                ParmContext PC)
  74         : CCState(CC, isVarArg, MF, locs, C) {
  75       assert(((PC == Call) || (PC == Prologue)) &&
  76              "ARMCCState users must specify whether their context is call"
  77              "or prologue generation.");
  78       CallOrPrologue = PC;
  79     }
  80   };
  81 }
  82
  83 // The APCS parameter registers.
  84 static const MCPhysReg GPRArgRegs[] = {
  85   ARM::R0, ARM::R1, ARM::R2, ARM::R3
  86 };
  87
  88 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
  89                                        MVT PromotedBitwiseVT) {
  90   if (VT != PromotedLdStVT) {
  91     setOperationAction(ISD::LOAD, VT, Promote);
  92     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
  93
  94     setOperationAction(ISD::STORE, VT, Promote);
  95     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  96   }
  97
  98   MVT ElemTy = VT.getVectorElementType();
  99   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
 100     setOperationAction(ISD::SETCC, VT, Custom);
 101   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 102   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 103   if (ElemTy == MVT::i32) {
 104     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
 105     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
 106     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
 107     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
 108   } else {
 109     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 110     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 111     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 112     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 113   }
 114   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
 115   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
 116   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
 117   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 118   setOperationAction(ISD::SELECT,            VT, Expand);
 119   setOperationAction(ISD::SELECT_CC,         VT, Expand);
 120   setOperationAction(ISD::VSELECT,           VT, Expand);
 121   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 122   if (VT.isInteger()) {
 123     setOperationAction(ISD::SHL, VT, Custom);
 124     setOperationAction(ISD::SRA, VT, Custom);
 125     setOperationAction(ISD::SRL, VT, Custom);
 126   }
 127
 128   // Promote all bit-wise operations.
 129   if (VT.isInteger() && VT != PromotedBitwiseVT) {
 130     setOperationAction(ISD::AND, VT, Promote);
 131     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
 132     setOperationAction(ISD::OR,  VT, Promote);
 133     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
 134     setOperationAction(ISD::XOR, VT, Promote);
 135     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
 136   }
 137
 138   // Neon does not support vector divide/remainder operations.
 139   setOperationAction(ISD::SDIV, VT, Expand);
 140   setOperationAction(ISD::UDIV, VT, Expand);
 141   setOperationAction(ISD::FDIV, VT, Expand);
 142   setOperationAction(ISD::SREM, VT, Expand);
 143   setOperationAction(ISD::UREM, VT, Expand);
 144   setOperationAction(ISD::FREM, VT, Expand);
 145
 146   if (VT.isInteger()) {
 147     setOperationAction(ISD::SABSDIFF, VT, Legal);
 148     setOperationAction(ISD::UABSDIFF, VT, Legal);
 149   }
 150 }
 151
 152 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
 153   addRegisterClass(VT, &ARM::DPRRegClass);
 154   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
 155 }
 156
 157 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
 158   addRegisterClass(VT, &ARM::DPairRegClass);
 159   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 160 }
 161
 162 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
 163                                      const ARMSubtarget &STI)
 164     : TargetLowering(TM), Subtarget(&STI) {
 165   RegInfo = Subtarget->getRegisterInfo();
 166   Itins = Subtarget->getInstrItineraryData();
 167
 168   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 169
 170   if (Subtarget->isTargetMachO()) {
 171     // Uses VFP for Thumb libfuncs if available.
 172     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
 173         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
 174       static const struct {
 175         const RTLIB::Libcall Op;
 176         const char * const Name;
 177         const ISD::CondCode Cond;
 178       } LibraryCalls[] = {
 179         // Single-precision floating-point arithmetic.
 180         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
 181         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
 182         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
 183         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
 184
 185         // Double-precision floating-point arithmetic.
 186         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
 187         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
 188         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
 189         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
 190
 191         // Single-precision comparisons.
 192         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
 193         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
 194         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
 195         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
 196         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
 197         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
 198         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
 199         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
 200
 201         // Double-precision comparisons.
 202         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
 203         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
 204         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
 205         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
 206         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
 207         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
 208         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
 209         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
 210
 211         // Floating-point to integer conversions.
 212         // i64 conversions are done via library routines even when generating VFP
 213         // instructions, so use the same ones.
 214         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
 215         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
 216         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
 217         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
 218
 219         // Conversions between floating types.
 220         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
 221         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
 222
 223         // Integer to floating-point conversions.
 224         // i64 conversions are done via library routines even when generating VFP
 225         // instructions, so use the same ones.
 226         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
 227         // e.g., __floatunsidf vs. __floatunssidfvfp.
 228         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
 229         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
 230         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
 231         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
 232       };
 233
 234       for (const auto &LC : LibraryCalls) {
 235         setLibcallName(LC.Op, LC.Name);
 236         if (LC.Cond != ISD::SETCC_INVALID)
 237           setCmpLibcallCC(LC.Op, LC.Cond);
 238       }
 239     }
 240   }
 241
 242   // These libcalls are not available in 32-bit.
 243   setLibcallName(RTLIB::SHL_I128, nullptr);
 244   setLibcallName(RTLIB::SRL_I128, nullptr);
 245   setLibcallName(RTLIB::SRA_I128, nullptr);
 246
 247   if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
 248       !Subtarget->isTargetWindows()) {
 249     static const struct {
 250       const RTLIB::Libcall Op;
 251       const char * const Name;
 252       const CallingConv::ID CC;
 253       const ISD::CondCode Cond;
 254     } LibraryCalls[] = {
 255       // Double-precision floating-point arithmetic helper functions
 256       // RTABI chapter 4.1.2, Table 2
 257       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 258       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 259       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 260       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 261
 262       // Double-precision floating-point comparison helper functions
 263       // RTABI chapter 4.1.2, Table 3
 264       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
 265       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
 266       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
 267       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
 268       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
 269       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
 270       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
 271       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 272
 273       // Single-precision floating-point arithmetic helper functions
 274       // RTABI chapter 4.1.2, Table 4
 275       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 276       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 277       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 278       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 279
 280       // Single-precision floating-point comparison helper functions
 281       // RTABI chapter 4.1.2, Table 5
 282       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
 283       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
 284       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
 285       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
 286       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
 287       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
 288       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
 289       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 290
 291       // Floating-point to integer conversions.
 292       // RTABI chapter 4.1.2, Table 6
 293       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 294       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 295       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 296       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 297       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 298       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 299       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 300       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 301
 302       // Conversions between floating types.
 303       // RTABI chapter 4.1.2, Table 7
 304       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 305       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 306       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 307
 308       // Integer to floating-point conversions.
 309       // RTABI chapter 4.1.2, Table 8
 310       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 311       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 312       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 313       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 314       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 315       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 316       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 317       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 318
 319       // Long long helper functions
 320       // RTABI chapter 4.2, Table 9
 321       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 322       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 323       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 324       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 325
 326       // Integer division functions
 327       // RTABI chapter 4.3.1
 328       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 329       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 330       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 331       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 332       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 333       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 334       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 335       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 336
 337       // Memory operations
 338       // RTABI chapter 4.3.4
 339       { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 340       { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 341       { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 342     };
 343
 344     for (const auto &LC : LibraryCalls) {
 345       setLibcallName(LC.Op, LC.Name);
 346       setLibcallCallingConv(LC.Op, LC.CC);
 347       if (LC.Cond != ISD::SETCC_INVALID)
 348         setCmpLibcallCC(LC.Op, LC.Cond);
 349     }
 350   }
 351
 352   if (Subtarget->isTargetWindows()) {
 353     static const struct {
 354       const RTLIB::Libcall Op;
 355       const char * const Name;
 356       const CallingConv::ID CC;
 357     } LibraryCalls[] = {
 358       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
 359       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
 360       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
 361       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
 362       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
 363       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
 364       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
 365       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
 366
 367       { RTLIB::SDIV_I32, "__rt_sdiv",   CallingConv::ARM_AAPCS_VFP },
 368       { RTLIB::UDIV_I32, "__rt_udiv",   CallingConv::ARM_AAPCS_VFP },
 369       { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP },
 370       { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP },
 371     };
 372
 373     for (const auto &LC : LibraryCalls) {
 374       setLibcallName(LC.Op, LC.Name);
 375       setLibcallCallingConv(LC.Op, LC.CC);
 376     }
 377   }
 378
 379   // Use divmod compiler-rt calls for iOS 5.0 and later.
 380   if (Subtarget->getTargetTriple().isiOS() &&
 381       !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
 382     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
 383     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
 384   }
 385
 386   // The half <-> float conversion functions are always soft-float, but are
 387   // needed for some targets which use a hard-float calling convention by
 388   // default.
 389   if (Subtarget->isAAPCS_ABI()) {
 390     setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
 391     setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
 392     setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
 393   } else {
 394     setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
 395     setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
 396     setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
 397   }
 398
 399   if (Subtarget->isThumb1Only())
 400     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
 401   else
 402     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
 403   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
 404       !Subtarget->isThumb1Only()) {
 405     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
 406     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
 407   }
 408
 409   for (MVT VT : MVT::vector_valuetypes()) {
 410     for (MVT InnerVT : MVT::vector_valuetypes()) {
 411       setTruncStoreAction(VT, InnerVT, Expand);
 412       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
 413       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
 414       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
 415     }
 416
 417     setOperationAction(ISD::MULHS, VT, Expand);
 418     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 419     setOperationAction(ISD::MULHU, VT, Expand);
 420     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 421
 422     setOperationAction(ISD::BSWAP, VT, Expand);
 423   }
 424
 425   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
 426   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 427
 428   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
 429   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 430
 431   if (Subtarget->hasNEON()) {
 432     addDRTypeForNEON(MVT::v2f32);
 433     addDRTypeForNEON(MVT::v8i8);
 434     addDRTypeForNEON(MVT::v4i16);
 435     addDRTypeForNEON(MVT::v2i32);
 436     addDRTypeForNEON(MVT::v1i64);
 437
 438     addQRTypeForNEON(MVT::v4f32);
 439     addQRTypeForNEON(MVT::v2f64);
 440     addQRTypeForNEON(MVT::v16i8);
 441     addQRTypeForNEON(MVT::v8i16);
 442     addQRTypeForNEON(MVT::v4i32);
 443     addQRTypeForNEON(MVT::v2i64);
 444
 445     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
 446     // neither Neon nor VFP support any arithmetic operations on it.
 447     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
 448     // supported for v4f32.
 449     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
 450     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
 451     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
 452     // FIXME: Code duplication: FDIV and FREM are expanded always, see
 453     // ARMTargetLowering::addTypeForNEON method for details.
 454     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
 455     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
 456     // FIXME: Create unittest.
  457     // In other words, find a case where "copysign" appears in the DAG with
  458     // vector operands.
 459     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
 460     // FIXME: Code duplication: SETCC has custom operation action, see
 461     // ARMTargetLowering::addTypeForNEON method for details.
 462     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
 463     // FIXME: Create unittest for FNEG and for FABS.
 464     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
 465     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
 466     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
 467     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
 468     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
 469     setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
 470     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
 471     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
 472     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
 473     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
 474     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
 475     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
 476     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
 477     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
 478     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
 479     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
 480     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
 481     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
 482     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
 483
 484     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
 485     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
 486     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
 487     setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
 488     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
 489     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
 490     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
 491     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
 492     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
 493     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
 494     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
 495     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
 496     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
 497     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
 498     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 499
 500     // Mark v2f32 intrinsics.
 501     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
 502     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
 503     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
 504     setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
 505     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
 506     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
 507     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
 508     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
 509     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
 510     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
 511     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
 512     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
 513     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
 514     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
 515     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
 516
 517     // Neon does not support some operations on v1i64 and v2i64 types.
 518     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 519     // Custom handling for some quad-vector types to detect VMULL.
 520     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
 521     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 522     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 523     // Custom handling for some vector types to avoid expensive expansions
 524     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
 525     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
 526     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 527     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
 528     setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
 529     setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
 530     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
 531     // a destination type that is wider than the source, and nor does
 532     // it have a FP_TO_[SU]INT instruction with a narrower destination than
 533     // source.
 534     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
 535     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
 536     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
 537     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
 538
 539     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
 540     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 541
 542     // NEON does not have single instruction CTPOP for vectors with element
 543     // types wider than 8-bits.  However, custom lowering can leverage the
 544     // v8i8/v16i8 vcnt instruction.
 545     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
 546     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
 547     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
 548     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
 549
 550     // NEON does not have single instruction CTTZ for vectors.
 551     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
 552     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
 553     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
 554     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
 555
 556     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
 557     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
 558     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
 559     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
 560
 561     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
 562     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
 563     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
 564     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
 565
 566     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
 567     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
 568     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
 569     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
 570
 571     // NEON only has FMA instructions as of VFP4.
 572     if (!Subtarget->hasVFP4()) {
 573       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
 574       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
 575     }
 576
 577     setTargetDAGCombine(ISD::INTRINSIC_VOID);
 578     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 579     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 580     setTargetDAGCombine(ISD::SHL);
 581     setTargetDAGCombine(ISD::SRL);
 582     setTargetDAGCombine(ISD::SRA);
 583     setTargetDAGCombine(ISD::SIGN_EXTEND);
 584     setTargetDAGCombine(ISD::ZERO_EXTEND);
 585     setTargetDAGCombine(ISD::ANY_EXTEND);
 586     setTargetDAGCombine(ISD::SELECT_CC);
 587     setTargetDAGCombine(ISD::BUILD_VECTOR);
 588     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 589     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 590     setTargetDAGCombine(ISD::STORE);
 591     setTargetDAGCombine(ISD::FP_TO_SINT);
 592     setTargetDAGCombine(ISD::FP_TO_UINT);
 593     setTargetDAGCombine(ISD::FDIV);
 594     setTargetDAGCombine(ISD::LOAD);
 595
 596     // It is legal to extload from v4i8 to v4i16 or v4i32.
 597     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
 598                    MVT::v2i32}) {
 599       for (MVT VT : MVT::integer_vector_valuetypes()) {
 600         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
 601         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
 602         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
 603       }
 604     }
 605   }
 606
 607   // ARM and Thumb2 support UMLAL/SMLAL.
 608   if (!Subtarget->isThumb1Only())
 609     setTargetDAGCombine(ISD::ADDC);
 610
 611   if (Subtarget->isFPOnlySP()) {
 612     // When targeting a floating-point unit with only single-precision
 613     // operations, f64 is legal for the few double-precision instructions which
  614     // are present. However, no double-precision operations other than moves,
 615     // loads and stores are provided by the hardware.
 616     setOperationAction(ISD::FADD,       MVT::f64, Expand);
 617     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
 618     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
 619     setOperationAction(ISD::FMA,        MVT::f64, Expand);
 620     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
 621     setOperationAction(ISD::FREM,       MVT::f64, Expand);
 622     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
 623     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
 624     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
 625     setOperationAction(ISD::FABS,       MVT::f64, Expand);
 626     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
 627     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
 628     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
 629     setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
 630     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
 631     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
 632     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
 633     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
 634     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
 635     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
 636     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
 637     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
 638     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
 639     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
 640     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
 641     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 642     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 643     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 644     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 645     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
 646     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
 647     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
 648     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
 649   }
 650
 651   computeRegisterProperties(Subtarget->getRegisterInfo());
 652
 653   // ARM does not have floating-point extending loads.
 654   for (MVT VT : MVT::fp_valuetypes()) {
 655     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 656     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
 657   }
 658
 659   // ... or truncating stores
 660   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 661   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 662   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 663
 664   // ARM does not have i1 sign extending load.
 665   for (MVT VT : MVT::integer_valuetypes())
 666     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 667
 668   // ARM supports all 4 flavors of integer indexed load / store.
 669   if (!Subtarget->isThumb1Only()) {
 670     for (unsigned im = (unsigned)ISD::PRE_INC;
 671          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 672       setIndexedLoadAction(im,  MVT::i1,  Legal);
 673       setIndexedLoadAction(im,  MVT::i8,  Legal);
 674       setIndexedLoadAction(im,  MVT::i16, Legal);
 675       setIndexedLoadAction(im,  MVT::i32, Legal);
 676       setIndexedStoreAction(im, MVT::i1,  Legal);
 677       setIndexedStoreAction(im, MVT::i8,  Legal);
 678       setIndexedStoreAction(im, MVT::i16, Legal);
 679       setIndexedStoreAction(im, MVT::i32, Legal);
 680     }
 681   }
 682
 683   setOperationAction(ISD::SADDO, MVT::i32, Custom);
 684   setOperationAction(ISD::UADDO, MVT::i32, Custom);
 685   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
 686   setOperationAction(ISD::USUBO, MVT::i32, Custom);
 687
 688   // i64 operation support.
 689   setOperationAction(ISD::MUL,     MVT::i64, Expand);
 690   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
 691   if (Subtarget->isThumb1Only()) {
 692     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 693     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 694   }
 695   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
 696       || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
 697     setOperationAction(ISD::MULHS, MVT::i32, Expand);
 698
 699   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 700   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 701   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 702   setOperationAction(ISD::SRL,       MVT::i64, Custom);
 703   setOperationAction(ISD::SRA,       MVT::i64, Custom);
 704
 705   if (!Subtarget->isThumb1Only()) {
 706     // FIXME: We should do this for Thumb1 as well.
 707     setOperationAction(ISD::ADDC,    MVT::i32, Custom);
 708     setOperationAction(ISD::ADDE,    MVT::i32, Custom);
 709     setOperationAction(ISD::SUBC,    MVT::i32, Custom);
 710     setOperationAction(ISD::SUBE,    MVT::i32, Custom);
 711   }
 712
 713   // ARM does not have ROTL.
 714   setOperationAction(ISD::ROTL,  MVT::i32, Expand);
 715   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
 716   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 717   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
 718     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
 719
 720   // These just redirect to CTTZ and CTLZ on ARM.
 721   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
 722   setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
 723
 724   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
 725
 726   // Only ARMv6 has BSWAP.
 727   if (!Subtarget->hasV6Ops())
 728     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
 729
 730   if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
 731       !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
 732     // These are expanded into libcalls if the cpu doesn't have HW divider.
 733     setOperationAction(ISD::SDIV,  MVT::i32, Expand);
 734     setOperationAction(ISD::UDIV,  MVT::i32, Expand);
 735   }
 736
 737   // FIXME: Also set divmod for SREM on EABI/androideabi
 738   setOperationAction(ISD::SREM,  MVT::i32, Expand);
 739   setOperationAction(ISD::UREM,  MVT::i32, Expand);
 740   // Register based DivRem for AEABI (RTABI 4.2)
 741   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) {
 742     setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
 743     setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
 744     setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
 745     setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
 746     setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
 747     setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
 748     setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
 749     setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
 750
 751     setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
 752     setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
 753     setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
 754     setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
 755     setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
 756     setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
 757     setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
 758     setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
 759
 760     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
 761     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
 762   } else {
 763     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
 764     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 765   }
 766
 767   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
 768   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
 769   setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
 770   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
 771   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
 772
 773   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 774
 775   // Use the default implementation.
 776   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
 777   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
 778   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
 779   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
 780   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 781   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 782
 783   if (!Subtarget->isTargetMachO()) {
 784     // Non-MachO platforms may return values in these registers via the
 785     // personality function.
 786     setExceptionPointerRegister(ARM::R0);
 787     setExceptionSelectorRegister(ARM::R1);
 788   }
 789
 790   if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
 791     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 792   else
 793     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 794
 795   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
 796   // the default expansion. If we are targeting a single threaded system,
 797   // then set them all for expand so we can lower them later into their
 798   // non-atomic form.
 799   if (TM.Options.ThreadModel == ThreadModel::Single)
 800     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
 801   else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
 802     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
 803     // to ldrex/strex loops already.
 804     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
 805
 806     // On v8, we have particularly efficient implementations of atomic fences
 807     // if they can be combined with nearby atomic loads and stores.
 808     if (!Subtarget->hasV8Ops()) {
 809       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
 810       setInsertFencesForAtomic(true);
 811     }
 812   } else {
 813     // If there's anything we can use as a barrier, go through custom lowering
 814     // for ATOMIC_FENCE.
 815     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
 816                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
 817
 818     // Set them all for expansion, which will force libcalls.
 819     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
 820     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
 821     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
 822     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
 823     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
 824     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
 825     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
 826     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
 827     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
 828     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
 829     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
 830     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
 831     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
 832     // Unordered/Monotonic case.
 833     setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
 834     setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
 835   }
 836
 837   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
 838
 839   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
 840   if (!Subtarget->hasV6Ops()) {
 841     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 842     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
 843   }
 844   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 845
 846   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
 847       !Subtarget->isThumb1Only()) {
 848     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
 849     // iff target supports vfp2.
 850     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
 851     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
 852   }
 853
 854   // We want to custom lower some of our intrinsics.
 855   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 856   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 857   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 858   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 859   if (Subtarget->isTargetDarwin())
 860     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 861
 862   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
 863   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
 864   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
 865   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
 866   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
 867   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
 868   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 869   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 870   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
 871
 872   setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
 873   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
 874   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
 875   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
 876   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
 877
 878   // We don't support sin/cos/fmod/copysign/pow
 879   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
 880   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
 881   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
 882   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
 883   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
 884   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
 885   setOperationAction(ISD::FREM,      MVT::f64, Expand);
 886   setOperationAction(ISD::FREM,      MVT::f32, Expand);
 887   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
 888       !Subtarget->isThumb1Only()) {
 889     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 890     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 891   }
 892   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
 893   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
 894
 895   if (!Subtarget->hasVFP4()) {
 896     setOperationAction(ISD::FMA, MVT::f64, Expand);
 897     setOperationAction(ISD::FMA, MVT::f32, Expand);
 898   }
 899
 900   // Various VFP goodness
 901   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
 902     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
 903     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
 904       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 905       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 906     }
 907
 908     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
 909     if (!Subtarget->hasFP16()) {
 910       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 911       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 912     }
 913   }
 914
 915   // Combine sin / cos into one node or libcall if possible.
 916   if (Subtarget->hasSinCos()) {
 917     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
 918     setLibcallName(RTLIB::SINCOS_F64, "sincos");
 919     if (Subtarget->getTargetTriple().isiOS()) {
 920       // For iOS, we don't want to the normal expansion of a libcall to
 921       // sincos. We want to issue a libcall to __sincos_stret.
 922       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
 923       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
 924     }
 925   }
 926
 927   // FP-ARMv8 implements a lot of rounding-like FP operations.
 928   if (Subtarget->hasFPARMv8()) {
 929     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 930     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
 931     setOperationAction(ISD::FROUND, MVT::f32, Legal);
 932     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 933     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
 934     setOperationAction(ISD::FRINT, MVT::f32, Legal);
 935     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
 936     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
 937     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
 938     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
 939     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
 940     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
 941
 942     if (!Subtarget->isFPOnlySP()) {
 943       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
 944       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
 945       setOperationAction(ISD::FROUND, MVT::f64, Legal);
 946       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
 947       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
 948       setOperationAction(ISD::FRINT, MVT::f64, Legal);
 949       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
 950       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
 951     }
 952   }
 953
 954   if (Subtarget->hasVFP3()) {
 955     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
 956     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
 957     setOperationAction(ISD::FMINNAN, MVT::f64, Legal);
 958     setOperationAction(ISD::FMAXNAN, MVT::f64, Legal);
 959   }
 960   if (Subtarget->hasNEON()) {
 961     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
 962     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
 963     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
 964     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
 965   }
 966
 967   // We have target-specific dag combine patterns for the following nodes:
 968   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
 969   setTargetDAGCombine(ISD::ADD);
 970   setTargetDAGCombine(ISD::SUB);
 971   setTargetDAGCombine(ISD::MUL);
 972   setTargetDAGCombine(ISD::AND);
 973   setTargetDAGCombine(ISD::OR);
 974   setTargetDAGCombine(ISD::XOR);
 975
 976   if (Subtarget->hasV6Ops())
 977     setTargetDAGCombine(ISD::SRL);
 978
 979   setStackPointerRegisterToSaveRestore(ARM::SP);
 980
 981   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
 982       !Subtarget->hasVFP2())
 983     setSchedulingPreference(Sched::RegPressure);
 984   else
 985     setSchedulingPreference(Sched::Hybrid);
 986
 987   //// temporary - rewrite interface to use type
 988   MaxStoresPerMemset = 8;
 989   MaxStoresPerMemsetOptSize = 4;
 990   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
 991   MaxStoresPerMemcpyOptSize = 2;
 992   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
 993   MaxStoresPerMemmoveOptSize = 2;
 994
 995   // On ARM arguments smaller than 4 bytes are extended, so all arguments
 996   // are at least 4 bytes aligned.
 997   setMinStackArgumentAlignment(4);
 998
 999   // Prefer likely predicted branches to selects on out-of-order cores.
1000   PredictableSelectIsExpensive = Subtarget->isLikeA9();
1001
1002   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1003 }
1004
1005 bool ARMTargetLowering::useSoftFloat() const {
1006   return Subtarget->useSoftFloat();
1007 }
1008
1009 // FIXME: It might make sense to define the representative register class as the
1010 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1011 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1012 // SPR's representative would be DPR_VFP2. This should work well if register
1013 // pressure tracking were modified such that a register use would increment the
1014 // pressure of the register class's representative and all of it's super
1015 // classes' representatives transitively. We have not implemented this because
1016 // of the difficulty prior to coalescing of modeling operand register classes
1017 // due to the common occurrence of cross class copies and subregister insertions
1018 // and extractions.
1019 std::pair<const TargetRegisterClass *, uint8_t>
1020 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1021                                            MVT VT) const {
1022   const TargetRegisterClass *RRC = nullptr;
1023   uint8_t Cost = 1;
1024   switch (VT.SimpleTy) {
1025   default:
1026     return TargetLowering::findRepresentativeClass(TRI, VT);
1027   // Use DPR as representative register class for all floating point
1028   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1029   // the cost is 1 for both f32 and f64.
1030   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1031   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1032     RRC = &ARM::DPRRegClass;
1033     // When NEON is used for SP, only half of the register file is available
1034     // because operations that define both SP and DP results will be constrained
1035     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1036     // coalescing by double-counting the SP regs. See the FIXME above.
1037     if (Subtarget->useNEONForSinglePrecisionFP())
1038       Cost = 2;
1039     break;
1040   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1041   case MVT::v4f32: case MVT::v2f64:
1042     RRC = &ARM::DPRRegClass;
1043     Cost = 2;
1044     break;
1045   case MVT::v4i64:
1046     RRC = &ARM::DPRRegClass;
1047     Cost = 4;
1048     break;
1049   case MVT::v8i64:
1050     RRC = &ARM::DPRRegClass;
1051     Cost = 8;
1052     break;
1053   }
1054   return std::make_pair(RRC, Cost);
1055 }
1056
1057 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1058   switch ((ARMISD::NodeType)Opcode) {
1059   case ARMISD::FIRST_NUMBER:  break;
1060   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1061   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1062   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1063   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1064   case ARMISD::CALL:          return "ARMISD::CALL";
1065   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1066   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1067   case ARMISD::tCALL:         return "ARMISD::tCALL";
1068   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1069   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1070   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1071   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1072   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1073   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1074   case ARMISD::CMP:           return "ARMISD::CMP";
1075   case ARMISD::CMN:           return "ARMISD::CMN";
1076   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1077   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1078   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1079   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1080   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1081
1082   case ARMISD::CMOV:          return "ARMISD::CMOV";
1083
1084   case ARMISD::RBIT:          return "ARMISD::RBIT";
1085
1086   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1087   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1088   case ARMISD::RRX:           return "ARMISD::RRX";
1089
1090   case ARMISD::ADDC:          return "ARMISD::ADDC";
1091   case ARMISD::ADDE:          return "ARMISD::ADDE";
1092   case ARMISD::SUBC:          return "ARMISD::SUBC";
1093   case ARMISD::SUBE:          return "ARMISD::SUBE";
1094
1095   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1096   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1097
1098   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1099   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1100   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1101
1102   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1103
1104   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1105
1106   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1107
1108   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1109
1110   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1111
1112   case ARMISD::WIN__CHKSTK:   return "ARMISD:::WIN__CHKSTK";
1113
1114   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1115   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1116   case ARMISD::VCGE:          return "ARMISD::VCGE";
1117   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1118   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1119   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1120   case ARMISD::VCGT:          return "ARMISD::VCGT";
1121   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1122   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1123   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1124   case ARMISD::VTST:          return "ARMISD::VTST";
1125
1126   case ARMISD::VSHL:          return "ARMISD::VSHL";
1127   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1128   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1129   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1130   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1131   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1132   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1133   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1134   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1135   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1136   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1137   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1138   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1139   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1140   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1141   case ARMISD::VSLI:          return "ARMISD::VSLI";
1142   case ARMISD::VSRI:          return "ARMISD::VSRI";
1143   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1144   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1145   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1146   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1147   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1148   case ARMISD::VDUP:          return "ARMISD::VDUP";
1149   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1150   case ARMISD::VEXT:          return "ARMISD::VEXT";
1151   case ARMISD::VREV64:        return "ARMISD::VREV64";
1152   case ARMISD::VREV32:        return "ARMISD::VREV32";
1153   case ARMISD::VREV16:        return "ARMISD::VREV16";
1154   case ARMISD::VZIP:          return "ARMISD::VZIP";
1155   case ARMISD::VUZP:          return "ARMISD::VUZP";
1156   case ARMISD::VTRN:          return "ARMISD::VTRN";
1157   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1158   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1159   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1160   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1161   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1162   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1163   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1164   case ARMISD::BFI:           return "ARMISD::BFI";
1165   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1166   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1167   case ARMISD::VBSL:          return "ARMISD::VBSL";
1168   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1169   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1170   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1171   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1172   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1173   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1174   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1175   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1176   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1177   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1178   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1179   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1180   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1181   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1182   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1183   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1184   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1185   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1186   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1187   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1188   }
1189   return nullptr;
1190 }
1191
1192 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1193                                           EVT VT) const {
1194   if (!VT.isVector())
1195     return getPointerTy(DL);
1196   return VT.changeVectorElementTypeToInteger();
1197 }
1198
1199 /// getRegClassFor - Return the register class that should be used for the
1200 /// specified value type.
1201 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1202   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1203   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1204   // load / store 4 to 8 consecutive D registers.
1205   if (Subtarget->hasNEON()) {
1206     if (VT == MVT::v4i64)
1207       return &ARM::QQPRRegClass;
1208     if (VT == MVT::v8i64)
1209       return &ARM::QQQQPRRegClass;
1210   }
1211   return TargetLowering::getRegClassFor(VT);
1212 }
1213
1214 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1215 // source/dest is aligned and the copy size is large enough. We therefore want
1216 // to align such objects passed to memory intrinsics.
1217 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1218                                                unsigned &PrefAlign) const {
1219   if (!isa<MemIntrinsic>(CI))
1220     return false;
1221   MinSize = 8;
1222   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1223   // cycle faster than 4-byte aligned LDM.
1224   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1225   return true;
1226 }
1227
1228 // Create a fast isel object.
1229 FastISel *
1230 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1231                                   const TargetLibraryInfo *libInfo) const {
1232   return ARM::createFastISel(funcInfo, libInfo);
1233 }
1234
1235 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1236   unsigned NumVals = N->getNumValues();
1237   if (!NumVals)
1238     return Sched::RegPressure;
1239
1240   for (unsigned i = 0; i != NumVals; ++i) {
1241     EVT VT = N->getValueType(i);
1242     if (VT == MVT::Glue || VT == MVT::Other)
1243       continue;
1244     if (VT.isFloatingPoint() || VT.isVector())
1245       return Sched::ILP;
1246   }
1247
1248   if (!N->isMachineOpcode())
1249     return Sched::RegPressure;
1250
1251   // Load are scheduled for latency even if there instruction itinerary
1252   // is not available.
1253   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1254   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1255
1256   if (MCID.getNumDefs() == 0)
1257     return Sched::RegPressure;
1258   if (!Itins->isEmpty() &&
1259       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1260     return Sched::ILP;
1261
1262   return Sched::RegPressure;
1263 }
1264
1265 //===----------------------------------------------------------------------===//
1266 // Lowering Code
1267 //===----------------------------------------------------------------------===//
1268
1269 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1270 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1271   switch (CC) {
1272   default: llvm_unreachable("Unknown condition code!");
1273   case ISD::SETNE:  return ARMCC::NE;
1274   case ISD::SETEQ:  return ARMCC::EQ;
1275   case ISD::SETGT:  return ARMCC::GT;
1276   case ISD::SETGE:  return ARMCC::GE;
1277   case ISD::SETLT:  return ARMCC::LT;
1278   case ISD::SETLE:  return ARMCC::LE;
1279   case ISD::SETUGT: return ARMCC::HI;
1280   case ISD::SETUGE: return ARMCC::HS;
1281   case ISD::SETULT: return ARMCC::LO;
1282   case ISD::SETULE: return ARMCC::LS;
1283   }
1284 }
1285
1286 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1287 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1288                         ARMCC::CondCodes &CondCode2) {
1289   CondCode2 = ARMCC::AL;
1290   switch (CC) {
1291   default: llvm_unreachable("Unknown FP condition!");
1292   case ISD::SETEQ:
1293   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1294   case ISD::SETGT:
1295   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1296   case ISD::SETGE:
1297   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1298   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1299   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1300   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1301   case ISD::SETO:   CondCode = ARMCC::VC; break;
1302   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1303   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1304   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1305   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1306   case ISD::SETLT:
1307   case ISD::SETULT: CondCode = ARMCC::LT; break;
1308   case ISD::SETLE:
1309   case ISD::SETULE: CondCode = ARMCC::LE; break;
1310   case ISD::SETNE:
1311   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1312   }
1313 }
1314
1315 //===----------------------------------------------------------------------===//
1316 //                      Calling Convention Implementation
1317 //===----------------------------------------------------------------------===//
1318
1319 #include "ARMGenCallingConv.inc"
1320
1321 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1322 /// account presence of floating point hardware and calling convention
1323 /// limitations, such as support for variadic functions.
1324 CallingConv::ID
1325 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1326                                            bool isVarArg) const {
1327   switch (CC) {
1328   default:
1329     llvm_unreachable("Unsupported calling convention");
1330   case CallingConv::ARM_AAPCS:
1331   case CallingConv::ARM_APCS:
1332   case CallingConv::GHC:
1333     return CC;
1334   case CallingConv::ARM_AAPCS_VFP:
1335     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1336   case CallingConv::C:
1337     if (!Subtarget->isAAPCS_ABI())
1338       return CallingConv::ARM_APCS;
1339     else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
1340              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1341              !isVarArg)
1342       return CallingConv::ARM_AAPCS_VFP;
1343     else
1344       return CallingConv::ARM_AAPCS;
1345   case CallingConv::Fast:
1346     if (!Subtarget->isAAPCS_ABI()) {
1347       if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1348         return CallingConv::Fast;
1349       return CallingConv::ARM_APCS;
1350     } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1351       return CallingConv::ARM_AAPCS_VFP;
1352     else
1353       return CallingConv::ARM_AAPCS;
1354   }
1355 }
1356
1357 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1358 /// CallingConvention.
1359 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1360                                                  bool Return,
1361                                                  bool isVarArg) const {
1362   switch (getEffectiveCallingConv(CC, isVarArg)) {
1363   default:
1364     llvm_unreachable("Unsupported calling convention");
1365   case CallingConv::ARM_APCS:
1366     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1367   case CallingConv::ARM_AAPCS:
1368     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1369   case CallingConv::ARM_AAPCS_VFP:
1370     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1371   case CallingConv::Fast:
1372     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1373   case CallingConv::GHC:
1374     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1375   }
1376 }
1377
1378 /// LowerCallResult - Lower the result values of a call into the
1379 /// appropriate copies out of appropriate physical registers.
1380 SDValue
1381 ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1382                                    CallingConv::ID CallConv, bool isVarArg,
1383                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1384                                    SDLoc dl, SelectionDAG &DAG,
1385                                    SmallVectorImpl<SDValue> &InVals,
1386                                    bool isThisReturn, SDValue ThisVal) const {
1387
1388   // Assign locations to each value returned by this call.
1389   SmallVector<CCValAssign, 16> RVLocs;
1390   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1391                     *DAG.getContext(), Call);
1392   CCInfo.AnalyzeCallResult(Ins,
1393                            CCAssignFnForNode(CallConv, /* Return*/ true,
1394                                              isVarArg));
1395
1396   // Copy all of the result registers out of their specified physreg.
1397   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1398     CCValAssign VA = RVLocs[i];
1399
1400     // Pass 'this' value directly from the argument to return value, to avoid
1401     // reg unit interference
1402     if (i == 0 && isThisReturn) {
1403       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1404              "unexpected return calling convention register assignment");
1405       InVals.push_back(ThisVal);
1406       continue;
1407     }
1408
1409     SDValue Val;
1410     if (VA.needsCustom()) {
1411       // Handle f64 or half of a v2f64.
1412       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1413                                       InFlag);
1414       Chain = Lo.getValue(1);
1415       InFlag = Lo.getValue(2);
1416       VA = RVLocs[++i]; // skip ahead to next loc
1417       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1418                                       InFlag);
1419       Chain = Hi.getValue(1);
1420       InFlag = Hi.getValue(2);
1421       if (!Subtarget->isLittle())
1422         std::swap (Lo, Hi);
1423       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1424
1425       if (VA.getLocVT() == MVT::v2f64) {
1426         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1427         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1428                           DAG.getConstant(0, dl, MVT::i32));
1429
1430         VA = RVLocs[++i]; // skip ahead to next loc
1431         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1432         Chain = Lo.getValue(1);
1433         InFlag = Lo.getValue(2);
1434         VA = RVLocs[++i]; // skip ahead to next loc
1435         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1436         Chain = Hi.getValue(1);
1437         InFlag = Hi.getValue(2);
1438         if (!Subtarget->isLittle())
1439           std::swap (Lo, Hi);
1440         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1441         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1442                           DAG.getConstant(1, dl, MVT::i32));
1443       }
1444     } else {
1445       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1446                                InFlag);
1447       Chain = Val.getValue(1);
1448       InFlag = Val.getValue(2);
1449     }
1450
1451     switch (VA.getLocInfo()) {
1452     default: llvm_unreachable("Unknown loc info!");
1453     case CCValAssign::Full: break;
1454     case CCValAssign::BCvt:
1455       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1456       break;
1457     }
1458
1459     InVals.push_back(Val);
1460   }
1461
1462   return Chain;
1463 }
1464
1465 /// LowerMemOpCallTo - Store the argument to the stack.
1466 SDValue
1467 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
1468                                     SDValue StackPtr, SDValue Arg,
1469                                     SDLoc dl, SelectionDAG &DAG,
1470                                     const CCValAssign &VA,
1471                                     ISD::ArgFlagsTy Flags) const {
1472   unsigned LocMemOffset = VA.getLocMemOffset();
1473   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1474   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1475                        StackPtr, PtrOff);
1476   return DAG.getStore(
1477       Chain, dl, Arg, PtrOff,
1478       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1479       false, false, 0);
1480 }
1481
1482 void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
1483                                          SDValue Chain, SDValue &Arg,
1484                                          RegsToPassVector &RegsToPass,
1485                                          CCValAssign &VA, CCValAssign &NextVA,
1486                                          SDValue &StackPtr,
1487                                          SmallVectorImpl<SDValue> &MemOpChains,
1488                                          ISD::ArgFlagsTy Flags) const {
1489
1490   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1491                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1492   unsigned id = Subtarget->isLittle() ? 0 : 1;
1493   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1494
1495   if (NextVA.isRegLoc())
1496     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1497   else {
1498     assert(NextVA.isMemLoc());
1499     if (!StackPtr.getNode())
1500       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1501                                     getPointerTy(DAG.getDataLayout()));
1502
1503     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1504                                            dl, DAG, NextVA,
1505                                            Flags));
1506   }
1507 }
1508
1509 /// LowerCall - Lowering a call into a callseq_start <-
1510 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
1511 /// nodes.
1512 SDValue
1513 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1514                              SmallVectorImpl<SDValue> &InVals) const {
1515   SelectionDAG &DAG                     = CLI.DAG;
1516   SDLoc &dl                             = CLI.DL;
1517   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1518   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1519   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1520   SDValue Chain                         = CLI.Chain;
1521   SDValue Callee                        = CLI.Callee;
1522   bool &isTailCall                      = CLI.IsTailCall;
1523   CallingConv::ID CallConv              = CLI.CallConv;
1524   bool doesNotRet                       = CLI.DoesNotReturn;
1525   bool isVarArg                         = CLI.IsVarArg;
1526
1527   MachineFunction &MF = DAG.getMachineFunction();
1528   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1529   bool isThisReturn   = false;
1530   bool isSibCall      = false;
1531   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
1532
1533   // Disable tail calls if they're not supported.
1534   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
1535     isTailCall = false;
1536
1537   if (isTailCall) {
1538     // Check if it's really possible to do a tail call.
1539     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1540                     isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
1541                                                    Outs, OutVals, Ins, DAG);
1542     if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
1543       report_fatal_error("failed to perform tail call elimination on a call "
1544                          "site marked musttail");
1545     // We don't support GuaranteedTailCallOpt for ARM, only automatically
1546     // detected sibcalls.
1547     if (isTailCall) {
1548       ++NumTailCalls;
1549       isSibCall = true;
1550     }
1551   }
1552
1553   // Analyze operands of the call, assigning locations to each operand.
1554   SmallVector<CCValAssign, 16> ArgLocs;
1555   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1556                     *DAG.getContext(), Call);
1557   CCInfo.AnalyzeCallOperands(Outs,
1558                              CCAssignFnForNode(CallConv, /* Return*/ false,
1559                                                isVarArg));
1560
1561   // Get a count of how many bytes are to be pushed on the stack.
1562   unsigned NumBytes = CCInfo.getNextStackOffset();
1563
1564   // For tail calls, memory operands are available in our caller's stack.
1565   if (isSibCall)
1566     NumBytes = 0;
1567
1568   // Adjust the stack pointer for the new arguments...
1569   // These operations are automatically eliminated by the prolog/epilog pass
1570   if (!isSibCall)
1571     Chain = DAG.getCALLSEQ_START(Chain,
1572                                  DAG.getIntPtrConstant(NumBytes, dl, true), dl);
1573
1574   SDValue StackPtr =
1575       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
1576
1577   RegsToPassVector RegsToPass;
1578   SmallVector<SDValue, 8> MemOpChains;
1579
1580   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1581   // of tail call optimization, arguments are handled later.
1582   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1583        i != e;
1584        ++i, ++realArgIdx) {
1585     CCValAssign &VA = ArgLocs[i];
1586     SDValue Arg = OutVals[realArgIdx];
1587     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1588     bool isByVal = Flags.isByVal();
1589
1590     // Promote the value if needed.
1591     switch (VA.getLocInfo()) {
1592     default: llvm_unreachable("Unknown loc info!");
1593     case CCValAssign::Full: break;
1594     case CCValAssign::SExt:
1595       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1596       break;
1597     case CCValAssign::ZExt:
1598       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1599       break;
1600     case CCValAssign::AExt:
1601       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1602       break;
1603     case CCValAssign::BCvt:
1604       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1605       break;
1606     }
1607
1608     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1609     if (VA.needsCustom()) {
1610       if (VA.getLocVT() == MVT::v2f64) {
1611         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1612                                   DAG.getConstant(0, dl, MVT::i32));
1613         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1614                                   DAG.getConstant(1, dl, MVT::i32));
1615
1616         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1617                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1618
1619         VA = ArgLocs[++i]; // skip ahead to next loc
1620         if (VA.isRegLoc()) {
1621           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1622                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1623         } else {
1624           assert(VA.isMemLoc());
1625
1626           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1627                                                  dl, DAG, VA, Flags));
1628         }
1629       } else {
1630         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1631                          StackPtr, MemOpChains, Flags);
1632       }
1633     } else if (VA.isRegLoc()) {
1634       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
1635         assert(VA.getLocVT() == MVT::i32 &&
1636                "unexpected calling convention register assignment");
1637         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1638                "unexpected use of 'returned'");
1639         isThisReturn = true;
1640       }
1641       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1642     } else if (isByVal) {
1643       assert(VA.isMemLoc());
1644       unsigned offset = 0;
1645
1646       // True if this byval aggregate will be split between registers
1647       // and memory.
1648       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1649       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
1650
1651       if (CurByValIdx < ByValArgsCount) {
1652
1653         unsigned RegBegin, RegEnd;
1654         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1655
1656         EVT PtrVT =
1657             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1658         unsigned int i, j;
1659         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
1660           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
1661           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1662           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1663                                      MachinePointerInfo(),
1664                                      false, false, false,
1665                                      DAG.InferPtrAlignment(AddArg));
1666           MemOpChains.push_back(Load.getValue(1));
1667           RegsToPass.push_back(std::make_pair(j, Load));
1668         }
1669
1670         // If parameter size outsides register area, "offset" value
1671         // helps us to calculate stack slot for remained part properly.
1672         offset = RegEnd - RegBegin;
1673
1674         CCInfo.nextInRegsParam();
1675       }
1676
1677       if (Flags.getByValSize() > 4*offset) {
1678         auto PtrVT = getPointerTy(DAG.getDataLayout());
1679         unsigned LocMemOffset = VA.getLocMemOffset();
1680         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1681         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
1682         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
1683         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
1684         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
1685                                            MVT::i32);
1686         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
1687                                             MVT::i32);
1688
1689         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1690         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1691         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1692                                           Ops));
1693       }
1694     } else if (!isSibCall) {
1695       assert(VA.isMemLoc());
1696
1697       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1698                                              dl, DAG, VA, Flags));
1699     }
1700   }
1701
1702   if (!MemOpChains.empty())
1703     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
1704
1705   // Build a sequence of copy-to-reg nodes chained together with token chain
1706   // and flag operands which copy the outgoing args into the appropriate regs.
1707   SDValue InFlag;
1708   // Tail call byval lowering might overwrite argument registers so in case of
1709   // tail call optimization the copies to registers are lowered later.
1710   if (!isTailCall)
1711     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1712       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1713                                RegsToPass[i].second, InFlag);
1714       InFlag = Chain.getValue(1);
1715     }
1716
1717   // For tail calls lower the arguments to the 'real' stack slot.
1718   if (isTailCall) {
1719     // Force all the incoming stack arguments to be loaded from the stack
1720     // before any new outgoing arguments are stored to the stack, because the
1721     // outgoing stack slots may alias the incoming argument stack slots, and
1722     // the alias isn't otherwise explicit. This is slightly more conservative
1723     // than necessary, because it means that each store effectively depends
1724     // on every argument instead of just those arguments it would clobber.
1725
1726     // Do not flag preceding copytoreg stuff together with the following stuff.
1727     InFlag = SDValue();
1728     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1729       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1730                                RegsToPass[i].second, InFlag);
1731       InFlag = Chain.getValue(1);
1732     }
1733     InFlag = SDValue();
1734   }
1735
1736   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1737   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1738   // node so that legalize doesn't hack it.
1739   bool isDirect = false;
1740   bool isARMFunc = false;
1741   bool isLocalARMFunc = false;
1742   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
1743   auto PtrVt = getPointerTy(DAG.getDataLayout());
1744
1745   if (Subtarget->genLongCalls()) {
1746     assert((Subtarget->isTargetWindows() ||
1747             getTargetMachine().getRelocationModel() == Reloc::Static) &&
1748            "long-calls with non-static relocation model!");
1749     // Handle a global address or an external symbol. If it's not one of
1750     // those, the target's already in a register, so we don't need to do
1751     // anything extra.
1752     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1753       const GlobalValue *GV = G->getGlobal();
1754       // Create a constant pool entry for the callee address
1755       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1756       ARMConstantPoolValue *CPV =
1757         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
1758
1759       // Get the address of the callee into a register
1760       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
1761       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1762       Callee = DAG.getLoad(
1763           PtrVt, dl, DAG.getEntryNode(), CPAddr,
1764           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
1765           false, false, 0);
1766     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
1767       const char *Sym = S->getSymbol();
1768
1769       // Create a constant pool entry for the callee address
1770       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1771       ARMConstantPoolValue *CPV =
1772         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1773                                       ARMPCLabelIndex, 0);
1774       // Get the address of the callee into a register
1775       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
1776       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1777       Callee = DAG.getLoad(
1778           PtrVt, dl, DAG.getEntryNode(), CPAddr,
1779           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
1780           false, false, 0);
1781     }
1782   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1783     const GlobalValue *GV = G->getGlobal();
1784     isDirect = true;
1785     bool isDef = GV->isStrongDefinitionForLinker();
1786     bool isStub = (!isDef && Subtarget->isTargetMachO()) &&
1787                    getTargetMachine().getRelocationModel() != Reloc::Static;
1788     isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
1789     // ARM call to a local ARM function is predicable.
1790     isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
1791     // tBX takes a register source operand.
1792     if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1793       assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
1794       Callee = DAG.getNode(
1795           ARMISD::WrapperPIC, dl, PtrVt,
1796           DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
1797       Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
1798                            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
1799                            false, false, true, 0);
1800     } else if (Subtarget->isTargetCOFF()) {
1801       assert(Subtarget->isTargetWindows() &&
1802              "Windows is the only supported COFF target");
1803       unsigned TargetFlags = GV->hasDLLImportStorageClass()
1804                                  ? ARMII::MO_DLLIMPORT
1805                                  : ARMII::MO_NO_FLAG;
1806       Callee =
1807           DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags);
1808       if (GV->hasDLLImportStorageClass())
1809         Callee =
1810             DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
1811                         DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
1812                         MachinePointerInfo::getGOT(DAG.getMachineFunction()),
1813                         false, false, false, 0);
1814     } else {
1815       // On ELF targets for PIC code, direct calls should go through the PLT
1816       unsigned OpFlags = 0;
1817       if (Subtarget->isTargetELF() &&
1818           getTargetMachine().getRelocationModel() == Reloc::PIC_)
1819         OpFlags = ARMII::MO_PLT;
1820       Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags);
1821     }
1822   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1823     isDirect = true;
1824     bool isStub = Subtarget->isTargetMachO() &&
1825                   getTargetMachine().getRelocationModel() != Reloc::Static;
1826     isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
1827     // tBX takes a register source operand.
1828     const char *Sym = S->getSymbol();
1829     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1830       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1831       ARMConstantPoolValue *CPV =
1832         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1833                                       ARMPCLabelIndex, 4);
1834       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
1835       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1836       Callee = DAG.getLoad(
1837           PtrVt, dl, DAG.getEntryNode(), CPAddr,
1838           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
1839           false, false, 0);
1840       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
1841       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
1842     } else {
1843       unsigned OpFlags = 0;
1844       // On ELF targets for PIC code, direct calls should go through the PLT
1845       if (Subtarget->isTargetELF() &&
1846                   getTargetMachine().getRelocationModel() == Reloc::PIC_)
1847         OpFlags = ARMII::MO_PLT;
1848       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags);
1849     }
1850   }
1851
1852   // FIXME: handle tail calls differently.
1853   unsigned CallOpc;
1854   if (Subtarget->isThumb()) {
1855     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
1856       CallOpc = ARMISD::CALL_NOLINK;
1857     else
1858       CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
1859   } else {
1860     if (!isDirect && !Subtarget->hasV5TOps())
1861       CallOpc = ARMISD::CALL_NOLINK;
1862     else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
1863              // Emit regular call when code size is the priority
1864              !MF.getFunction()->optForMinSize())
1865       // "mov lr, pc; b _foo" to avoid confusing the RSP
1866       CallOpc = ARMISD::CALL_NOLINK;
1867     else
1868       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
1869   }
1870
1871   std::vector<SDValue> Ops;
1872   Ops.push_back(Chain);
1873   Ops.push_back(Callee);
1874
1875   // Add argument registers to the end of the list so that they are known live
1876   // into the call.
1877   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1878     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1879                                   RegsToPass[i].second.getValueType()));
1880
1881   // Add a register mask operand representing the call-preserved registers.
1882   if (!isTailCall) {
1883     const uint32_t *Mask;
1884     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
1885     if (isThisReturn) {
1886       // For 'this' returns, use the R0-preserving mask if applicable
1887       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
1888       if (!Mask) {
1889         // Set isThisReturn to false if the calling convention is not one that
1890         // allows 'returned' to be modeled in this way, so LowerCallResult does
1891         // not try to pass 'this' straight through
1892         isThisReturn = false;
1893         Mask = ARI->getCallPreservedMask(MF, CallConv);
1894       }
1895     } else
1896       Mask = ARI->getCallPreservedMask(MF, CallConv);
1897
1898     assert(Mask && "Missing call preserved mask for calling convention");
1899     Ops.push_back(DAG.getRegisterMask(Mask));
1900   }
1901
1902   if (InFlag.getNode())
1903     Ops.push_back(InFlag);
1904
1905   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1906   if (isTailCall) {
1907     MF.getFrameInfo()->setHasTailCall();
1908     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
1909   }
1910
1911   // Returns a chain and a flag for retval copy to use.
1912   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
1913   InFlag = Chain.getValue(1);
1914
1915   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
1916                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
1917   if (!Ins.empty())
1918     InFlag = Chain.getValue(1);
1919
1920   // Handle result values, copying them out of physregs into vregs that we
1921   // return.
1922   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
1923                          InVals, isThisReturn,
1924                          isThisReturn ? OutVals[0] : SDValue());
1925 }
1926
1927 /// HandleByVal - Every parameter *after* a byval parameter is passed
1928 /// on the stack.  Remember the next parameter register to allocate,
1929 /// and then confiscate the rest of the parameter registers to insure
1930 /// this.
1931 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
1932                                     unsigned Align) const {
1933   assert((State->getCallOrPrologue() == Prologue ||
1934           State->getCallOrPrologue() == Call) &&
1935          "unhandled ParmContext");
1936
1937   // Byval (as with any stack) slots are always at least 4 byte aligned.
1938   Align = std::max(Align, 4U);
1939
1940   unsigned Reg = State->AllocateReg(GPRArgRegs);
1941   if (!Reg)
1942     return;
1943
1944   unsigned AlignInRegs = Align / 4;
1945   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
1946   for (unsigned i = 0; i < Waste; ++i)
1947     Reg = State->AllocateReg(GPRArgRegs);
1948
1949   if (!Reg)
1950     return;
1951
1952   unsigned Excess = 4 * (ARM::R4 - Reg);
1953
1954   // Special case when NSAA != SP and parameter size greater than size of
1955   // all remained GPR regs. In that case we can't split parameter, we must
1956   // send it to stack. We also must set NCRN to R4, so waste all
1957   // remained registers.
1958   const unsigned NSAAOffset = State->getNextStackOffset();
1959   if (NSAAOffset != 0 && Size > Excess) {
1960     while (State->AllocateReg(GPRArgRegs))
1961       ;
1962     return;
1963   }
1964
1965   // First register for byval parameter is the first register that wasn't
1966   // allocated before this method call, so it would be "reg".
1967   // If parameter is small enough to be saved in range [reg, r4), then
1968   // the end (first after last) register would be reg + param-size-in-regs,
1969   // else parameter would be splitted between registers and stack,
1970   // end register would be r4 in this case.
1971   unsigned ByValRegBegin = Reg;
1972   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
1973   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
1974   // Note, first register is allocated in the beginning of function already,
1975   // allocate remained amount of registers we need.
1976   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
1977     State->AllocateReg(GPRArgRegs);
1978   // A byval parameter that is split between registers and memory needs its
1979   // size truncated here.
1980   // In the case where the entire structure fits in registers, we set the
1981   // size in memory to zero.
1982   Size = std::max<int>(Size - Excess, 0);
1983 }
1984
1985 /// MatchingStackOffset - Return true if the given stack call argument is
1986 /// already available in the same position (relatively) of the caller's
1987 /// incoming argument stack.
1988 static
1989 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
1990                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
1991                          const TargetInstrInfo *TII) {
1992   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
1993   int FI = INT_MAX;
1994   if (Arg.getOpcode() == ISD::CopyFromReg) {
1995     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
1996     if (!TargetRegisterInfo::isVirtualRegister(VR))
1997       return false;
1998     MachineInstr *Def = MRI->getVRegDef(VR);
1999     if (!Def)
2000       return false;
2001     if (!Flags.isByVal()) {
2002       if (!TII->isLoadFromStackSlot(Def, FI))
2003         return false;
2004     } else {
2005       return false;
2006     }
2007   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2008     if (Flags.isByVal())
2009       // ByVal argument is passed in as a pointer but it's now being
2010       // dereferenced. e.g.
2011       // define @foo(%struct.X* %A) {
2012       //   tail call @bar(%struct.X* byval %A)
2013       // }
2014       return false;
2015     SDValue Ptr = Ld->getBasePtr();
2016     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2017     if (!FINode)
2018       return false;
2019     FI = FINode->getIndex();
2020   } else
2021     return false;
2022
2023   assert(FI != INT_MAX);
2024   if (!MFI->isFixedObjectIndex(FI))
2025     return false;
2026   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2027 }
2028
2029 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2030 /// for tail call optimization. Targets which want to do tail call
2031 /// optimization should implement this function.
2032 bool
2033 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2034                                                      CallingConv::ID CalleeCC,
2035                                                      bool isVarArg,
2036                                                      bool isCalleeStructRet,
2037                                                      bool isCallerStructRet,
2038                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2039                                     const SmallVectorImpl<SDValue> &OutVals,
2040                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2041                                                      SelectionDAG& DAG) const {
2042   const Function *CallerF = DAG.getMachineFunction().getFunction();
2043   CallingConv::ID CallerCC = CallerF->getCallingConv();
2044   bool CCMatch = CallerCC == CalleeCC;
2045
2046   // Look for obvious safe cases to perform tail call optimization that do not
2047   // require ABI changes. This is what gcc calls sibcall.
2048
2049   // Do not sibcall optimize vararg calls unless the call site is not passing
2050   // any arguments.
2051   if (isVarArg && !Outs.empty())
2052     return false;
2053
2054   // Exception-handling functions need a special set of instructions to indicate
2055   // a return to the hardware. Tail-calling another function would probably
2056   // break this.
2057   if (CallerF->hasFnAttribute("interrupt"))
2058     return false;
2059
2060   // Also avoid sibcall optimization if either caller or callee uses struct
2061   // return semantics.
2062   if (isCalleeStructRet || isCallerStructRet)
2063     return false;
2064
2065   // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
2066   // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
2067   // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
2068   // support in the assembler and linker to be used. This would need to be
2069   // fixed to fully support tail calls in Thumb1.
2070   //
2071   // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
2072   // LR.  This means if we need to reload LR, it takes an extra instructions,
2073   // which outweighs the value of the tail call; but here we don't know yet
2074   // whether LR is going to be used.  Probably the right approach is to
2075   // generate the tail call here and turn it back into CALL/RET in
2076   // emitEpilogue if LR is used.
2077
2078   // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
2079   // but we need to make sure there are enough registers; the only valid
2080   // registers are the 4 used for parameters.  We don't currently do this
2081   // case.
2082   if (Subtarget->isThumb1Only())
2083     return false;
2084
2085   // Externally-defined functions with weak linkage should not be
2086   // tail-called on ARM when the OS does not support dynamic
2087   // pre-emption of symbols, as the AAELF spec requires normal calls
2088   // to undefined weak functions to be replaced with a NOP or jump to the
2089   // next instruction. The behaviour of branch instructions in this
2090   // situation (as used for tail calls) is implementation-defined, so we
2091   // cannot rely on the linker replacing the tail call with a return.
2092   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2093     const GlobalValue *GV = G->getGlobal();
2094     const Triple &TT = getTargetMachine().getTargetTriple();
2095     if (GV->hasExternalWeakLinkage() &&
2096         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2097       return false;
2098   }
2099
2100   // If the calling conventions do not match, then we'd better make sure the
2101   // results are returned in the same way as what the caller expects.
2102   if (!CCMatch) {
2103     SmallVector<CCValAssign, 16> RVLocs1;
2104     ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
2105                        *DAG.getContext(), Call);
2106     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
2107
2108     SmallVector<CCValAssign, 16> RVLocs2;
2109     ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
2110                        *DAG.getContext(), Call);
2111     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
2112
2113     if (RVLocs1.size() != RVLocs2.size())
2114       return false;
2115     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2116       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2117         return false;
2118       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2119         return false;
2120       if (RVLocs1[i].isRegLoc()) {
2121         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2122           return false;
2123       } else {
2124         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2125           return false;
2126       }
2127     }
2128   }
2129
2130   // If Caller's vararg or byval argument has been split between registers and
2131   // stack, do not perform tail call, since part of the argument is in caller's
2132   // local frame.
2133   const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
2134                                       getInfo<ARMFunctionInfo>();
2135   if (AFI_Caller->getArgRegsSaveSize())
2136     return false;
2137
2138   // If the callee takes no arguments then go on to check the results of the
2139   // call.
2140   if (!Outs.empty()) {
2141     // Check if stack adjustment is needed. For now, do not do this if any
2142     // argument is passed on the stack.
2143     SmallVector<CCValAssign, 16> ArgLocs;
2144     ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
2145                       *DAG.getContext(), Call);
2146     CCInfo.AnalyzeCallOperands(Outs,
2147                                CCAssignFnForNode(CalleeCC, false, isVarArg));
2148     if (CCInfo.getNextStackOffset()) {
2149       MachineFunction &MF = DAG.getMachineFunction();
2150
2151       // Check if the arguments are already laid out in the right way as
2152       // the caller's fixed stack objects.
2153       MachineFrameInfo *MFI = MF.getFrameInfo();
2154       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2155       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2156       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2157            i != e;
2158            ++i, ++realArgIdx) {
2159         CCValAssign &VA = ArgLocs[i];
2160         EVT RegVT = VA.getLocVT();
2161         SDValue Arg = OutVals[realArgIdx];
2162         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2163         if (VA.getLocInfo() == CCValAssign::Indirect)
2164           return false;
2165         if (VA.needsCustom()) {
2166           // f64 and vector types are split into multiple registers or
2167           // register/stack-slot combinations.  The types will not match
2168           // the registers; give up on memory f64 refs until we figure
2169           // out what to do about this.
2170           if (!VA.isRegLoc())
2171             return false;
2172           if (!ArgLocs[++i].isRegLoc())
2173             return false;
2174           if (RegVT == MVT::v2f64) {
2175             if (!ArgLocs[++i].isRegLoc())
2176               return false;
2177             if (!ArgLocs[++i].isRegLoc())
2178               return false;
2179           }
2180         } else if (!VA.isRegLoc()) {
2181           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2182                                    MFI, MRI, TII))
2183             return false;
2184         }
2185       }
2186     }
2187   }
2188
2189   return true;
2190 }
2191
2192 bool
2193 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2194                                   MachineFunction &MF, bool isVarArg,
2195                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2196                                   LLVMContext &Context) const {
2197   SmallVector<CCValAssign, 16> RVLocs;
2198   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2199   return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
2200                                                     isVarArg));
2201 }
2202
2203 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2204                                     SDLoc DL, SelectionDAG &DAG) {
2205   const MachineFunction &MF = DAG.getMachineFunction();
2206   const Function *F = MF.getFunction();
2207
2208   StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
2209
2210   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2211   // version of the "preferred return address". These offsets affect the return
2212   // instruction if this is a return from PL1 without hypervisor extensions.
2213   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2214   //    SWI:     0      "subs pc, lr, #0"
2215   //    ABORT:   +4     "subs pc, lr, #4"
2216   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2217   // UNDEF varies depending on where the exception came from ARM or Thumb
2218   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2219
2220   int64_t LROffset;
2221   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2222       IntKind == "ABORT")
2223     LROffset = 4;
2224   else if (IntKind == "SWI" || IntKind == "UNDEF")
2225     LROffset = 0;
2226   else
2227     report_fatal_error("Unsupported interrupt attribute. If present, value "
2228                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2229
2230   RetOps.insert(RetOps.begin() + 1,
2231                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2232
2233   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2234 }
2235
2236 SDValue
2237 ARMTargetLowering::LowerReturn(SDValue Chain,
2238                                CallingConv::ID CallConv, bool isVarArg,
2239                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2240                                const SmallVectorImpl<SDValue> &OutVals,
2241                                SDLoc dl, SelectionDAG &DAG) const {
2242
2243   // CCValAssign - represent the assignment of the return value to a location.
2244   SmallVector<CCValAssign, 16> RVLocs;
2245
2246   // CCState - Info about the registers and stack slots.
2247   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2248                     *DAG.getContext(), Call);
2249
2250   // Analyze outgoing return values.
2251   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
2252                                                isVarArg));
2253
2254   SDValue Flag;
2255   SmallVector<SDValue, 4> RetOps;
2256   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2257   bool isLittleEndian = Subtarget->isLittle();
2258
2259   MachineFunction &MF = DAG.getMachineFunction();
2260   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2261   AFI->setReturnRegsCount(RVLocs.size());
2262
2263   // Copy the result values into the output registers.
2264   for (unsigned i = 0, realRVLocIdx = 0;
2265        i != RVLocs.size();
2266        ++i, ++realRVLocIdx) {
2267     CCValAssign &VA = RVLocs[i];
2268     assert(VA.isRegLoc() && "Can only return in registers!");
2269
2270     SDValue Arg = OutVals[realRVLocIdx];
2271
2272     switch (VA.getLocInfo()) {
2273     default: llvm_unreachable("Unknown loc info!");
2274     case CCValAssign::Full: break;
2275     case CCValAssign::BCvt:
2276       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2277       break;
2278     }
2279
2280     if (VA.needsCustom()) {
2281       if (VA.getLocVT() == MVT::v2f64) {
2282         // Extract the first half and return it in two registers.
2283         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2284                                    DAG.getConstant(0, dl, MVT::i32));
2285         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2286                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
2287
2288         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2289                                  HalfGPRs.getValue(isLittleEndian ? 0 : 1),
2290                                  Flag);
2291         Flag = Chain.getValue(1);
2292         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2293         VA = RVLocs[++i]; // skip ahead to next loc
2294         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2295                                  HalfGPRs.getValue(isLittleEndian ? 1 : 0),
2296                                  Flag);
2297         Flag = Chain.getValue(1);
2298         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2299         VA = RVLocs[++i]; // skip ahead to next loc
2300
2301         // Extract the 2nd half and fall through to handle it as an f64 value.
2302         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2303                           DAG.getConstant(1, dl, MVT::i32));
2304       }
2305       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2306       // available.
2307       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2308                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
2309       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2310                                fmrrd.getValue(isLittleEndian ? 0 : 1),
2311                                Flag);
2312       Flag = Chain.getValue(1);
2313       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2314       VA = RVLocs[++i]; // skip ahead to next loc
2315       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2316                                fmrrd.getValue(isLittleEndian ? 1 : 0),
2317                                Flag);
2318     } else
2319       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2320
2321     // Guarantee that all emitted copies are
2322     // stuck together, avoiding something bad.
2323     Flag = Chain.getValue(1);
2324     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2325   }
2326
2327   // Update chain and glue.
2328   RetOps[0] = Chain;
2329   if (Flag.getNode())
2330     RetOps.push_back(Flag);
2331
2332   // CPUs which aren't M-class use a special sequence to return from
2333   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
2334   // though we use "subs pc, lr, #N").
2335   //
2336   // M-class CPUs actually use a normal return sequence with a special
2337   // (hardware-provided) value in LR, so the normal code path works.
2338   if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
2339       !Subtarget->isMClass()) {
2340     if (Subtarget->isThumb1Only())
2341       report_fatal_error("interrupt attribute is not supported in Thumb1");
2342     return LowerInterruptReturn(RetOps, dl, DAG);
2343   }
2344
2345   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
2346 }
2347
2348 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2349   if (N->getNumValues() != 1)
2350     return false;
2351   if (!N->hasNUsesOfValue(1, 0))
2352     return false;
2353
2354   SDValue TCChain = Chain;
2355   SDNode *Copy = *N->use_begin();
2356   if (Copy->getOpcode() == ISD::CopyToReg) {
2357     // If the copy has a glue operand, we conservatively assume it isn't safe to
2358     // perform a tail call.
2359     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2360       return false;
2361     TCChain = Copy->getOperand(0);
2362   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2363     SDNode *VMov = Copy;
2364     // f64 returned in a pair of GPRs.
2365     SmallPtrSet<SDNode*, 2> Copies;
2366     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2367          UI != UE; ++UI) {
2368       if (UI->getOpcode() != ISD::CopyToReg)
2369         return false;
2370       Copies.insert(*UI);
2371     }
2372     if (Copies.size() > 2)
2373       return false;
2374
2375     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2376          UI != UE; ++UI) {
2377       SDValue UseChain = UI->getOperand(0);
2378       if (Copies.count(UseChain.getNode()))
2379         // Second CopyToReg
2380         Copy = *UI;
2381       else {
2382         // We are at the top of this chain.
2383         // If the copy has a glue operand, we conservatively assume it
2384         // isn't safe to perform a tail call.
2385         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
2386           return false;
2387         // First CopyToReg
2388         TCChain = UseChain;
2389       }
2390     }
2391   } else if (Copy->getOpcode() == ISD::BITCAST) {
2392     // f32 returned in a single GPR.
2393     if (!Copy->hasOneUse())
2394       return false;
2395     Copy = *Copy->use_begin();
2396     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2397       return false;
2398     // If the copy has a glue operand, we conservatively assume it isn't safe to
2399     // perform a tail call.
2400     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2401       return false;
2402     TCChain = Copy->getOperand(0);
2403   } else {
2404     return false;
2405   }
2406
2407   bool HasRet = false;
2408   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2409        UI != UE; ++UI) {
2410     if (UI->getOpcode() != ARMISD::RET_FLAG &&
2411         UI->getOpcode() != ARMISD::INTRET_FLAG)
2412       return false;
2413     HasRet = true;
2414   }
2415
2416   if (!HasRet)
2417     return false;
2418
2419   Chain = TCChain;
2420   return true;
2421 }
2422
2423 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2424   if (!Subtarget->supportsTailCall())
2425     return false;
2426
2427   auto Attr =
2428       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2429   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2430     return false;
2431
2432   return !Subtarget->isThumb1Only();
2433 }
2434
2435 // Trying to write a 64 bit value so need to split into two 32 bit values first,
2436 // and pass the lower and high parts through.
2437 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2438   SDLoc DL(Op);
2439   SDValue WriteValue = Op->getOperand(2);
2440
2441   // This function is only supposed to be called for i64 type argument.
2442   assert(WriteValue.getValueType() == MVT::i64
2443           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2444
2445   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2446                            DAG.getConstant(0, DL, MVT::i32));
2447   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2448                            DAG.getConstant(1, DL, MVT::i32));
2449   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2450   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2451 }
2452
2453 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2454 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2455 // one of the above mentioned nodes. It has to be wrapped because otherwise
2456 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2457 // be used to form addressing mode. These wrapped nodes will be selected
2458 // into MOVi.
2459 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2460   EVT PtrVT = Op.getValueType();
2461   // FIXME there is no actual debug info here
2462   SDLoc dl(Op);
2463   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2464   SDValue Res;
2465   if (CP->isMachineConstantPoolEntry())
2466     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2467                                     CP->getAlignment());
2468   else
2469     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2470                                     CP->getAlignment());
2471   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2472 }
2473
2474 unsigned ARMTargetLowering::getJumpTableEncoding() const {
2475   return MachineJumpTableInfo::EK_Inline;
2476 }
2477
2478 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2479                                              SelectionDAG &DAG) const {
2480   MachineFunction &MF = DAG.getMachineFunction();
2481   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2482   unsigned ARMPCLabelIndex = 0;
2483   SDLoc DL(Op);
2484   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2485   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2486   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2487   SDValue CPAddr;
2488   if (RelocM == Reloc::Static) {
2489     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2490   } else {
2491     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2492     ARMPCLabelIndex = AFI->createPICLabelUId();
2493     ARMConstantPoolValue *CPV =
2494       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2495                                       ARMCP::CPBlockAddress, PCAdj);
2496     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2497   }
2498   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2499   SDValue Result =
2500       DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
2501                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
2502                   false, false, false, 0);
2503   if (RelocM == Reloc::Static)
2504     return Result;
2505   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2506   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2507 }
2508
2509 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
2510 SDValue
2511 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2512                                                  SelectionDAG &DAG) const {
2513   SDLoc dl(GA);
2514   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2515   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2516   MachineFunction &MF = DAG.getMachineFunction();
2517   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2518   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2519   ARMConstantPoolValue *CPV =
2520     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2521                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2522   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2523   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2524   Argument =
2525       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
2526                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
2527                   false, false, false, 0);
2528   SDValue Chain = Argument.getValue(1);
2529
2530   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2531   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2532
2533   // call __tls_get_addr.
2534   ArgListTy Args;
2535   ArgListEntry Entry;
2536   Entry.Node = Argument;
2537   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2538   Args.push_back(Entry);
2539
2540   // FIXME: is there useful debug info available here?
2541   TargetLowering::CallLoweringInfo CLI(DAG);
2542   CLI.setDebugLoc(dl).setChain(Chain)
2543     .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
2544                DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
2545                0);
2546
2547   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2548   return CallResult.first;
2549 }
2550
2551 // Lower ISD::GlobalTLSAddress using the "initial exec" or
2552 // "local exec" model.
2553 SDValue
2554 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2555                                         SelectionDAG &DAG,
2556                                         TLSModel::Model model) const {
2557   const GlobalValue *GV = GA->getGlobal();
2558   SDLoc dl(GA);
2559   SDValue Offset;
2560   SDValue Chain = DAG.getEntryNode();
2561   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2562   // Get the Thread Pointer
2563   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2564
2565   if (model == TLSModel::InitialExec) {
2566     MachineFunction &MF = DAG.getMachineFunction();
2567     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2568     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2569     // Initial exec model.
2570     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2571     ARMConstantPoolValue *CPV =
2572       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2573                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2574                                       true);
2575     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2576     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2577     Offset = DAG.getLoad(
2578         PtrVT, dl, Chain, Offset,
2579         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2580         false, false, 0);
2581     Chain = Offset.getValue(1);
2582
2583     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2584     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2585
2586     Offset = DAG.getLoad(
2587         PtrVT, dl, Chain, Offset,
2588         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2589         false, false, 0);
2590   } else {
2591     // local exec model
2592     assert(model == TLSModel::LocalExec);
2593     ARMConstantPoolValue *CPV =
2594       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2595     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2596     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2597     Offset = DAG.getLoad(
2598         PtrVT, dl, Chain, Offset,
2599         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2600         false, false, 0);
2601   }
2602
2603   // The address of the thread local variable is the add of the thread
2604   // pointer with the offset of the variable.
2605   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2606 }
2607
2608 SDValue
2609 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2610   // TODO: implement the "local dynamic" model
2611   assert(Subtarget->isTargetELF() &&
2612          "TLS not implemented for non-ELF targets");
2613   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2614   if (DAG.getTarget().Options.EmulatedTLS)
2615     return LowerToTLSEmulatedModel(GA, DAG);
2616
2617   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2618
2619   switch (model) {
2620     case TLSModel::GeneralDynamic:
2621     case TLSModel::LocalDynamic:
2622       return LowerToTLSGeneralDynamicModel(GA, DAG);
2623     case TLSModel::InitialExec:
2624     case TLSModel::LocalExec:
2625       return LowerToTLSExecModels(GA, DAG, model);
2626   }
2627   llvm_unreachable("bogus TLS model");
2628 }
2629
2630 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
2631                                                  SelectionDAG &DAG) const {
2632   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2633   SDLoc dl(Op);
2634   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2635   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2636     bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
2637     ARMConstantPoolValue *CPV =
2638       ARMConstantPoolConstant::Create(GV,
2639                                       UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
2640     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2641     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2642     SDValue Result = DAG.getLoad(
2643         PtrVT, dl, DAG.getEntryNode(), CPAddr,
2644         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2645         false, false, 0);
2646     SDValue Chain = Result.getValue(1);
2647     SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
2648     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
2649     if (!UseGOTOFF)
2650       Result = DAG.getLoad(PtrVT, dl, Chain, Result,
2651                            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2652                            false, false, false, 0);
2653     return Result;
2654   }
2655
2656   // If we have T2 ops, we can materialize the address directly via movt/movw
2657   // pair. This is always cheaper.
2658   if (Subtarget->useMovt(DAG.getMachineFunction())) {
2659     ++NumMovwMovt;
2660     // FIXME: Once remat is capable of dealing with instructions with register
2661     // operands, expand this into two nodes.
2662     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2663                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2664   } else {
2665     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2666     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2667     return DAG.getLoad(
2668         PtrVT, dl, DAG.getEntryNode(), CPAddr,
2669         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2670         false, false, 0);
2671   }
2672 }
2673
2674 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
2675                                                     SelectionDAG &DAG) const {
2676   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2677   SDLoc dl(Op);
2678   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2679   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2680
2681   if (Subtarget->useMovt(DAG.getMachineFunction()))
2682     ++NumMovwMovt;
2683
2684   // FIXME: Once remat is capable of dealing with instructions with register
2685   // operands, expand this into multiple nodes
2686   unsigned Wrapper =
2687       RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper;
2688
2689   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
2690   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
2691
2692   if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2693     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
2694                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2695                          false, false, false, 0);
2696   return Result;
2697 }
2698
2699 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
2700                                                      SelectionDAG &DAG) const {
2701   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
2702   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
2703          "Windows on ARM expects to use movw/movt");
2704
2705   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2706   const ARMII::TOF TargetFlags =
2707     (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
2708   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2709   SDValue Result;
2710   SDLoc DL(Op);
2711
2712   ++NumMovwMovt;
2713
2714   // FIXME: Once remat is capable of dealing with instructions with register
2715   // operands, expand this into two nodes.
2716   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
2717                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
2718                                                   TargetFlags));
2719   if (GV->hasDLLImportStorageClass())
2720     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
2721                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2722                          false, false, false, 0);
2723   return Result;
2724 }
2725
2726 SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
2727                                                     SelectionDAG &DAG) const {
2728   assert(Subtarget->isTargetELF() &&
2729          "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
2730   MachineFunction &MF = DAG.getMachineFunction();
2731   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2732   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2733   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2734   SDLoc dl(Op);
2735   unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2736   ARMConstantPoolValue *CPV =
2737     ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
2738                                   ARMPCLabelIndex, PCAdj);
2739   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2740   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2741   SDValue Result =
2742       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2743                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
2744                   false, false, false, 0);
2745   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2746   return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2747 }
2748
2749 SDValue
2750 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
2751   SDLoc dl(Op);
2752   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
2753   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
2754                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
2755                      Op.getOperand(1), Val);
2756 }
2757
2758 SDValue
2759 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
2760   SDLoc dl(Op);
2761   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
2762                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
2763 }
2764
2765 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
2766                                                       SelectionDAG &DAG) const {
2767   SDLoc dl(Op);
2768   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
2769                      Op.getOperand(0));
2770 }
2771
2772 SDValue
2773 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
2774                                           const ARMSubtarget *Subtarget) const {
2775   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2776   SDLoc dl(Op);
2777   switch (IntNo) {
2778   default: return SDValue();    // Don't custom lower most intrinsics.
2779   case Intrinsic::arm_rbit: {
2780     assert(Op.getOperand(1).getValueType() == MVT::i32 &&
2781            "RBIT intrinsic must have i32 type!");
2782     return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
2783   }
2784   case Intrinsic::arm_thread_pointer: {
2785     EVT PtrVT = getPointerTy(DAG.getDataLayout());
2786     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2787   }
2788   case Intrinsic::eh_sjlj_lsda: {
2789     MachineFunction &MF = DAG.getMachineFunction();
2790     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2791     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2792     EVT PtrVT = getPointerTy(DAG.getDataLayout());
2793     Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2794     SDValue CPAddr;
2795     unsigned PCAdj = (RelocM != Reloc::PIC_)
2796       ? 0 : (Subtarget->isThumb() ? 4 : 8);
2797     ARMConstantPoolValue *CPV =
2798       ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
2799                                       ARMCP::CPLSDA, PCAdj);
2800     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2801     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2802     SDValue Result = DAG.getLoad(
2803         PtrVT, dl, DAG.getEntryNode(), CPAddr,
2804         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2805         false, false, 0);
2806
2807     if (RelocM == Reloc::PIC_) {
2808       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2809       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2810     }
2811     return Result;
2812   }
2813   case Intrinsic::arm_neon_vmulls:
2814   case Intrinsic::arm_neon_vmullu: {
2815     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
2816       ? ARMISD::VMULLs : ARMISD::VMULLu;
2817     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
2818                        Op.getOperand(1), Op.getOperand(2));
2819   }
2820   case Intrinsic::arm_neon_vminnm:
2821   case Intrinsic::arm_neon_vmaxnm: {
2822     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
2823       ? ISD::FMINNUM : ISD::FMAXNUM;
2824     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
2825                        Op.getOperand(1), Op.getOperand(2));
2826   }
2827   case Intrinsic::arm_neon_vmins:
2828   case Intrinsic::arm_neon_vmaxs: {
2829     // v{min,max}s is overloaded between signed integers and floats.
2830     if (!Op.getValueType().isFloatingPoint())
2831       return SDValue();
2832     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
2833       ? ISD::FMINNAN : ISD::FMAXNAN;
2834     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
2835                        Op.getOperand(1), Op.getOperand(2));
2836   }
2837   }
2838 }
2839
2840 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
2841                                  const ARMSubtarget *Subtarget) {
2842   // FIXME: handle "fence singlethread" more efficiently.
2843   SDLoc dl(Op);
2844   if (!Subtarget->hasDataBarrier()) {
2845     // Some ARMv6 cpus can support data barriers with an mcr instruction.
2846     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
2847     // here.
2848     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
2849            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
2850     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
2851                        DAG.getConstant(0, dl, MVT::i32));
2852   }
2853
2854   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
2855   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
2856   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
2857   if (Subtarget->isMClass()) {
2858     // Only a full system barrier exists in the M-class architectures.
2859     Domain = ARM_MB::SY;
2860   } else if (Subtarget->isSwift() && Ord == Release) {
2861     // Swift happens to implement ISHST barriers in a way that's compatible with
2862     // Release semantics but weaker than ISH so we'd be fools not to use
2863     // it. Beware: other processors probably don't!
2864     Domain = ARM_MB::ISHST;
2865   }
2866
2867   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
2868                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
2869                      DAG.getConstant(Domain, dl, MVT::i32));
2870 }
2871
2872 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
2873                              const ARMSubtarget *Subtarget) {
2874   // ARM pre v5TE and Thumb1 does not have preload instructions.
2875   if (!(Subtarget->isThumb2() ||
2876         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
2877     // Just preserve the chain.
2878     return Op.getOperand(0);
2879
2880   SDLoc dl(Op);
2881   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
2882   if (!isRead &&
2883       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
2884     // ARMv7 with MP extension has PLDW.
2885     return Op.getOperand(0);
2886
2887   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2888   if (Subtarget->isThumb()) {
2889     // Invert the bits.
2890     isRead = ~isRead & 1;
2891     isData = ~isData & 1;
2892   }
2893
2894   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
2895                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
2896                      DAG.getConstant(isData, dl, MVT::i32));
2897 }
2898
2899 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
2900   MachineFunction &MF = DAG.getMachineFunction();
2901   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
2902
2903   // vastart just stores the address of the VarArgsFrameIndex slot into the
2904   // memory location argument.
2905   SDLoc dl(Op);
2906   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2907   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
2908   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2909   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
2910                       MachinePointerInfo(SV), false, false, 0);
2911 }
2912
2913 SDValue
2914 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
2915                                         SDValue &Root, SelectionDAG &DAG,
2916                                         SDLoc dl) const {
2917   MachineFunction &MF = DAG.getMachineFunction();
2918   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2919
2920   const TargetRegisterClass *RC;
2921   if (AFI->isThumb1OnlyFunction())
2922     RC = &ARM::tGPRRegClass;
2923   else
2924     RC = &ARM::GPRRegClass;
2925
2926   // Transform the arguments stored in physical registers into virtual ones.
2927   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2928   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2929
2930   SDValue ArgValue2;
2931   if (NextVA.isMemLoc()) {
2932     MachineFrameInfo *MFI = MF.getFrameInfo();
2933     int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
2934
2935     // Create load node to retrieve arguments from the stack.
2936     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2937     ArgValue2 = DAG.getLoad(
2938         MVT::i32, dl, Root, FIN,
2939         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
2940         false, false, 0);
2941   } else {
2942     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2943     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2944   }
2945   if (!Subtarget->isLittle())
2946     std::swap (ArgValue, ArgValue2);
2947   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
2948 }
2949
2950 // The remaining GPRs hold either the beginning of variable-argument
2951 // data, or the beginning of an aggregate passed by value (usually
2952 // byval).  Either way, we allocate stack slots adjacent to the data
2953 // provided by our caller, and store the unallocated registers there.
2954 // If this is a variadic function, the va_list pointer will begin with
2955 // these values; otherwise, this reassembles a (byval) structure that
2956 // was split between registers and memory.
2957 // Return: The frame index registers were stored into.
2958 int
2959 ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
2960                                   SDLoc dl, SDValue &Chain,
2961                                   const Value *OrigArg,
2962                                   unsigned InRegsParamRecordIdx,
2963                                   int ArgOffset,
2964                                   unsigned ArgSize) const {
2965   // Currently, two use-cases possible:
2966   // Case #1. Non-var-args function, and we meet first byval parameter.
2967   //          Setup first unallocated register as first byval register;
2968   //          eat all remained registers
2969   //          (these two actions are performed by HandleByVal method).
2970   //          Then, here, we initialize stack frame with
2971   //          "store-reg" instructions.
2972   // Case #2. Var-args function, that doesn't contain byval parameters.
2973   //          The same: eat all remained unallocated registers,
2974   //          initialize stack frame.
2975
2976   MachineFunction &MF = DAG.getMachineFunction();
2977   MachineFrameInfo *MFI = MF.getFrameInfo();
2978   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2979   unsigned RBegin, REnd;
2980   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
2981     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
2982   } else {
2983     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
2984     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
2985     REnd = ARM::R4;
2986   }
2987
2988   if (REnd != RBegin)
2989     ArgOffset = -4 * (ARM::R4 - RBegin);
2990
2991   auto PtrVT = getPointerTy(DAG.getDataLayout());
2992   int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
2993   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
2994
2995   SmallVector<SDValue, 4> MemOps;
2996   const TargetRegisterClass *RC =
2997       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
2998
2999   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
3000     unsigned VReg = MF.addLiveIn(Reg, RC);
3001     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
3002     SDValue Store =
3003         DAG.getStore(Val.getValue(1), dl, Val, FIN,
3004                      MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
3005     MemOps.push_back(Store);
3006     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
3007   }
3008
3009   if (!MemOps.empty())
3010     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3011   return FrameIndex;
3012 }
3013
3014 // Setup stack frame, the va_list pointer will start from.
3015 void
3016 ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3017                                         SDLoc dl, SDValue &Chain,
3018                                         unsigned ArgOffset,
3019                                         unsigned TotalArgRegsSaveSize,
3020                                         bool ForceMutable) const {
3021   MachineFunction &MF = DAG.getMachineFunction();
3022   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3023
3024   // Try to store any remaining integer argument regs
3025   // to their spots on the stack so that they may be loaded by deferencing
3026   // the result of va_next.
3027   // If there is no regs to be stored, just point address after last
3028   // argument passed via stack.
3029   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3030                                   CCInfo.getInRegsParamsCount(),
3031                                   CCInfo.getNextStackOffset(), 4);
3032   AFI->setVarArgsFrameIndex(FrameIndex);
3033 }
3034
3035 SDValue
3036 ARMTargetLowering::LowerFormalArguments(SDValue Chain,
3037                                         CallingConv::ID CallConv, bool isVarArg,
3038                                         const SmallVectorImpl<ISD::InputArg>
3039                                           &Ins,
3040                                         SDLoc dl, SelectionDAG &DAG,
3041                                         SmallVectorImpl<SDValue> &InVals)
3042                                           const {
3043   MachineFunction &MF = DAG.getMachineFunction();
3044   MachineFrameInfo *MFI = MF.getFrameInfo();
3045
3046   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3047
3048   // Assign locations to all of the incoming arguments.
3049   SmallVector<CCValAssign, 16> ArgLocs;
3050   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3051                     *DAG.getContext(), Prologue);
3052   CCInfo.AnalyzeFormalArguments(Ins,
3053                                 CCAssignFnForNode(CallConv, /* Return*/ false,
3054                                                   isVarArg));
3055
3056   SmallVector<SDValue, 16> ArgValues;
3057   SDValue ArgValue;
3058   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
3059   unsigned CurArgIdx = 0;
3060
3061   // Initially ArgRegsSaveSize is zero.
3062   // Then we increase this value each time we meet byval parameter.
3063   // We also increase this value in case of varargs function.
3064   AFI->setArgRegsSaveSize(0);
3065
3066   // Calculate the amount of stack space that we need to allocate to store
3067   // byval and variadic arguments that are passed in registers.
3068   // We need to know this before we allocate the first byval or variadic
3069   // argument, as they will be allocated a stack slot below the CFA (Canonical
3070   // Frame Address, the stack pointer at entry to the function).
3071   unsigned ArgRegBegin = ARM::R4;
3072   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3073     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3074       break;
3075
3076     CCValAssign &VA = ArgLocs[i];
3077     unsigned Index = VA.getValNo();
3078     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3079     if (!Flags.isByVal())
3080       continue;
3081
3082     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3083     unsigned RBegin, REnd;
3084     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3085     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3086
3087     CCInfo.nextInRegsParam();
3088   }
3089   CCInfo.rewindByValRegsInfo();
3090
3091   int lastInsIndex = -1;
3092   if (isVarArg && MFI->hasVAStart()) {
3093     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3094     if (RegIdx != array_lengthof(GPRArgRegs))
3095       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3096   }
3097
3098   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3099   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3100   auto PtrVT = getPointerTy(DAG.getDataLayout());
3101
3102   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3103     CCValAssign &VA = ArgLocs[i];
3104     if (Ins[VA.getValNo()].isOrigArg()) {
3105       std::advance(CurOrigArg,
3106                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3107       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3108     }
3109     // Arguments stored in registers.
3110     if (VA.isRegLoc()) {
3111       EVT RegVT = VA.getLocVT();
3112
3113       if (VA.needsCustom()) {
3114         // f64 and vector types are split up into multiple registers or
3115         // combinations of registers and stack slots.
3116         if (VA.getLocVT() == MVT::v2f64) {
3117           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3118                                                    Chain, DAG, dl);
3119           VA = ArgLocs[++i]; // skip ahead to next loc
3120           SDValue ArgValue2;
3121           if (VA.isMemLoc()) {
3122             int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
3123             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3124             ArgValue2 = DAG.getLoad(
3125                 MVT::f64, dl, Chain, FIN,
3126                 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3127                 false, false, false, 0);
3128           } else {
3129             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3130                                              Chain, DAG, dl);
3131           }
3132           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3133           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3134                                  ArgValue, ArgValue1,
3135                                  DAG.getIntPtrConstant(0, dl));
3136           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3137                                  ArgValue, ArgValue2,
3138                                  DAG.getIntPtrConstant(1, dl));
3139         } else
3140           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3141
3142       } else {
3143         const TargetRegisterClass *RC;
3144
3145         if (RegVT == MVT::f32)
3146           RC = &ARM::SPRRegClass;
3147         else if (RegVT == MVT::f64)
3148           RC = &ARM::DPRRegClass;
3149         else if (RegVT == MVT::v2f64)
3150           RC = &ARM::QPRRegClass;
3151         else if (RegVT == MVT::i32)
3152           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3153                                            : &ARM::GPRRegClass;
3154         else
3155           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3156
3157         // Transform the arguments in physical registers into virtual ones.
3158         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3159         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3160       }
3161
3162       // If this is an 8 or 16-bit value, it is really passed promoted
3163       // to 32 bits.  Insert an assert[sz]ext to capture this, then
3164       // truncate to the right size.
3165       switch (VA.getLocInfo()) {
3166       default: llvm_unreachable("Unknown loc info!");
3167       case CCValAssign::Full: break;
3168       case CCValAssign::BCvt:
3169         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
3170         break;
3171       case CCValAssign::SExt:
3172         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3173                                DAG.getValueType(VA.getValVT()));
3174         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3175         break;
3176       case CCValAssign::ZExt:
3177         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3178                                DAG.getValueType(VA.getValVT()));
3179         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3180         break;
3181       }
3182
3183       InVals.push_back(ArgValue);
3184
3185     } else { // VA.isRegLoc()
3186
3187       // sanity check
3188       assert(VA.isMemLoc());
3189       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
3190
3191       int index = VA.getValNo();
3192
3193       // Some Ins[] entries become multiple ArgLoc[] entries.
3194       // Process them only once.
3195       if (index != lastInsIndex)
3196         {
3197           ISD::ArgFlagsTy Flags = Ins[index].Flags;
3198           // FIXME: For now, all byval parameter objects are marked mutable.
3199           // This can be changed with more analysis.
3200           // In case of tail call optimization mark all arguments mutable.
3201           // Since they could be overwritten by lowering of arguments in case of
3202           // a tail call.
3203           if (Flags.isByVal()) {
3204             assert(Ins[index].isOrigArg() &&
3205                    "Byval arguments cannot be implicit");
3206             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
3207
3208             int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg,
3209                                             CurByValIndex, VA.getLocMemOffset(),
3210                                             Flags.getByValSize());
3211             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
3212             CCInfo.nextInRegsParam();
3213           } else {
3214             unsigned FIOffset = VA.getLocMemOffset();
3215             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
3216                                             FIOffset, true);
3217
3218             // Create load nodes to retrieve arguments from the stack.
3219             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3220             InVals.push_back(DAG.getLoad(
3221                 VA.getValVT(), dl, Chain, FIN,
3222                 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3223                 false, false, false, 0));
3224           }
3225           lastInsIndex = index;
3226         }
3227     }
3228   }
3229
3230   // varargs
3231   if (isVarArg && MFI->hasVAStart())
3232     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3233                          CCInfo.getNextStackOffset(),
3234                          TotalArgRegsSaveSize);
3235
3236   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
3237
3238   return Chain;
3239 }
3240
3241 /// isFloatingPointZero - Return true if this is +0.0.
3242 static bool isFloatingPointZero(SDValue Op) {
3243   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3244     return CFP->getValueAPF().isPosZero();
3245   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3246     // Maybe this has already been legalized into the constant pool?
3247     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3248       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3249       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3250         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3251           return CFP->getValueAPF().isPosZero();
3252     }
3253   } else if (Op->getOpcode() == ISD::BITCAST &&
3254              Op->getValueType(0) == MVT::f64) {
3255     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
3256     // created by LowerConstantFP().
3257     SDValue BitcastOp = Op->getOperand(0);
3258     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) {
3259       SDValue MoveOp = BitcastOp->getOperand(0);
3260       if (MoveOp->getOpcode() == ISD::TargetConstant &&
3261           cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) {
3262         return true;
3263       }
3264     }
3265   }
3266   return false;
3267 }
3268
3269 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
3270 /// the given operands.
3271 SDValue
3272 ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3273                              SDValue &ARMcc, SelectionDAG &DAG,
3274                              SDLoc dl) const {
3275   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3276     unsigned C = RHSC->getZExtValue();
3277     if (!isLegalICmpImmediate(C)) {
3278       // Constant does not fit, try adjusting it by one?
3279       switch (CC) {
3280       default: break;
3281       case ISD::SETLT:
3282       case ISD::SETGE:
3283         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
3284           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3285           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3286         }
3287         break;
3288       case ISD::SETULT:
3289       case ISD::SETUGE:
3290         if (C != 0 && isLegalICmpImmediate(C-1)) {
3291           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3292           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3293         }
3294         break;
3295       case ISD::SETLE:
3296       case ISD::SETGT:
3297         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
3298           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3299           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3300         }
3301         break;
3302       case ISD::SETULE:
3303       case ISD::SETUGT:
3304         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
3305           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3306           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3307         }
3308         break;
3309       }
3310     }
3311   }
3312
3313   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3314   ARMISD::NodeType CompareType;
3315   switch (CondCode) {
3316   default:
3317     CompareType = ARMISD::CMP;
3318     break;
3319   case ARMCC::EQ:
3320   case ARMCC::NE:
3321     // Uses only Z Flag
3322     CompareType = ARMISD::CMPZ;
3323     break;
3324   }
3325   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3326   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
3327 }
3328
3329 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3330 SDValue
3331 ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
3332                              SDLoc dl) const {
3333   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
3334   SDValue Cmp;
3335   if (!isFloatingPointZero(RHS))
3336     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3337   else
3338     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3339   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3340 }
3341
3342 /// duplicateCmp - Glue values can have only one use, so this function
3343 /// duplicates a comparison node.
3344 SDValue
3345 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3346   unsigned Opc = Cmp.getOpcode();
3347   SDLoc DL(Cmp);
3348   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3349     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3350
3351   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
3352   Cmp = Cmp.getOperand(0);
3353   Opc = Cmp.getOpcode();
3354   if (Opc == ARMISD::CMPFP)
3355     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3356   else {
3357     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3358     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3359   }
3360   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3361 }
3362
3363 std::pair<SDValue, SDValue>
3364 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
3365                                  SDValue &ARMcc) const {
3366   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
3367
3368   SDValue Value, OverflowCmp;
3369   SDValue LHS = Op.getOperand(0);
3370   SDValue RHS = Op.getOperand(1);
3371   SDLoc dl(Op);
3372
3373   // FIXME: We are currently always generating CMPs because we don't support
3374   // generating CMN through the backend. This is not as good as the natural
3375   // CMP case because it causes a register dependency and cannot be folded
3376   // later.
3377
3378   switch (Op.getOpcode()) {
3379   default:
3380     llvm_unreachable("Unknown overflow instruction!");
3381   case ISD::SADDO:
3382     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3383     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3384     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3385     break;
3386   case ISD::UADDO:
3387     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3388     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3389     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3390     break;
3391   case ISD::SSUBO:
3392     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3393     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3394     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3395     break;
3396   case ISD::USUBO:
3397     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3398     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3399     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3400     break;
3401   } // switch (...)
3402
3403   return std::make_pair(Value, OverflowCmp);
3404 }
3405
3406
3407 SDValue
3408 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
3409   // Let legalize expand this if it isn't a legal type yet.
3410   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3411     return SDValue();
3412
3413   SDValue Value, OverflowCmp;
3414   SDValue ARMcc;
3415   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
3416   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3417   SDLoc dl(Op);
3418   // We use 0 and 1 as false and true values.
3419   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3420   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3421   EVT VT = Op.getValueType();
3422
3423   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
3424                                  ARMcc, CCR, OverflowCmp);
3425
3426   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3427   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3428 }
3429
3430
3431 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
3432   SDValue Cond = Op.getOperand(0);
3433   SDValue SelectTrue = Op.getOperand(1);
3434   SDValue SelectFalse = Op.getOperand(2);
3435   SDLoc dl(Op);
3436   unsigned Opc = Cond.getOpcode();
3437
3438   if (Cond.getResNo() == 1 &&
3439       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
3440        Opc == ISD::USUBO)) {
3441     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
3442       return SDValue();
3443
3444     SDValue Value, OverflowCmp;
3445     SDValue ARMcc;
3446     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
3447     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3448     EVT VT = Op.getValueType();
3449
3450     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
3451                    OverflowCmp, DAG);
3452   }
3453
3454   // Convert:
3455   //
3456   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
3457   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
3458   //
3459   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
3460     const ConstantSDNode *CMOVTrue =
3461       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
3462     const ConstantSDNode *CMOVFalse =
3463       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3464
3465     if (CMOVTrue && CMOVFalse) {
3466       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
3467       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
3468
3469       SDValue True;
3470       SDValue False;
3471       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
3472         True = SelectTrue;
3473         False = SelectFalse;
3474       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
3475         True = SelectFalse;
3476         False = SelectTrue;
3477       }
3478
3479       if (True.getNode() && False.getNode()) {
3480         EVT VT = Op.getValueType();
3481         SDValue ARMcc = Cond.getOperand(2);
3482         SDValue CCR = Cond.getOperand(3);
3483         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
3484         assert(True.getValueType() == VT);
3485         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
3486       }
3487     }
3488   }
3489
3490   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
3491   // undefined bits before doing a full-word comparison with zero.
3492   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
3493                      DAG.getConstant(1, dl, Cond.getValueType()));
3494
3495   return DAG.getSelectCC(dl, Cond,
3496                          DAG.getConstant(0, dl, Cond.getValueType()),
3497                          SelectTrue, SelectFalse, ISD::SETNE);
3498 }
3499
3500 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
3501                                  bool &swpCmpOps, bool &swpVselOps) {
3502   // Start by selecting the GE condition code for opcodes that return true for
3503   // 'equality'
3504   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
3505       CC == ISD::SETULE)
3506     CondCode = ARMCC::GE;
3507
3508   // and GT for opcodes that return false for 'equality'.
3509   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
3510            CC == ISD::SETULT)
3511     CondCode = ARMCC::GT;
3512
3513   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
3514   // to swap the compare operands.
3515   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
3516       CC == ISD::SETULT)
3517     swpCmpOps = true;
3518
3519   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
3520   // If we have an unordered opcode, we need to swap the operands to the VSEL
3521   // instruction (effectively negating the condition).
3522   //
3523   // This also has the effect of swapping which one of 'less' or 'greater'
3524   // returns true, so we also swap the compare operands. It also switches
3525   // whether we return true for 'equality', so we compensate by picking the
3526   // opposite condition code to our original choice.
3527   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
3528       CC == ISD::SETUGT) {
3529     swpCmpOps = !swpCmpOps;
3530     swpVselOps = !swpVselOps;
3531     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
3532   }
3533
3534   // 'ordered' is 'anything but unordered', so use the VS condition code and
3535   // swap the VSEL operands.
3536   if (CC == ISD::SETO) {
3537     CondCode = ARMCC::VS;
3538     swpVselOps = true;
3539   }
3540
3541   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
3542   // code and swap the VSEL operands.
3543   if (CC == ISD::SETUNE) {
3544     CondCode = ARMCC::EQ;
3545     swpVselOps = true;
3546   }
3547 }
3548
3549 SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
3550                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
3551                                    SDValue Cmp, SelectionDAG &DAG) const {
3552   if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
3553     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
3554                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
3555     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
3556                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
3557
3558     SDValue TrueLow = TrueVal.getValue(0);
3559     SDValue TrueHigh = TrueVal.getValue(1);
3560     SDValue FalseLow = FalseVal.getValue(0);
3561     SDValue FalseHigh = FalseVal.getValue(1);
3562
3563     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
3564                               ARMcc, CCR, Cmp);
3565     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
3566                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
3567
3568     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
3569   } else {
3570     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
3571                        Cmp);
3572   }
3573 }
3574
/// Custom-lower ISD::SELECT_CC. The node's operands are read as
/// (LHS, RHS, TrueVal, FalseVal, CondCode).
///
/// Outline:
///  - On FP-only-SP subtargets an f64 comparison is first softened via
///    softenSetCCOperands; if that leaves no RHS, the result is compared
///    against zero with SETNE.
///  - An i32 comparison becomes getARMCmp + getCMOV. With FP-ARMv8 and an
///    f32/f64 result, conditions VSEL cannot encode are inverted and the
///    select arms exchanged first.
///  - A floating-point comparison is first tried as FMAXNUM/FMINNUM (FP-ARMv8
///    only), then massaged for VSEL via checkVSELConstraints, and finally
///    emitted as getVFPCmp + getCMOV, with a second CMOV when FPCCToARMCC
///    produced a second condition code.
3575 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
3576   EVT VT = Op.getValueType();
3577   SDValue LHS = Op.getOperand(0);
3578   SDValue RHS = Op.getOperand(1);
3579   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3580   SDValue TrueVal = Op.getOperand(2);
3581   SDValue FalseVal = Op.getOperand(3);
3582   SDLoc dl(Op);
3583
3584   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
3585     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
3586                                                     dl);
3587
3588     // If softenSetCCOperands only returned one value, we should compare it to
3589     // zero.
3590     if (!RHS.getNode()) {
3591       RHS = DAG.getConstant(0, dl, LHS.getValueType());
3592       CC = ISD::SETNE;
3593     }
3594   }
3595
3596   if (LHS.getValueType() == MVT::i32) {
3597     // Try to generate VSEL on ARMv8.
3598     // The VSEL instruction can't use all the usual ARM condition
3599     // codes: it only has two bits to select the condition code, so it's
3600     // constrained to use only GE, GT, VS and EQ.
3601     //
3602     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
3603     // swap the operands of the previous compare instruction (effectively
3604     // inverting the compare condition, swapping 'less' and 'greater') and
3605     // sometimes need to swap the operands to the VSEL (which inverts the
3606     // condition in the sense of firing whenever the previous condition didn't)
3607     if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
3608                                     TrueVal.getValueType() == MVT::f64)) {
3609       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
           // LT, LE, VC and NE are outside the GE/GT/VS/EQ set listed above:
           // invert the condition and exchange the select arms instead.
3610       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
3611           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
3612         CC = ISD::getSetCCInverse(CC, true);
3613         std::swap(TrueVal, FalseVal);
3614       }
3615     }
3616
3617     SDValue ARMcc;
3618     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3619     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3620     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
3621   }
3622
       // Floating-point comparison from here on. FPCCToARMCC fills in CondCode
       // and CondCode2; CondCode2 is only acted on at the end when it is not AL.
3623   ARMCC::CondCodes CondCode, CondCode2;
3624   FPCCToARMCC(CC, CondCode, CondCode2);
3625
3626   // Try to generate VMAXNM/VMINNM on ARMv8.
3627   if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
3628                                   TrueVal.getValueType() == MVT::f64)) {
3629     // We can use VMAXNM/VMINNM for a compare followed by a select with the
3630     // same operands, as follows:
3631     //   c = fcmp [?gt, ?ge, ?lt, ?le] a, b
3632     //   select c, a, b
3633     // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'.
3634     bool swapSides = false;
3635     if (!getTargetMachine().Options.NoNaNsFPMath) {
3636       // transformability may depend on which way around we compare
3637       switch (CC) {
3638       default:
3639         break;
3640       case ISD::SETOGT:
3641       case ISD::SETOGE:
3642       case ISD::SETOLT:
3643       case ISD::SETOLE:
3644         // the non-NaN should be RHS
3645         swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS);
3646         break;
3647       case ISD::SETUGT:
3648       case ISD::SETUGE:
3649       case ISD::SETULT:
3650       case ISD::SETULE:
3651         // the non-NaN should be LHS
3652         swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS);
3653         break;
3654       }
3655     }
         // Also swap when the compare operands match the select arms in reverse
         // order, so that the LHS == TrueVal && RHS == FalseVal test below can
         // succeed.
3656     swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal);
3657     if (swapSides) {
3658       CC = ISD::getSetCCSwappedOperands(CC);
3659       std::swap(LHS, RHS);
3660     }
3661     if (LHS == TrueVal && RHS == FalseVal) {
3662       bool canTransform = true;
3663       // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here
3664       if (!getTargetMachine().Options.UnsafeFPMath &&
3665           !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
             // Either operand might be a zero: only transform when the operand
             // named in each case is an FP constant with the required sign.
3666         const ConstantFPSDNode *Zero;
3667         switch (CC) {
3668         default:
3669           break;
3670         case ISD::SETOGT:
3671         case ISD::SETUGT:
3672         case ISD::SETGT:
3673           // RHS must not be -0
3674           canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) &&
3675                          !Zero->isNegative();
3676           break;
3677         case ISD::SETOGE:
3678         case ISD::SETUGE:
3679         case ISD::SETGE:
3680           // LHS must not be -0
3681           canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) &&
3682                          !Zero->isNegative();
3683           break;
3684         case ISD::SETOLT:
3685         case ISD::SETULT:
3686         case ISD::SETLT:
3687           // RHS must not be +0
3688           canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) &&
3689                           Zero->isNegative();
3690           break;
3691         case ISD::SETOLE:
3692         case ISD::SETULE:
3693         case ISD::SETLE:
3694           // LHS must not be +0
3695           canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) &&
3696                           Zero->isNegative();
3697           break;
3698         }
3699       }
3700       if (canTransform) {
3701         // Note: If one of the elements in a pair is a number and the other
3702         // element is NaN, the corresponding result element is the number.
3703         // This is consistent with the IEEE 754-2008 standard.
3704         // Therefore, a > b ? a : b <=> vmax(a,b), if b is constant and a is NaN
             // A 'break' below abandons the min/max transform and continues
             // with the VSEL/CMOV lowering after this block.
3705         switch (CC) {
3706         default:
3707           break;
3708         case ISD::SETOGT:
3709         case ISD::SETOGE:
3710           if (!DAG.isKnownNeverNaN(RHS))
3711             break;
3712           return DAG.getNode(ISD::FMAXNUM, dl, VT, LHS, RHS);
3713         case ISD::SETUGT:
3714         case ISD::SETUGE:
3715           if (!DAG.isKnownNeverNaN(LHS))
3716             break;
               // Otherwise falls through to the SETGT/SETGE return below.
3717         case ISD::SETGT:
3718         case ISD::SETGE:
3719           return DAG.getNode(ISD::FMAXNUM, dl, VT, LHS, RHS);
3720         case ISD::SETOLT:
3721         case ISD::SETOLE:
3722           if (!DAG.isKnownNeverNaN(RHS))
3723             break;
3724           return DAG.getNode(ISD::FMINNUM, dl, VT, LHS, RHS);
3725         case ISD::SETULT:
3726         case ISD::SETULE:
3727           if (!DAG.isKnownNeverNaN(LHS))
3728             break;
               // Otherwise falls through to the SETLT/SETLE return below.
3729         case ISD::SETLT:
3730         case ISD::SETLE:
3731           return DAG.getNode(ISD::FMINNUM, dl, VT, LHS, RHS);
3732         }
3733       }
3734     }
3735
3736     bool swpCmpOps = false;
3737     bool swpVselOps = false;
3738     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
3739
         // Apply the requested swaps only when the resulting condition is one
         // of the four (GT, GE, VS, EQ) that VSEL can encode.
3740     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
3741         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
3742       if (swpCmpOps)
3743         std::swap(LHS, RHS);
3744       if (swpVselOps)
3745         std::swap(TrueVal, FalseVal);
3746     }
3747   }
3748
3749   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3750   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3751   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3752   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
       // When a second condition code exists, chain a second CMOV whose false
       // value is the first CMOV's result and whose true value is TrueVal.
3753   if (CondCode2 != ARMCC::AL) {
3754     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
3755     // FIXME: Needs another CMP because flag can have but one use.
3756     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
3757     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
3758   }
3759   return Result;
3760 }
3761
3762 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
3763 /// to morph to an integer compare sequence.
3764 static bool canChangeToInt(SDValue Op, bool &SeenZero,
3765                            const ARMSubtarget *Subtarget) {
3766   SDNode *N = Op.getNode();
3767   if (!N->hasOneUse())
3768     // Otherwise it requires moving the value from fp to integer registers.
3769     return false;
3770   if (!N->getNumValues())
3771     return false;
3772   EVT VT = Op.getValueType();
3773   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
3774     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
3775     // vmrs are very slow, e.g. cortex-a8.
3776     return false;
3777
3778   if (isFloatingPointZero(Op)) {
3779     SeenZero = true;
3780     return true;
3781   }
3782   return ISD::isNormalLoad(N);
3783 }
3784
3785 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
3786   if (isFloatingPointZero(Op))
3787     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
3788
3789   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
3790     return DAG.getLoad(MVT::i32, SDLoc(Op),
3791                        Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
3792                        Ld->isVolatile(), Ld->isNonTemporal(),
3793                        Ld->isInvariant(), Ld->getAlignment());
3794
3795   llvm_unreachable("Unknown VFP cmp argument!");
3796 }
3797
3798 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
3799                            SDValue &RetVal1, SDValue &RetVal2) {
3800   SDLoc dl(Op);
3801
3802   if (isFloatingPointZero(Op)) {
3803     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
3804     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
3805     return;
3806   }
3807
3808   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
3809     SDValue Ptr = Ld->getBasePtr();
3810     RetVal1 = DAG.getLoad(MVT::i32, dl,
3811                           Ld->getChain(), Ptr,
3812                           Ld->getPointerInfo(),
3813                           Ld->isVolatile(), Ld->isNonTemporal(),
3814                           Ld->isInvariant(), Ld->getAlignment());
3815
3816     EVT PtrType = Ptr.getValueType();
3817     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
3818     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
3819                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
3820     RetVal2 = DAG.getLoad(MVT::i32, dl,
3821                           Ld->getChain(), NewPtr,
3822                           Ld->getPointerInfo().getWithOffset(4),
3823                           Ld->isVolatile(), Ld->isNonTemporal(),
3824                           Ld->isInvariant(), NewAlign);
3825     return;
3826   }
3827
3828   llvm_unreachable("Unknown VFP cmp argument!");
3829 }
3830
3831 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
3832 /// f32 and even f64 comparisons to integer ones.
3833 SDValue
3834 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
3835   SDValue Chain = Op.getOperand(0);
3836   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3837   SDValue LHS = Op.getOperand(2);
3838   SDValue RHS = Op.getOperand(3);
3839   SDValue Dest = Op.getOperand(4);
3840   SDLoc dl(Op);
3841
3842   bool LHSSeenZero = false;
3843   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
3844   bool RHSSeenZero = false;
3845   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
3846   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
3847     // If unsafe fp math optimization is enabled and there are no other uses of
3848     // the CMP operands, and the condition code is EQ or NE, we can optimize it
3849     // to an integer comparison.
3850     if (CC == ISD::SETOEQ)
3851       CC = ISD::SETEQ;
3852     else if (CC == ISD::SETUNE)
3853       CC = ISD::SETNE;
3854
3855     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
3856     SDValue ARMcc;
3857     if (LHS.getValueType() == MVT::f32) {
3858       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3859                         bitcastf32Toi32(LHS, DAG), Mask);
3860       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3861                         bitcastf32Toi32(RHS, DAG), Mask);
3862       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3863       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3864       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3865                          Chain, Dest, ARMcc, CCR, Cmp);
3866     }
3867
3868     SDValue LHS1, LHS2;
3869     SDValue RHS1, RHS2;
3870     expandf64Toi32(LHS, DAG, LHS1, LHS2);
3871     expandf64Toi32(RHS, DAG, RHS1, RHS2);
3872     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
3873     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
3874     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3875     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3876     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3877     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
3878     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
3879   }
3880
3881   return SDValue();
3882 }
3883
3884 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3885   SDValue Chain = Op.getOperand(0);
3886   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3887   SDValue LHS = Op.getOperand(2);
3888   SDValue RHS = Op.getOperand(3);
3889   SDValue Dest = Op.getOperand(4);
3890   SDLoc dl(Op);
3891
3892   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
3893     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
3894                                                     dl);
3895
3896     // If softenSetCCOperands only returned one value, we should compare it to
3897     // zero.
3898     if (!RHS.getNode()) {
3899       RHS = DAG.getConstant(0, dl, LHS.getValueType());
3900       CC = ISD::SETNE;
3901     }
3902   }
3903
3904   if (LHS.getValueType() == MVT::i32) {
3905     SDValue ARMcc;
3906     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3907     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3908     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3909                        Chain, Dest, ARMcc, CCR, Cmp);
3910   }
3911
3912   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3913
3914   if (getTargetMachine().Options.UnsafeFPMath &&
3915       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
3916        CC == ISD::SETNE || CC == ISD::SETUNE)) {
3917     SDValue Result = OptimizeVFPBrcond(Op, DAG);
3918     if (Result.getNode())
3919       return Result;
3920   }
3921
3922   ARMCC::CondCodes CondCode, CondCode2;
3923   FPCCToARMCC(CC, CondCode, CondCode2);
3924
3925   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3926   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3927   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3928   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3929   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
3930   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
3931   if (CondCode2 != ARMCC::AL) {
3932     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
3933     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
3934     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
3935   }
3936   return Res;
3937 }
3938
3939 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3940   SDValue Chain = Op.getOperand(0);
3941   SDValue Table = Op.getOperand(1);
3942   SDValue Index = Op.getOperand(2);
3943   SDLoc dl(Op);
3944
3945   EVT PTy = getPointerTy(DAG.getDataLayout());
3946   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
3947   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
3948   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
3949   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
3950   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
3951   if (Subtarget->isThumb2()) {
3952     // Thumb2 uses a two-level jump. That is, it jumps into the jump table
3953     // which does another jump to the destination. This also makes it easier
3954     // to translate it to TBB / TBH later.
3955     // FIXME: This might not work if the function is extremely large.
3956     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
3957                        Addr, Op.getOperand(2), JTI);
3958   }
3959   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
3960     Addr =
3961         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
3962                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
3963                     false, false, false, 0);
3964     Chain = Addr.getValue(1);
3965     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
3966     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
3967   } else {
3968     Addr =
3969         DAG.getLoad(PTy, dl, Chain, Addr,
3970                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
3971                     false, false, false, 0);
3972     Chain = Addr.getValue(1);
3973     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
3974   }
3975 }
3976
3977 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3978   EVT VT = Op.getValueType();
3979   SDLoc dl(Op);
3980
3981   if (Op.getValueType().getVectorElementType() == MVT::i32) {
3982     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
3983       return Op;
3984     return DAG.UnrollVectorOp(Op.getNode());
3985   }
3986
3987   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
3988          "Invalid type for custom lowering!");
3989   if (VT != MVT::v4i16)
3990     return DAG.UnrollVectorOp(Op.getNode());
3991
3992   Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
3993   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
3994 }
3995
3996 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
3997   EVT VT = Op.getValueType();
3998   if (VT.isVector())
3999     return LowerVectorFP_TO_INT(Op, DAG);
4000   if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
4001     RTLIB::Libcall LC;
4002     if (Op.getOpcode() == ISD::FP_TO_SINT)
4003       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
4004                               Op.getValueType());
4005     else
4006       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
4007                               Op.getValueType());
4008     return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1,
4009                        /*isSigned*/ false, SDLoc(Op)).first;
4010   }
4011
4012   return Op;
4013 }
4014
4015 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4016   EVT VT = Op.getValueType();
4017   SDLoc dl(Op);
4018
4019   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
4020     if (VT.getVectorElementType() == MVT::f32)
4021       return Op;
4022     return DAG.UnrollVectorOp(Op.getNode());
4023   }
4024
4025   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
4026          "Invalid type for custom lowering!");
4027   if (VT != MVT::v4f32)
4028     return DAG.UnrollVectorOp(Op.getNode());
4029
4030   unsigned CastOpc;
4031   unsigned Opc;
4032   switch (Op.getOpcode()) {
4033   default: llvm_unreachable("Invalid opcode!");
4034   case ISD::SINT_TO_FP:
4035     CastOpc = ISD::SIGN_EXTEND;
4036     Opc = ISD::SINT_TO_FP;
4037     break;
4038   case ISD::UINT_TO_FP:
4039     CastOpc = ISD::ZERO_EXTEND;
4040     Opc = ISD::UINT_TO_FP;
4041     break;
4042   }
4043
4044   Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
4045   return DAG.getNode(Opc, dl, VT, Op);
4046 }
4047
4048 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
4049   EVT VT = Op.getValueType();
4050   if (VT.isVector())
4051     return LowerVectorINT_TO_FP(Op, DAG);
4052   if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
4053     RTLIB::Libcall LC;
4054     if (Op.getOpcode() == ISD::SINT_TO_FP)
4055       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
4056                               Op.getValueType());
4057     else
4058       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
4059                               Op.getValueType());
4060     return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1,
4061                        /*isSigned*/ false, SDLoc(Op)).first;
4062   }
4063
4064   return Op;
4065 }
4066
4067 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
4068   // Implement fcopysign with a fabs and a conditional fneg.
4069   SDValue Tmp0 = Op.getOperand(0);
4070   SDValue Tmp1 = Op.getOperand(1);
4071   SDLoc dl(Op);
4072   EVT VT = Op.getValueType();
4073   EVT SrcVT = Tmp1.getValueType();
4074   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
4075     Tmp0.getOpcode() == ARMISD::VMOVDRR;
4076   bool UseNEON = !InGPR && Subtarget->hasNEON();
4077
4078   if (UseNEON) {
4079     // Use VBSL to copy the sign bit.
4080     unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
4081     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
4082                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
4083     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
4084     if (VT == MVT::f64)
4085       Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4086                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
4087                          DAG.getConstant(32, dl, MVT::i32));
4088     else /*if (VT == MVT::f32)*/
4089       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
4090     if (SrcVT == MVT::f32) {
4091       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
4092       if (VT == MVT::f64)
4093         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4094                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
4095                            DAG.getConstant(32, dl, MVT::i32));
4096     } else if (VT == MVT::f32)
4097       Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
4098                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
4099                          DAG.getConstant(32, dl, MVT::i32));
4100     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
4101     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
4102
4103     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
4104                                             dl, MVT::i32);
4105     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
4106     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
4107                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
4108
4109     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
4110                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
4111                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
4112     if (VT == MVT::f32) {
4113       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
4114       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
4115                         DAG.getConstant(0, dl, MVT::i32));
4116     } else {
4117       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
4118     }
4119
4120     return Res;
4121   }
4122
4123   // Bitcast operand 1 to i32.
4124   if (SrcVT == MVT::f64)
4125     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4126                        Tmp1).getValue(1);
4127   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
4128
4129   // Or in the signbit with integer operations.
4130   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
4131   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4132   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
4133   if (VT == MVT::f32) {
4134     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
4135                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
4136     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
4137                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
4138   }
4139
4140   // f64: Or the high part with signbit and then combine two parts.
4141   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4142                      Tmp0);
4143   SDValue Lo = Tmp0.getValue(0);
4144   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
4145   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
4146   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
4147 }
4148
4149 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
4150   MachineFunction &MF = DAG.getMachineFunction();
4151   MachineFrameInfo *MFI = MF.getFrameInfo();
4152   MFI->setReturnAddressIsTaken(true);
4153
4154   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
4155     return SDValue();
4156
4157   EVT VT = Op.getValueType();
4158   SDLoc dl(Op);
4159   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4160   if (Depth) {
4161     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4162     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
4163     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
4164                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
4165                        MachinePointerInfo(), false, false, false, 0);
4166   }
4167
4168   // Return LR, which contains the return address. Mark it an implicit live-in.
4169   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4170   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
4171 }
4172
4173 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
4174   const ARMBaseRegisterInfo &ARI =
4175     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
4176   MachineFunction &MF = DAG.getMachineFunction();
4177   MachineFrameInfo *MFI = MF.getFrameInfo();
4178   MFI->setFrameAddressIsTaken(true);
4179
4180   EVT VT = Op.getValueType();
4181   SDLoc dl(Op);  // FIXME probably not meaningful
4182   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4183   unsigned FrameReg = ARI.getFrameRegister(MF);
4184   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
4185   while (Depth--)
4186     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
4187                             MachinePointerInfo(),
4188                             false, false, false, 0);
4189   return FrameAddr;
4190 }
4191
4192 // FIXME? Maybe this could be a TableGen attribute on some registers and
4193 // this table could be generated automatically from RegInfo.
4194 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
4195                                               SelectionDAG &DAG) const {
4196   unsigned Reg = StringSwitch<unsigned>(RegName)
4197                        .Case("sp", ARM::SP)
4198                        .Default(0);
4199   if (Reg)
4200     return Reg;
4201   report_fatal_error(Twine("Invalid register name \""
4202                               + StringRef(RegName)  + "\"."));
4203 }
4204
4205 // Result is 64 bit value so split into two 32 bit values and return as a
4206 // pair of values.
4207 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
4208                                 SelectionDAG &DAG) {
4209   SDLoc DL(N);
4210
4211   // This function is only supposed to be called for i64 type destination.
4212   assert(N->getValueType(0) == MVT::i64
4213           && "ExpandREAD_REGISTER called for non-i64 type result.");
4214
4215   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
4216                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
4217                              N->getOperand(0),
4218                              N->getOperand(1));
4219
4220   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
4221                     Read.getValue(1)));
4222   Results.push_back(Read.getOperand(0));
4223 }
4224
4225 /// ExpandBITCAST - If the target supports VFP, this function is called to
4226 /// expand a bit convert where either the source or destination type is i64 to
4227 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
4228 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
4229 /// vectors), since the legalizer won't know what to do with that.
4230 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
4231   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4232   SDLoc dl(N);
4233   SDValue Op = N->getOperand(0);
4234
4235   // This function is only supposed to be called for i64 types, either as the
4236   // source or destination of the bit convert.
4237   EVT SrcVT = Op.getValueType();
4238   EVT DstVT = N->getValueType(0);
4239   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
4240          "ExpandBITCAST called for non-i64 type");
4241
4242   // Turn i64->f64 into VMOVDRR.
4243   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
4244     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4245                              DAG.getConstant(0, dl, MVT::i32));
4246     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4247                              DAG.getConstant(1, dl, MVT::i32));
4248     return DAG.getNode(ISD::BITCAST, dl, DstVT,
4249                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
4250   }
4251
4252   // Turn f64->i64 into VMOVRRD.
4253   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
4254     SDValue Cvt;
4255     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
4256         SrcVT.getVectorNumElements() > 1)
4257       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4258                         DAG.getVTList(MVT::i32, MVT::i32),
4259                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
4260     else
4261       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4262                         DAG.getVTList(MVT::i32, MVT::i32), Op);
4263     // Merge the pieces into a single i64 value.
4264     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
4265   }
4266
4267   return SDValue();
4268 }
4269
4270 /// getZeroVector - Returns a vector of specified type with all zero elements.
4271 /// Zero vectors are used to represent vector negation and in those cases
4272 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
4273 /// not support i64 elements, so sometimes the zero vectors will need to be
4274 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
4275 /// zero vector.
4276 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
4277   assert(VT.isVector() && "Expected a vector type");
4278   // The canonical modified immediate encoding of a zero vector is....0!
4279   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
4280   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
4281   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
4282   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4283 }
4284
4285 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
4286 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
4287 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
4288                                                 SelectionDAG &DAG) const {
4289   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4290   EVT VT = Op.getValueType();
4291   unsigned VTBits = VT.getSizeInBits();
4292   SDLoc dl(Op);
4293   SDValue ShOpLo = Op.getOperand(0);
4294   SDValue ShOpHi = Op.getOperand(1);
4295   SDValue ShAmt  = Op.getOperand(2);
4296   SDValue ARMcc;
4297   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
4298
4299   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
4300
4301   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4302                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4303   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
4304   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4305                                    DAG.getConstant(VTBits, dl, MVT::i32));
4306   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
4307   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4308   SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
4309
4310   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4311   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4312                           ISD::SETGE, ARMcc, DAG, dl);
4313   SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
4314   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
4315                            CCR, Cmp);
4316
4317   SDValue Ops[2] = { Lo, Hi };
4318   return DAG.getMergeValues(Ops, dl);
4319 }
4320
4321 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
4322 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
4323 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
4324                                                SelectionDAG &DAG) const {
4325   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4326   EVT VT = Op.getValueType();
4327   unsigned VTBits = VT.getSizeInBits();
4328   SDLoc dl(Op);
4329   SDValue ShOpLo = Op.getOperand(0);
4330   SDValue ShOpHi = Op.getOperand(1);
4331   SDValue ShAmt  = Op.getOperand(2);
4332   SDValue ARMcc;
4333
4334   assert(Op.getOpcode() == ISD::SHL_PARTS);
4335   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4336                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4337   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
4338   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4339                                    DAG.getConstant(VTBits, dl, MVT::i32));
4340   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
4341   SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
4342
4343   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4344   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4345   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4346                           ISD::SETGE, ARMcc, DAG, dl);
4347   SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4348   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
4349                            CCR, Cmp);
4350
4351   SDValue Ops[2] = { Lo, Hi };
4352   return DAG.getMergeValues(Ops, dl);
4353 }
4354
4355 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4356                                             SelectionDAG &DAG) const {
4357   // The rounding mode is in bits 23:22 of the FPSCR.
4358   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4359   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
4360   // so that the shift + and get folded into a bitfield extract.
4361   SDLoc dl(Op);
4362   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
4363                               DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
4364                                               MVT::i32));
4365   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
4366                                   DAG.getConstant(1U << 22, dl, MVT::i32));
4367   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4368                               DAG.getConstant(22, dl, MVT::i32));
4369   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4370                      DAG.getConstant(3, dl, MVT::i32));
4371 }
4372
4373 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
4374                          const ARMSubtarget *ST) {
4375   SDLoc dl(N);
4376   EVT VT = N->getValueType(0);
4377   if (VT.isVector()) {
4378     assert(ST->hasNEON());
4379
4380     // Compute the least significant set bit: LSB = X & -X
4381     SDValue X = N->getOperand(0);
4382     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
4383     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
4384
4385     EVT ElemTy = VT.getVectorElementType();
4386
4387     if (ElemTy == MVT::i8) {
4388       // Compute with: cttz(x) = ctpop(lsb - 1)
4389       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4390                                 DAG.getTargetConstant(1, dl, ElemTy));
4391       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
4392       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
4393     }
4394
4395     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
4396         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
4397       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
4398       unsigned NumBits = ElemTy.getSizeInBits();
4399       SDValue WidthMinus1 =
4400           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4401                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
4402       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
4403       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
4404     }
4405
4406     // Compute with: cttz(x) = ctpop(lsb - 1)
4407
4408     // Since we can only compute the number of bits in a byte with vcnt.8, we
4409     // have to gather the result with pairwise addition (vpaddl) for i16, i32,
4410     // and i64.
4411
4412     // Compute LSB - 1.
4413     SDValue Bits;
4414     if (ElemTy == MVT::i64) {
4415       // Load constant 0xffff'ffff'ffff'ffff to register.
4416       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4417                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
4418       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
4419     } else {
4420       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4421                                 DAG.getTargetConstant(1, dl, ElemTy));
4422       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
4423     }
4424
4425     // Count #bits with vcnt.8.
4426     EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
4427     SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
4428     SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
4429
4430     // Gather the #bits with vpaddl (pairwise add.)
4431     EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
4432     SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
4433         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
4434         Cnt8);
4435     if (ElemTy == MVT::i16)
4436       return Cnt16;
4437
4438     EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
4439     SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
4440         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
4441         Cnt16);
4442     if (ElemTy == MVT::i32)
4443       return Cnt32;
4444
4445     assert(ElemTy == MVT::i64);
4446     SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4447         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
4448         Cnt32);
4449     return Cnt64;
4450   }
4451
4452   if (!ST->hasV6T2Ops())
4453     return SDValue();
4454
4455   SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
4456   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
4457 }
4458
4459 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
4460 /// for each 16-bit element from operand, repeated.  The basic idea is to
4461 /// leverage vcnt to get the 8-bit counts, gather and add the results.
4462 ///
4463 /// Trace for v4i16:
4464 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
4465 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
4466 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
4467 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
4468 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
4469 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
4470 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
4471 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
4472 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
4473   EVT VT = N->getValueType(0);
4474   SDLoc DL(N);
4475
4476   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
4477   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
4478   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
4479   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
4480   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
4481   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
4482 }
4483
4484 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
4485 /// bit-count for each 16-bit element from the operand.  We need slightly
4486 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
4487 /// 64/128-bit registers.
4488 ///
4489 /// Trace for v4i16:
4490 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
4491 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
4492 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
4493 /// v4i16:Extracted = [k0    k1    k2    k3    ]
4494 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
4495   EVT VT = N->getValueType(0);
4496   SDLoc DL(N);
4497
4498   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
4499   if (VT.is64BitVector()) {
4500     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
4501     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
4502                        DAG.getIntPtrConstant(0, DL));
4503   } else {
4504     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
4505                                     BitCounts, DAG.getIntPtrConstant(0, DL));
4506     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
4507   }
4508 }
4509
4510 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
4511 /// bit-count for each 32-bit element from the operand.  The idea here is
4512 /// to split the vector into 16-bit elements, leverage the 16-bit count
4513 /// routine, and then combine the results.
4514 ///
4515 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
4516 /// input    = [v0    v1    ] (vi: 32-bit elements)
4517 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
4518 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
4519 /// vrev: N0 = [k1 k0 k3 k2 ]
4520 ///            [k0 k1 k2 k3 ]
4521 ///       N1 =+[k1 k0 k3 k2 ]
4522 ///            [k0 k2 k1 k3 ]
4523 ///       N2 =+[k1 k3 k0 k2 ]
4524 ///            [k0    k2    k1    k3    ]
4525 /// Extended =+[k1    k3    k0    k2    ]
4526 ///            [k0    k2    ]
4527 /// Extracted=+[k1    k3    ]
4528 ///
4529 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
4530   EVT VT = N->getValueType(0);
4531   SDLoc DL(N);
4532
4533   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
4534
4535   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
4536   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
4537   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
4538   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
4539   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
4540
4541   if (VT.is64BitVector()) {
4542     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
4543     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
4544                        DAG.getIntPtrConstant(0, DL));
4545   } else {
4546     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
4547                                     DAG.getIntPtrConstant(0, DL));
4548     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
4549   }
4550 }
4551
4552 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
4553                           const ARMSubtarget *ST) {
4554   EVT VT = N->getValueType(0);
4555
4556   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
4557   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
4558           VT == MVT::v4i16 || VT == MVT::v8i16) &&
4559          "Unexpected type for custom ctpop lowering");
4560
4561   if (VT.getVectorElementType() == MVT::i32)
4562     return lowerCTPOP32BitElements(N, DAG);
4563   else
4564     return lowerCTPOP16BitElements(N, DAG);
4565 }
4566
4567 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
4568                           const ARMSubtarget *ST) {
4569   EVT VT = N->getValueType(0);
4570   SDLoc dl(N);
4571
4572   if (!VT.isVector())
4573     return SDValue();
4574
4575   // Lower vector shifts on NEON to use VSHL.
4576   assert(ST->hasNEON() && "unexpected vector shift");
4577
4578   // Left shifts translate directly to the vshiftu intrinsic.
4579   if (N->getOpcode() == ISD::SHL)
4580     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4581                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
4582                                        MVT::i32),
4583                        N->getOperand(0), N->getOperand(1));
4584
4585   assert((N->getOpcode() == ISD::SRA ||
4586           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
4587
4588   // NEON uses the same intrinsics for both left and right shifts.  For
4589   // right shifts, the shift amounts are negative, so negate the vector of
4590   // shift amounts.
4591   EVT ShiftVT = N->getOperand(1).getValueType();
4592   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
4593                                      getZeroVector(ShiftVT, DAG, dl),
4594                                      N->getOperand(1));
4595   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
4596                              Intrinsic::arm_neon_vshifts :
4597                              Intrinsic::arm_neon_vshiftu);
4598   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4599                      DAG.getConstant(vshiftInt, dl, MVT::i32),
4600                      N->getOperand(0), NegatedCount);
4601 }
4602
4603 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
4604                                 const ARMSubtarget *ST) {
4605   EVT VT = N->getValueType(0);
4606   SDLoc dl(N);
4607
4608   // We can get here for a node like i32 = ISD::SHL i32, i64
4609   if (VT != MVT::i64)
4610     return SDValue();
4611
4612   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
4613          "Unknown shift to lower!");
4614
4615   // We only lower SRA, SRL of 1 here, all others use generic lowering.
4616   if (!isa<ConstantSDNode>(N->getOperand(1)) ||
4617       cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
4618     return SDValue();
4619
4620   // If we are in thumb mode, we don't have RRX.
4621   if (ST->isThumb1Only()) return SDValue();
4622
4623   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
4624   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
4625                            DAG.getConstant(0, dl, MVT::i32));
4626   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
4627                            DAG.getConstant(1, dl, MVT::i32));
4628
4629   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
4630   // captures the result into a carry flag.
4631   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
4632   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
4633
4634   // The low part is an ARMISD::RRX operand, which shifts the carry in.
4635   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
4636
4637   // Merge the pieces into a single i64 value.
4638  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
4639 }
4640
4641 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
4642   SDValue TmpOp0, TmpOp1;
4643   bool Invert = false;
4644   bool Swap = false;
4645   unsigned Opc = 0;
4646
4647   SDValue Op0 = Op.getOperand(0);
4648   SDValue Op1 = Op.getOperand(1);
4649   SDValue CC = Op.getOperand(2);
4650   EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
4651   EVT VT = Op.getValueType();
4652   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
4653   SDLoc dl(Op);
4654
4655   if (Op1.getValueType().isFloatingPoint()) {
4656     switch (SetCCOpcode) {
4657     default: llvm_unreachable("Illegal FP comparison");
4658     case ISD::SETUNE:
4659     case ISD::SETNE:  Invert = true; // Fallthrough
4660     case ISD::SETOEQ:
4661     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
4662     case ISD::SETOLT:
4663     case ISD::SETLT: Swap = true; // Fallthrough
4664     case ISD::SETOGT:
4665     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
4666     case ISD::SETOLE:
4667     case ISD::SETLE:  Swap = true; // Fallthrough
4668     case ISD::SETOGE:
4669     case ISD::SETGE: Opc = ARMISD::VCGE; break;
4670     case ISD::SETUGE: Swap = true; // Fallthrough
4671     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
4672     case ISD::SETUGT: Swap = true; // Fallthrough
4673     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
4674     case ISD::SETUEQ: Invert = true; // Fallthrough
4675     case ISD::SETONE:
4676       // Expand this to (OLT | OGT).
4677       TmpOp0 = Op0;
4678       TmpOp1 = Op1;
4679       Opc = ISD::OR;
4680       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
4681       Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
4682       break;
4683     case ISD::SETUO: Invert = true; // Fallthrough
4684     case ISD::SETO:
4685       // Expand this to (OLT | OGE).
4686       TmpOp0 = Op0;
4687       TmpOp1 = Op1;
4688       Opc = ISD::OR;
4689       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
4690       Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
4691       break;
4692     }
4693   } else {
4694     // Integer comparisons.
4695     switch (SetCCOpcode) {
4696     default: llvm_unreachable("Illegal integer comparison");
4697     case ISD::SETNE:  Invert = true;
4698     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
4699     case ISD::SETLT:  Swap = true;
4700     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
4701     case ISD::SETLE:  Swap = true;
4702     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
4703     case ISD::SETULT: Swap = true;
4704     case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
4705     case ISD::SETULE: Swap = true;
4706     case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
4707     }
4708
4709     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
4710     if (Opc == ARMISD::VCEQ) {
4711
4712       SDValue AndOp;
4713       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
4714         AndOp = Op0;
4715       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
4716         AndOp = Op1;
4717
4718       // Ignore bitconvert.
4719       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
4720         AndOp = AndOp.getOperand(0);
4721
4722       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
4723         Opc = ARMISD::VTST;
4724         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
4725         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
4726         Invert = !Invert;
4727       }
4728     }
4729   }
4730
4731   if (Swap)
4732     std::swap(Op0, Op1);
4733
4734   // If one of the operands is a constant vector zero, attempt to fold the
4735   // comparison to a specialized compare-against-zero form.
4736   SDValue SingleOp;
4737   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
4738     SingleOp = Op0;
4739   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
4740     if (Opc == ARMISD::VCGE)
4741       Opc = ARMISD::VCLEZ;
4742     else if (Opc == ARMISD::VCGT)
4743       Opc = ARMISD::VCLTZ;
4744     SingleOp = Op1;
4745   }
4746
4747   SDValue Result;
4748   if (SingleOp.getNode()) {
4749     switch (Opc) {
4750     case ARMISD::VCEQ:
4751       Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
4752     case ARMISD::VCGE:
4753       Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
4754     case ARMISD::VCLEZ:
4755       Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
4756     case ARMISD::VCGT:
4757       Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
4758     case ARMISD::VCLTZ:
4759       Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
4760     default:
4761       Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
4762     }
4763   } else {
4764      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
4765   }
4766
4767   Result = DAG.getSExtOrTrunc(Result, dl, VT);
4768
4769   if (Invert)
4770     Result = DAG.getNOT(dl, Result, VT);
4771
4772   return Result;
4773 }
4774
4775 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
4776 /// valid vector constant for a NEON instruction with a "modified immediate"
4777 /// operand (e.g., VMOV).  If so, return the encoded value.
     ///
     /// \p SplatBits is the splat value and \p SplatBitSize its width in bits (8,
     /// 16, 32 or 64; anything else hits llvm_unreachable).  \p SplatUndef marks
     /// bits that may be treated as ones when matching the 0xff-filled 32-bit and
     /// 64-bit patterns.  \p type selects the instruction family: the 8-bit and
     /// 64-bit forms are only accepted for VMOVModImm, and the 0x..ff forms are
     /// rejected for OtherModImm.  \p VT is overwritten with the integer vector
     /// type for the chosen element size, 128 bits wide if \p is128Bits and 64
     /// bits otherwise; note that the 16- and 32-bit cases write \p VT before
     /// they know whether an encoding exists.
     ///
     /// \return an i32 target constant built from ARM_AM::createNEONModImm, or an
     /// empty SDValue if no encoding matches.
4778 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
4779                                  unsigned SplatBitSize, SelectionDAG &DAG,
4780                                  SDLoc dl, EVT &VT, bool is128Bits,
4781                                  NEONModImmType type) {
4782   unsigned OpCmode, Imm;
4783
4784   // SplatBitSize is set to the smallest size that splats the vector, so a
4785   // zero vector will always have SplatBitSize == 8.  However, NEON modified
4786   // immediate instructions other than VMOV do not support the 8-bit encoding
4787   // of a zero vector, and the default encoding of zero is supposed to be the
4788   // 32-bit version.
4789   if (SplatBits == 0)
4790     SplatBitSize = 32;
4791
4792   switch (SplatBitSize) {
4793   case 8:
4794     if (type != VMOVModImm)
4795       return SDValue();
4796     // Any 1-byte value is OK.  Op=0, Cmode=1110.
4797     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
4798     OpCmode = 0xe;
4799     Imm = SplatBits;
4800     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
4801     break;
4802
4803   case 16:
4804     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
4805     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
4806     if ((SplatBits & ~0xff) == 0) {
4807       // Value = 0x00nn: Op=x, Cmode=100x.
4808       OpCmode = 0x8;
4809       Imm = SplatBits;
4810       break;
4811     }
4812     if ((SplatBits & ~0xff00) == 0) {
4813       // Value = 0xnn00: Op=x, Cmode=101x.
4814       OpCmode = 0xa;
4815       Imm = SplatBits >> 8;
4816       break;
4817     }
4818     return SDValue();
4819
4820   case 32:
4821     // NEON's 32-bit VMOV supports splat values where:
4822     // * only one byte is nonzero, or
4823     // * the least significant byte is 0xff and the second byte is nonzero, or
4824     // * the least significant 2 bytes are 0xff and the third is nonzero.
4825     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
4826     if ((SplatBits & ~0xff) == 0) {
4827       // Value = 0x000000nn: Op=x, Cmode=000x.
4828       OpCmode = 0;
4829       Imm = SplatBits;
4830       break;
4831     }
4832     if ((SplatBits & ~0xff00) == 0) {
4833       // Value = 0x0000nn00: Op=x, Cmode=001x.
4834       OpCmode = 0x2;
4835       Imm = SplatBits >> 8;
4836       break;
4837     }
4838     if ((SplatBits & ~0xff0000) == 0) {
4839       // Value = 0x00nn0000: Op=x, Cmode=010x.
4840       OpCmode = 0x4;
4841       Imm = SplatBits >> 16;
4842       break;
4843     }
       // NOTE(review): 0xff000000 does not fit in a 32-bit int, so the literal is
       // unsigned and its complement zero-extends to 64 bits.  Unlike the masks
       // above, this one therefore does not test bits 32..63 of SplatBits;
       // presumably they are always zero for a 32-bit splat -- confirm.
4844     if ((SplatBits & ~0xff000000) == 0) {
4845       // Value = 0xnn000000: Op=x, Cmode=011x.
4846       OpCmode = 0x6;
4847       Imm = SplatBits >> 24;
4848       break;
4849     }
4850
4851     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
4852     if (type == OtherModImm) return SDValue();
4853
       // In the two checks below, undefined low bits count as ones.
4854     if ((SplatBits & ~0xffff) == 0 &&
4855         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
4856       // Value = 0x0000nnff: Op=x, Cmode=1100.
4857       OpCmode = 0xc;
4858       Imm = SplatBits >> 8;
4859       break;
4860     }
4861
4862     if ((SplatBits & ~0xffffff) == 0 &&
4863         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
4864       // Value = 0x00nnffff: Op=x, Cmode=1101.
4865       OpCmode = 0xd;
4866       Imm = SplatBits >> 16;
4867       break;
4868     }
4869
4870     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
4871     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
4872     // VMOV.I32.  A (very) minor optimization would be to replicate the value
4873     // and fall through here to test for a valid 64-bit splat.  But, then the
4874     // caller would also need to check and handle the change in size.
4875     return SDValue();
4876
4877   case 64: {
4878     if (type != VMOVModImm)
4879       return SDValue();
4880     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
       // Imm collects one bit per byte (bit N set when byte N is all ones, with
       // undefined bits counted as ones).  Val accumulates the matched bytes but
       // is not read again in this function.
4881     uint64_t BitMask = 0xff;
4882     uint64_t Val = 0;
4883     unsigned ImmMask = 1;
4884     Imm = 0;
4885     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
4886       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
4887         Val |= BitMask;
4888         Imm |= ImmMask;
4889       } else if ((SplatBits & BitMask) != 0) {
           // A byte that is neither all ones nor all zeros cannot be encoded.
4890         return SDValue();
4891       }
4892       BitMask <<= 8;
4893       ImmMask <<= 1;
4894     }
4895
4896     if (DAG.getDataLayout().isBigEndian())
4897       // swap higher and lower 32 bit word
         // (i.e. exchange the two 4-bit halves of the per-byte mask in Imm).
4898       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
4899
4900     // Op=1, Cmode=1110.
4901     OpCmode = 0x1e;
4902     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
4903     break;
4904   }
4905
4906   default:
4907     llvm_unreachable("unexpected size for isNEONModifiedImm");
4908   }
4909
     // Pack the chosen Op/Cmode and 8-bit immediate into a single i32 constant.
4910   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
4911   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
4912 }
4913
     /// Try to materialize a scalar floating-point constant from an immediate
     /// instead of the default (constant pool) lowering.
     ///
     /// Returns an empty SDValue (i.e. declines) when \p ST lacks VFP3, for an f64
     /// constant on an SP-only FPU, when NEON may not be used for this type, or
     /// when none of the immediate forms below fit.  Otherwise returns either
     /// \p Op itself or a VMOVFPIMM / VMOVIMM / VMVNIMM vector node that is
     /// bitcast (f64) or lane-extracted (f32) back to the scalar type.
4914 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
4915                                            const ARMSubtarget *ST) const {
4916   if (!ST->hasVFP3())
4917     return SDValue();
4918
4919   bool IsDouble = Op.getValueType() == MVT::f64;
4920   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
4921
4922   // Use the default (constant pool) lowering for double constants when we have
4923   // an SP-only FPU
     // NOTE(review): this check reads the Subtarget member while the rest of the
     // function uses the ST parameter -- presumably the same object; confirm.
4924   if (IsDouble && Subtarget->isFPOnlySP())
4925     return SDValue();
4926
4927   // Try splatting with a VMOV.f32...
     // ImmVal is the encoded FP immediate; -1 is treated as "not encodable".
4928   APFloat FPVal = CFP->getValueAPF();
4929   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
4930
4931   if (ImmVal != -1) {
4932     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
4933       // We have code in place to select a valid ConstantFP already, no need to
4934       // do any mangling.
4935       return Op;
4936     }
4937
4938     // It's a float and we are trying to use NEON operations where
4939     // possible. Lower it to a splat followed by an extract.
4940     SDLoc DL(Op);
4941     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
4942     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
4943                                       NewVal);
4944     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
4945                        DAG.getConstant(0, DL, MVT::i32));
4946   }
4947
4948   // The rest of our options are NEON only, make sure that's allowed before
4949   // proceeding.
4950   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
4951     return SDValue();
4952
4953   EVT VMovVT;
4954   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
4955
4956   // It wouldn't really be worth bothering for doubles except for one very
4957   // important value, which does happen to match: 0.0. So make sure we don't do
4958   // anything stupid.
     // Concretely: a double is only handled when its low and high 32-bit halves
     // are identical, so that a 32-bit splat reproduces all 64 bits.
4959   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
4960     return SDValue();
4961
4962   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
4963   SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
4964                                      VMovVT, false, VMOVModImm);
4965   if (NewVal != SDValue()) {
4966     SDLoc DL(Op);
4967     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
4968                                       NewVal);
4969     if (IsDouble)
4970       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
4971
4972     // It's a float: cast and extract a vector element.
4973     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
4974                                        VecConstant);
4975     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
4976                        DAG.getConstant(0, DL, MVT::i32));
4977   }
4978
4979   // Finally, try a VMVN.i32
     // (same as above, but encoding the bitwise complement of the low 32 bits).
4980   NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
4981                              false, VMVNModImm);
4982   if (NewVal != SDValue()) {
4983     SDLoc DL(Op);
4984     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
4985
4986     if (IsDouble)
4987       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
4988
4989     // It's a float: cast and extract a vector element.
4990     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
4991                                        VecConstant);
4992     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
4993                        DAG.getConstant(0, DL, MVT::i32));
4994   }
4995
4996   return SDValue();
4997 }
4998
4999 // Check if a VEXT instruction can handle the shuffle mask when the
5000 // vector sources of the shuffle are the same.
     // On success Imm holds the index of the first element (M[0]).  Imm is also
     // written when a later index mismatches and the function returns false; it
     // is left untouched only when M[0] itself is undef.
5001 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5002   unsigned NumElts = VT.getVectorNumElements();
5003
5004   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5005   if (M[0] < 0)
5006     return false;
5007
5008   Imm = M[0];
5009
5010   // If this is a VEXT shuffle, the immediate value is the index of the first
5011   // element.  The other shuffle indices must be the successive elements after
5012   // the first one.
5013   unsigned ExpectedElt = Imm;
5014   for (unsigned i = 1; i < NumElts; ++i) {
5015     // Increment the expected index.  If it wraps around, just follow it
5016     // back to index zero and keep going.
5017     ++ExpectedElt;
5018     if (ExpectedElt == NumElts)
5019       ExpectedElt = 0;
5020
5021     if (M[i] < 0) continue; // ignore UNDEF indices
5022     if (ExpectedElt != static_cast<unsigned>(M[i]))
5023       return false;
5024   }
5025
5026   return true;
5027 }
5028
5029
     // Check whether the shuffle mask M selects consecutive elements, counting
     // up from M[0] through indices 0 .. 2 * NumElts - 1, so that it can be done
     // with a VEXT.  If the expected index reaches 2 * NumElts it wraps to zero
     // and ReverseVEXT is set; in that case Imm (initially M[0]) is reduced by
     // NumElts before returning true.  Undef (negative) indices after the first
     // are ignored.  ReverseVEXT and Imm may have been written even when the
     // function returns false.
5030 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
5031                        bool &ReverseVEXT, unsigned &Imm) {
5032   unsigned NumElts = VT.getVectorNumElements();
5033   ReverseVEXT = false;
5034
5035   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5036   if (M[0] < 0)
5037     return false;
5038
5039   Imm = M[0];
5040
5041   // If this is a VEXT shuffle, the immediate value is the index of the first
5042   // element.  The other shuffle indices must be the successive elements after
5043   // the first one.
5044   unsigned ExpectedElt = Imm;
5045   for (unsigned i = 1; i < NumElts; ++i) {
5046     // Increment the expected index.  If it wraps around, it may still be
5047     // a VEXT but the source vectors must be swapped.
5048     ExpectedElt += 1;
5049     if (ExpectedElt == NumElts * 2) {
5050       ExpectedElt = 0;
5051       ReverseVEXT = true;
5052     }
5053
5054     if (M[i] < 0) continue; // ignore UNDEF indices
5055     if (ExpectedElt != static_cast<unsigned>(M[i]))
5056       return false;
5057   }
5058
5059   // Adjust the index value if the source operands will be swapped.
5060   if (ReverseVEXT)
5061     Imm -= NumElts;
5062
5063   return true;
5064 }
5065
5066 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
5067 /// instruction with the specified blocksize.  (The order of the elements
5068 /// within each block of the vector is reversed.)
     /// \p BlockSize is given in bits and must be 16, 32 or 64 (asserted).
     /// Vectors with 64-bit elements are always rejected.
5069 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5070   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
5071          "Only possible block sizes for VREV are: 16, 32, 64");
5072
5073   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5074   if (EltSz == 64)
5075     return false;
5076
5077   unsigned NumElts = VT.getVectorNumElements();
     // In a reversed block the first lane holds the block's last element, so a
     // defined M[0] gives the number of elements per block directly.
5078   unsigned BlockElts = M[0] + 1;
5079   // If the first shuffle index is UNDEF, be optimistic.
5080   if (M[0] < 0)
5081     BlockElts = BlockSize / EltSz;
5082
     // The block must be wider than one element and must match the requested
     // size exactly.
5083   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5084     return false;
5085
5086   for (unsigned i = 0; i < NumElts; ++i) {
5087     if (M[i] < 0) continue; // ignore UNDEF indices
       // Expected source index: start of i's block plus the mirrored offset
       // within that block.
5088     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
5089       return false;
5090   }
5091
5092   return true;
5093 }
5094
5095 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
5096   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
5097   // range, then 0 is placed into the resulting vector. So pretty much any mask
5098   // of 8 elements can work here.
     // The mask contents are therefore not inspected: only the vector type and
     // the mask length are checked.
5099   return VT == MVT::v8i8 && M.size() == 8;
5100 }
5101
5102 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
5103 // checking that pairs of elements in the shuffle mask represent the same index
5104 // in each vector, incrementing the expected index by 2 at each step.
5105 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
5106 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
5107 //  v2={e,f,g,h}
5108 // WhichResult gives the offset for each element in the mask based on which
5109 // of the two results it belongs to.
5110 //
5111 // The transpose can be represented either as:
5112 // result1 = shufflevector v1, v2, result1_shuffle_mask
5113 // result2 = shufflevector v1, v2, result2_shuffle_mask
5114 // where v1/v2 and the shuffle masks have the same number of elements
5115 // (here WhichResult (see below) indicates which result is being checked)
5116 //
5117 // or as:
5118 // results = shufflevector v1, v2, shuffle_mask
5119 // where both results are returned in one vector and the shuffle mask has twice
5120 // as many elements as v1/v2 (here WhichResult will always be 0 if true).  In
5121 // that case we check the low half and the high half of the shuffle mask as if
5122 // each were a mask of the first kind.
     //
     // Vectors with 64-bit elements are always rejected.  WhichResult may have
     // been written even when the function returns false.
5123 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5124   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5125   if (EltSz == 64)
5126     return false;
5127
5128   unsigned NumElts = VT.getVectorNumElements();
5129   if (M.size() != NumElts && M.size() != NumElts*2)
5130     return false;
5131
5132   // If the mask is twice as long as the result then we need to check the upper
5133   // and lower parts of the mask
5134   for (unsigned i = 0; i < M.size(); i += NumElts) {
       // The first index of each half picks the result: 0 selects result 0, any
       // other value (including an undef, i.e. negative, index) selects result 1.
5135     WhichResult = M[i] == 0 ? 0 : 1;
       // Elements are examined in pairs; this reads M[i+j+1], so it presumably
       // relies on NumElts being even -- confirm against callers.
5136     for (unsigned j = 0; j < NumElts; j += 2) {
5137       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5138           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
5139         return false;
5140     }
5141   }
5142
5143   if (M.size() == NumElts*2)
5144     WhichResult = 0;
5145
5146   return true;
5147 }
5148
5149 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
5150 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5151 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
     /// The only difference from isVTRNMask is that the second element of each
     /// pair is expected at the same index as the first, without the NumElts
     /// offset into a second source.
5152 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5153   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5154   if (EltSz == 64)
5155     return false;
5156
5157   unsigned NumElts = VT.getVectorNumElements();
5158   if (M.size() != NumElts && M.size() != NumElts*2)
5159     return false;
5160
     // A double-length mask is checked one NumElts-sized half at a time.
5161   for (unsigned i = 0; i < M.size(); i += NumElts) {
       // Any first index other than 0 (including undef) selects result 1.
5162     WhichResult = M[i] == 0 ? 0 : 1;
5163     for (unsigned j = 0; j < NumElts; j += 2) {
5164       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5165           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
5166         return false;
5167     }
5168   }
5169
     // A double-length mask carries both results, so report result 0.
5170   if (M.size() == NumElts*2)
5171     WhichResult = 0;
5172
5173   return true;
5174 }
5175
5176 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
5177 // that the mask elements are either all even and in steps of size 2 or all odd
5178 // and in steps of size 2.
5179 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
5180 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
5181 //  v2={e,f,g,h}
5182 // Requires similar checks to that of isVTRNMask with
5183 // respect to how results are returned.
5184 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5185   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5186   if (EltSz == 64)
5187     return false;
5188
5189   unsigned NumElts = VT.getVectorNumElements();
5190   if (M.size() != NumElts && M.size() != NumElts*2)
5191     return false;
5192
     // A double-length mask is checked one NumElts-sized half at a time.
5193   for (unsigned i = 0; i < M.size(); i += NumElts) {
       // Any first index other than 0 (including undef) selects result 1, i.e.
       // the odd elements.
5194     WhichResult = M[i] == 0 ? 0 : 1;
5195     for (unsigned j = 0; j < NumElts; ++j) {
5196       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
5197         return false;
5198     }
5199   }
5200
5201   if (M.size() == NumElts*2)
5202     WhichResult = 0;
5203
5204   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
     // This rejection happens after the loop, so WhichResult has already been
     // written by the time false is returned here.
5205   if (VT.is64BitVector() && EltSz == 32)
5206     return false;
5207
5208   return true;
5209 }
5210
5211 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
5212 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5213 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
     /// Each half of the mask (NumElts / 2 entries) must independently walk the
     /// even or the odd indices of the single source.
5214 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5215   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5216   if (EltSz == 64)
5217     return false;
5218
5219   unsigned NumElts = VT.getVectorNumElements();
5220   if (M.size() != NumElts && M.size() != NumElts*2)
5221     return false;
5222
5223   unsigned Half = NumElts / 2;
5224   for (unsigned i = 0; i < M.size(); i += NumElts) {
       // Any first index other than 0 (including undef) selects result 1.
5225     WhichResult = M[i] == 0 ? 0 : 1;
5226     for (unsigned j = 0; j < NumElts; j += Half) {
         // Restart the expected index for each half: WhichResult, +2, +4, ...
5227       unsigned Idx = WhichResult;
5228       for (unsigned k = 0; k < Half; ++k) {
5229         int MIdx = M[i + j + k];
5230         if (MIdx >= 0 && (unsigned) MIdx != Idx)
5231           return false;
5232         Idx += 2;
5233       }
5234     }
5235   }
5236
5237   if (M.size() == NumElts*2)
5238     WhichResult = 0;
5239
5240   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5241   if (VT.is64BitVector() && EltSz == 32)
5242     return false;
5243
5244   return true;
5245 }
5246
5247 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
5248 // that pairs of elements of the shufflemask represent the same index in each
5249 // vector incrementing sequentially through the vectors.
5250 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
5251 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
5252 //  v2={e,f,g,h}
5253 // Requires similar checks to that of isVTRNMask with respect to how results
5254 // are returned.
5255 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5256   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5257   if (EltSz == 64)
5258     return false;
5259
5260   unsigned NumElts = VT.getVectorNumElements();
5261   if (M.size() != NumElts && M.size() != NumElts*2)
5262     return false;
5263
     // A double-length mask is checked one NumElts-sized half at a time.
5264   for (unsigned i = 0; i < M.size(); i += NumElts) {
       // Any first index other than 0 (including undef) selects result 1.
5265     WhichResult = M[i] == 0 ? 0 : 1;
       // Result 0 starts at source index 0, result 1 at NumElts / 2.
5266     unsigned Idx = WhichResult * NumElts / 2;
5267     for (unsigned j = 0; j < NumElts; j += 2) {
         // Each pair takes element Idx from the first source and element Idx
         // from the second source (mask index Idx + NumElts).
5268       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5269           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
5270         return false;
5271       Idx += 1;
5272     }
5273   }
5274
5275   if (M.size() == NumElts*2)
5276     WhichResult = 0;
5277
5278   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5279   if (VT.is64BitVector() && EltSz == 32)
5280     return false;
5281
5282   return true;
5283 }
5284
5285 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
5286 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5287 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
     /// The only difference from isVZIPMask is that both elements of each pair
     /// are expected at the same index, without the NumElts offset into a second
     /// source.
5288 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5289   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5290   if (EltSz == 64)
5291     return false;
5292
5293   unsigned NumElts = VT.getVectorNumElements();
5294   if (M.size() != NumElts && M.size() != NumElts*2)
5295     return false;
5296
     // A double-length mask is checked one NumElts-sized half at a time.
5297   for (unsigned i = 0; i < M.size(); i += NumElts) {
       // Any first index other than 0 (including undef) selects result 1.
5298     WhichResult = M[i] == 0 ? 0 : 1;
       // Result 0 starts at source index 0, result 1 at NumElts / 2.
5299     unsigned Idx = WhichResult * NumElts / 2;
5300     for (unsigned j = 0; j < NumElts; j += 2) {
5301       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5302           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
5303         return false;
5304       Idx += 1;
5305     }
5306   }
5307
5308   if (M.size() == NumElts*2)
5309     WhichResult = 0;
5310
5311   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5312   if (VT.is64BitVector() && EltSz == 32)
5313     return false;
5314
5315   return true;
5316 }
5317
5318 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
5319 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
     ///
     /// The two-source forms are tried first, in the order VTRN, VUZP, VZIP, and
     /// then the single-source ("v, undef") forms in the same order.
     /// \p isV_UNDEF is set to true when a single-source form matched; note that
     /// it is also left true when nothing matches and 0 is returned.
     /// \p WhichResult is written by the individual mask checkers.
5320 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
5321                                            unsigned &WhichResult,
5322                                            bool &isV_UNDEF) {
5323   isV_UNDEF = false;
5324   if (isVTRNMask(ShuffleMask, VT, WhichResult))
5325     return ARMISD::VTRN;
5326   if (isVUZPMask(ShuffleMask, VT, WhichResult))
5327     return ARMISD::VUZP;
5328   if (isVZIPMask(ShuffleMask, VT, WhichResult))
5329     return ARMISD::VZIP;
5330
5331   isV_UNDEF = true;
5332   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5333     return ARMISD::VTRN;
5334   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5335     return ARMISD::VUZP;
5336   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5337     return ARMISD::VZIP;
5338
5339   return 0;
5340 }
5341
5342 /// \return true if this is a reverse operation on a vector, i.e. every
     /// defined index of \p M at position i equals NumElts - 1 - i and \p M has
     /// exactly as many entries as \p VT has elements.
5343 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
5344   unsigned NumElts = VT.getVectorNumElements();
5345   // Make sure the mask has the right size.
5346   if (NumElts != M.size())
5347       return false;
5348
5349   // Look for <15, ..., 3, -1, 1, 0>.
     // (Negative entries are undef and match any position.)
5350   for (unsigned i = 0; i != NumElts; ++i)
5351     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
5352       return false;
5353
5354   return true;
5355 }
5356
5357 // If N is an integer constant that can be moved into a register in one
5358 // instruction, return an SDValue of such a constant (will become a MOV
5359 // instruction).  Otherwise return null.
     // For Thumb1-only subtargets the test is "Val or ~Val is at most 255"; for
     // everything else it is whether ARM_AM::getSOImmVal accepts Val or ~Val
     // (a result of -1 is treated as "not encodable").  The returned constant is
     // always created with type i32.
5360 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
5361                                      const ARMSubtarget *ST, SDLoc dl) {
5362   uint64_t Val;
5363   if (!isa<ConstantSDNode>(N))
5364     return SDValue();
5365   Val = cast<ConstantSDNode>(N)->getZExtValue();
5366
5367   if (ST->isThumb1Only()) {
       // NOTE(review): Val is a 64-bit value, so ~Val is complemented at 64
       // bits.  If getZExtValue() zero-extends a constant of 32 bits or fewer,
       // the upper bits of ~Val are all ones and `~Val <= 255` cannot hold.
       // Confirm whether a 32-bit complement was intended.
5368     if (Val <= 255 || ~Val <= 255)
5369       return DAG.getConstant(Val, dl, MVT::i32);
5370   } else {
5371     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
5372       return DAG.getConstant(Val, dl, MVT::i32);
5373   }
5374   return SDValue();
5375 }
5376
5377 // If this is a case we can't handle, return null and let the default
5378 // expansion code take care of it.
     //
     // Strategies are tried in this order:
     //  1. Constant splat of at most 64 bits: VMOVIMM, then VMVNIMM, then (for
     //     v2f32/v4f32 with a 32-bit splat) VMOVFPIMM.
     //  2. All operands undef: an UNDEF node.
     //  3. Only lane 0 defined and it is not a normal load: SCALAR_TO_VECTOR.
     //  4. A dominant value with elements of at most 32 bits: VDUP/VDUPLANE plus
     //     INSERT_VECTOR_ELT for the other lanes (non-constant), a retry as an
     //     i32 vector (floating-point constants), or VDUP of a single-instruction
     //     constant.
     //  5. All operands constant: return null (constant pool load).
     //  6. At least 4 elements: ReconstructShuffle.
     //  7. Elements of 32 bits or more: ARMISD::BUILD_VECTOR on FP-typed operands.
     //  8. Otherwise, if more than one value is used: an INSERT_VECTOR_ELT chain.
5379 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
5380                                              const ARMSubtarget *ST) const {
5381   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
5382   SDLoc dl(Op);
5383   EVT VT = Op.getValueType();
5384
5385   APInt SplatBits, SplatUndef;
5386   unsigned SplatBitSize;
5387   bool HasAnyUndefs;
5388   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
5389     if (SplatBitSize <= 64) {
5390       // Check if an immediate VMOV works.
5391       EVT VmovVT;
5392       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
5393                                       SplatUndef.getZExtValue(), SplatBitSize,
5394                                       DAG, dl, VmovVT, VT.is128BitVector(),
5395                                       VMOVModImm);
5396       if (Val.getNode()) {
5397         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
5398         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5399       }
5400
5401       // Try an immediate VMVN.
5402       uint64_t NegatedImm = (~SplatBits).getZExtValue();
5403       Val = isNEONModifiedImm(NegatedImm,
5404                                       SplatUndef.getZExtValue(), SplatBitSize,
5405                                       DAG, dl, VmovVT, VT.is128BitVector(),
5406                                       VMVNModImm);
5407       if (Val.getNode()) {
5408         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
5409         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5410       }
5411
5412       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
5413       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
5414         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
5415         if (ImmVal != -1) {
5416           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
5417           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
5418         }
5419       }
5420     }
5421   }
5422
5423   // Scan through the operands to see if only one value is used.
5424   //
5425   // As an optimisation, even if more than one value is used it may be more
5426   // profitable to splat with one value then change some lanes.
5427   //
5428   // Heuristically we decide to do this if the vector has a "dominant" value,
5429   // defined as splatted to more than half of the lanes.
5430   unsigned NumElts = VT.getVectorNumElements();
5431   bool isOnlyLowElement = true;
5432   bool usesOnlyOneValue = true;
5433   bool hasDominantValue = false;
5434   bool isConstant = true;
5435
5436   // Map of the number of times a particular SDValue appears in the
5437   // element list.
5438   DenseMap<SDValue, unsigned> ValueCounts;
5439   SDValue Value;
5440   for (unsigned i = 0; i < NumElts; ++i) {
5441     SDValue V = Op.getOperand(i);
       // Undef operands are skipped entirely: they affect none of the flags
       // and are not counted.
5442     if (V.getOpcode() == ISD::UNDEF)
5443       continue;
5444     if (i > 0)
5445       isOnlyLowElement = false;
5446     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
5447       isConstant = false;
5448
       // Make sure V has an entry (initially 0), then bump its count.
5449     ValueCounts.insert(std::make_pair(V, 0));
5450     unsigned &Count = ValueCounts[V];
5451
5452     // Is this value dominant? (takes up more than half of the lanes)
5453     if (++Count > (NumElts / 2)) {
5454       hasDominantValue = true;
5455       Value = V;
5456     }
5457   }
5458   if (ValueCounts.size() != 1)
5459     usesOnlyOneValue = false;
     // With no dominant value, fall back to whichever entry the map yields
     // first.
5460   if (!Value.getNode() && ValueCounts.size() > 0)
5461     Value = ValueCounts.begin()->first;
5462
     // Every operand was undef.
5463   if (ValueCounts.size() == 0)
5464     return DAG.getUNDEF(VT);
5465
5466   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
5467   // Keep going if we are hitting this case.
5468   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
5469     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
5470
5471   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5472
5473   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
5474   // i32 and try again.
5475   if (hasDominantValue && EltSize <= 32) {
5476     if (!isConstant) {
5477       SDValue N;
5478
5479       // If we are VDUPing a value that comes directly from a vector, that will
5480       // cause an unnecessary move to and from a GPR, where instead we could
5481       // just use VDUPLANE. We can only do this if the lane being extracted
5482       // is at a constant index, as the VDUP from lane instructions only have
5483       // constant-index forms.
5484       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5485           isa<ConstantSDNode>(Value->getOperand(1))) {
5486         // We need to create a new undef vector to use for the VDUPLANE if the
5487         // size of the vector from which we get the value is different than the
5488         // size of the vector that we need to create. We will insert the element
5489         // such that the register coalescer will remove unnecessary copies.
5490         if (VT != Value->getOperand(0).getValueType()) {
5491           ConstantSDNode *constIndex;
5492           constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
5493           assert(constIndex && "The index is not a constant!");
             // Reduce the source lane modulo the result's lane count so that it
             // is a valid lane of VT.
5494           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
5495                              VT.getVectorNumElements();
5496           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5497                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
5498                         Value, DAG.getConstant(index, dl, MVT::i32)),
5499                            DAG.getConstant(index, dl, MVT::i32));
5500         } else
5501           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5502                         Value->getOperand(0), Value->getOperand(1));
5503       } else
5504         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
5505
5506       if (!usesOnlyOneValue) {
5507         // The dominant value was splatted as 'N', but we now have to insert
5508         // all differing elements.
           // Note that undef operands are not filtered out here, so they are
           // inserted like any other lane that differs from Value.
5509         for (unsigned I = 0; I < NumElts; ++I) {
5510           if (Op.getOperand(I) == Value)
5511             continue;
5512           SmallVector<SDValue, 3> Ops;
5513           Ops.push_back(N);
5514           Ops.push_back(Op.getOperand(I));
5515           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
5516           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
5517         }
5518       }
5519       return N;
5520     }
       // All operands are constants from here on.  For a floating-point vector,
       // rebuild it as an i32 vector of the same length and recurse; if that
       // yields nothing, fall through to the checks below.
5521     if (VT.getVectorElementType().isFloatingPoint()) {
5522       SmallVector<SDValue, 8> Ops;
5523       for (unsigned i = 0; i < NumElts; ++i)
5524         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5525                                   Op.getOperand(i)));
5526       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
5527       SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
5528       Val = LowerBUILD_VECTOR(Val, DAG, ST);
5529       if (Val.getNode())
5530         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5531     }
       // A single constant that fits one move instruction can be VDUPed.
5532     if (usesOnlyOneValue) {
5533       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
5534       if (isConstant && Val.getNode())
5535         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
5536     }
5537   }
5538
5539   // If all elements are constants and the case above didn't get hit, fall back
5540   // to the default expansion, which will generate a load from the constant
5541   // pool.
5542   if (isConstant)
5543     return SDValue();
5544
5545   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
5546   if (NumElts >= 4) {
5547     SDValue shuffle = ReconstructShuffle(Op, DAG);
5548     if (shuffle != SDValue())
5549       return shuffle;
5550   }
5551
5552   // Vectors with 32- or 64-bit elements can be built by directly assigning
5553   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
5554   // will be legalized.
5555   if (EltSize >= 32) {
5556     // Do the expansion with floating-point types, since that is what the VFP
5557     // registers are defined to use, and since i64 is not legal.
5558     EVT EltVT = EVT::getFloatingPointVT(EltSize);
5559     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
5560     SmallVector<SDValue, 8> Ops;
5561     for (unsigned i = 0; i < NumElts; ++i)
5562       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
5563     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
5564     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5565   }
5566
5567   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
5568   // know the default expansion would otherwise fall back on something even
5569   // worse. For a vector with one or two non-undef values, that's
5570   // scalar_to_vector for the elements followed by a shuffle (provided the
5571   // shuffle is valid for the target) and materialization element by element
5572   // on the stack followed by a load for everything else.
     // (isConstant is already known to be false at this point, because the
     // all-constant case returned above.)
5573   if (!isConstant && !usesOnlyOneValue) {
5574     SDValue Vec = DAG.getUNDEF(VT);
5575     for (unsigned i = 0 ; i < NumElts; ++i) {
5576       SDValue V = Op.getOperand(i);
5577       if (V.getOpcode() == ISD::UNDEF)
5578         continue;
5579       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
5580       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
5581     }
5582     return Vec;
5583   }
5584
5585   return SDValue();
5586 }
5587
5588 /// getExtFactor - Determine the adjustment factor for the position when
5589 /// generating an "extract from vector registers" instruction.
5590 static unsigned getExtFactor(SDValue &V) {
5591   EVT EltType = V.getValueType().getVectorElementType();
5592   return EltType.getSizeInBits() / 8;
5593 }
5594
5595 // Gather data to see if the operation can be modelled as a
5596 // shuffle in combination with VEXTs.
5597 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
5598                                               SelectionDAG &DAG) const {
5599   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
5600   SDLoc dl(Op);
5601   EVT VT = Op.getValueType();
5602   unsigned NumElts = VT.getVectorNumElements();
5603
5604   struct ShuffleSourceInfo {
5605     SDValue Vec;
5606     unsigned MinElt;
5607     unsigned MaxElt;
5608
5609     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
5610     // be compatible with the shuffle we intend to construct. As a result
5611     // ShuffleVec will be some sliding window into the original Vec.
5612     SDValue ShuffleVec;
5613
5614     // Code should guarantee that element i in Vec starts at element "WindowBase
5615     // + i * WindowScale in ShuffleVec".
5616     int WindowBase;
5617     int WindowScale;
5618
5619     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
5620     ShuffleSourceInfo(SDValue Vec)
5621         : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
5622           WindowScale(1) {}
5623   };
5624
5625   // First gather all vectors used as an immediate source for this BUILD_VECTOR
5626   // node.
5627   SmallVector<ShuffleSourceInfo, 2> Sources;
5628   for (unsigned i = 0; i < NumElts; ++i) {
5629     SDValue V = Op.getOperand(i);
5630     if (V.getOpcode() == ISD::UNDEF)
5631       continue;
5632     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
5633       // A shuffle can only come from building a vector from various
5634       // elements of other vectors.
5635       return SDValue();
5636     }
5637
5638     // Add this element source to the list if it's not already there.
5639     SDValue SourceVec = V.getOperand(0);
5640     auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
5641     if (Source == Sources.end())
5642       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
5643
5644     // Update the minimum and maximum lane number seen.
5645     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
5646     Source->MinElt = std::min(Source->MinElt, EltNo);
5647     Source->MaxElt = std::max(Source->MaxElt, EltNo);
5648   }
5649
5650   // Currently only do something sane when at most two source vectors
5651   // are involved.
5652   if (Sources.size() > 2)
5653     return SDValue();
5654
5655   // Find out the smallest element size among result and two sources, and use
5656   // it as element size to build the shuffle_vector.
5657   EVT SmallestEltTy = VT.getVectorElementType();
5658   for (auto &Source : Sources) {
5659     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
5660     if (SrcEltTy.bitsLT(SmallestEltTy))
5661       SmallestEltTy = SrcEltTy;
5662   }
5663   unsigned ResMultiplier =
5664       VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
5665   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
5666   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
5667
5668   // If the source vector is too wide or too narrow, we may nevertheless be able
5669   // to construct a compatible shuffle either by concatenating it with UNDEF or
5670   // extracting a suitable range of elements.
5671   for (auto &Src : Sources) {
5672     EVT SrcVT = Src.ShuffleVec.getValueType();
5673
5674     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
5675       continue;
5676
5677     // This stage of the search produces a source with the same element type as
5678     // the original, but with a total width matching the BUILD_VECTOR output.
5679     EVT EltVT = SrcVT.getVectorElementType();
5680     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
5681     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
5682
5683     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
5684       if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
5685         return SDValue();
5686       // We can pad out the smaller vector for free, so if it's part of a
5687       // shuffle...
5688       Src.ShuffleVec =
5689           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
5690                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
5691       continue;
5692     }
5693
5694     if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
5695       return SDValue();
5696
5697     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
5698       // Span too large for a VEXT to cope
5699       return SDValue();
5700     }
5701
5702     if (Src.MinElt >= NumSrcElts) {
5703       // The extraction can just take the second half
5704       Src.ShuffleVec =
5705           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5706                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
5707       Src.WindowBase = -NumSrcElts;
5708     } else if (Src.MaxElt < NumSrcElts) {
5709       // The extraction can just take the first half
5710       Src.ShuffleVec =
5711           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5712                       DAG.getConstant(0, dl, MVT::i32));
5713     } else {
5714       // An actual VEXT is needed
5715       SDValue VEXTSrc1 =
5716           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5717                       DAG.getConstant(0, dl, MVT::i32));
5718       SDValue VEXTSrc2 =
5719           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5720                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
5721       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
5722
5723       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
5724                                    VEXTSrc2,
5725                                    DAG.getConstant(Imm, dl, MVT::i32));
5726       Src.WindowBase = -Src.MinElt;
5727     }
5728   }
5729
5730   // Another possible incompatibility occurs from the vector element types. We
5731   // can fix this by bitcasting the source vectors to the same type we intend
5732   // for the shuffle.
5733   for (auto &Src : Sources) {
5734     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
5735     if (SrcEltTy == SmallestEltTy)
5736       continue;
5737     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
5738     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
5739     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
5740     Src.WindowBase *= Src.WindowScale;
5741   }
5742
5743   // Final sanity check before we try to actually produce a shuffle.
5744   DEBUG(
5745     for (auto Src : Sources)
5746       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
5747   );
5748
5749   // The stars all align, our next step is to produce the mask for the shuffle.
5750   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
5751   int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
5752   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
5753     SDValue Entry = Op.getOperand(i);
5754     if (Entry.getOpcode() == ISD::UNDEF)
5755       continue;
5756
5757     auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
5758     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
5759
5760     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
5761     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
5762     // segment.
5763     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
5764     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
5765                                VT.getVectorElementType().getSizeInBits());
5766     int LanesDefined = BitsDefined / BitsPerShuffleLane;
5767
5768     // This source is expected to fill ResMultiplier lanes of the final shuffle,
5769     // starting at the appropriate offset.
5770     int *LaneMask = &Mask[i * ResMultiplier];
5771
5772     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
5773     ExtractBase += NumElts * (Src - Sources.begin());
5774     for (int j = 0; j < LanesDefined; ++j)
5775       LaneMask[j] = ExtractBase + j;
5776   }
5777
5778   // Final check before we try to produce nonsense...
5779   if (!isShuffleMaskLegal(Mask, ShuffleVT))
5780     return SDValue();
5781
5782   // We can't handle more than two sources. This should have already
5783   // been checked before this point.
5784   assert(Sources.size() <= 2 && "Too many sources!");
5785
5786   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
5787   for (unsigned i = 0; i < Sources.size(); ++i)
5788     ShuffleOps[i] = Sources[i].ShuffleVec;
5789
5790   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
5791                                          ShuffleOps[1], &Mask[0]);
5792   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
5793 }
5794
5795 /// isShuffleMaskLegal - Targets can use this to indicate that they only
5796 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
5797 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
5798 /// are assumed to be legal.
5799 bool
5800 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
5801                                       EVT VT) const {
5802   if (VT.getVectorNumElements() == 4 &&
5803       (VT.is128BitVector() || VT.is64BitVector())) {
5804     unsigned PFIndexes[4];
5805     for (unsigned i = 0; i != 4; ++i) {
5806       if (M[i] < 0)
5807         PFIndexes[i] = 8;
5808       else
5809         PFIndexes[i] = M[i];
5810     }
5811
5812     // Compute the index in the perfect shuffle table.
5813     unsigned PFTableIndex =
5814       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
5815     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
5816     unsigned Cost = (PFEntry >> 30);
5817
5818     if (Cost <= 4)
5819       return true;
5820   }
5821
5822   bool ReverseVEXT, isV_UNDEF;
5823   unsigned Imm, WhichResult;
5824
5825   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5826   return (EltSize >= 32 ||
5827           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
5828           isVREVMask(M, VT, 64) ||
5829           isVREVMask(M, VT, 32) ||
5830           isVREVMask(M, VT, 16) ||
5831           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
5832           isVTBLMask(M, VT) ||
5833           isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
5834           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
5835 }
5836
5837 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
5838 /// the specified operations to build the shuffle.
5839 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
5840                                       SDValue RHS, SelectionDAG &DAG,
5841                                       SDLoc dl) {
5842   unsigned OpNum = (PFEntry >> 26) & 0x0F;
5843   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
5844   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
5845
5846   enum {
5847     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
5848     OP_VREV,
5849     OP_VDUP0,
5850     OP_VDUP1,
5851     OP_VDUP2,
5852     OP_VDUP3,
5853     OP_VEXT1,
5854     OP_VEXT2,
5855     OP_VEXT3,
5856     OP_VUZPL, // VUZP, left result
5857     OP_VUZPR, // VUZP, right result
5858     OP_VZIPL, // VZIP, left result
5859     OP_VZIPR, // VZIP, right result
5860     OP_VTRNL, // VTRN, left result
5861     OP_VTRNR  // VTRN, right result
5862   };
5863
5864   if (OpNum == OP_COPY) {
5865     if (LHSID == (1*9+2)*9+3) return LHS;
5866     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
5867     return RHS;
5868   }
5869
5870   SDValue OpLHS, OpRHS;
5871   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
5872   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
5873   EVT VT = OpLHS.getValueType();
5874
5875   switch (OpNum) {
5876   default: llvm_unreachable("Unknown shuffle opcode!");
5877   case OP_VREV:
5878     // VREV divides the vector in half and swaps within the half.
5879     if (VT.getVectorElementType() == MVT::i32 ||
5880         VT.getVectorElementType() == MVT::f32)
5881       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
5882     // vrev <4 x i16> -> VREV32
5883     if (VT.getVectorElementType() == MVT::i16)
5884       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
5885     // vrev <4 x i8> -> VREV16
5886     assert(VT.getVectorElementType() == MVT::i8);
5887     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
5888   case OP_VDUP0:
5889   case OP_VDUP1:
5890   case OP_VDUP2:
5891   case OP_VDUP3:
5892     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5893                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
5894   case OP_VEXT1:
5895   case OP_VEXT2:
5896   case OP_VEXT3:
5897     return DAG.getNode(ARMISD::VEXT, dl, VT,
5898                        OpLHS, OpRHS,
5899                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
5900   case OP_VUZPL:
5901   case OP_VUZPR:
5902     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5903                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
5904   case OP_VZIPL:
5905   case OP_VZIPR:
5906     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5907                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
5908   case OP_VTRNL:
5909   case OP_VTRNR:
5910     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5911                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
5912   }
5913 }
5914
5915 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
5916                                        ArrayRef<int> ShuffleMask,
5917                                        SelectionDAG &DAG) {
5918   // Check to see if we can use the VTBL instruction.
5919   SDValue V1 = Op.getOperand(0);
5920   SDValue V2 = Op.getOperand(1);
5921   SDLoc DL(Op);
5922
5923   SmallVector<SDValue, 8> VTBLMask;
5924   for (ArrayRef<int>::iterator
5925          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
5926     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
5927
5928   if (V2.getNode()->getOpcode() == ISD::UNDEF)
5929     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
5930                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
5931
5932   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
5933                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
5934 }
5935
5936 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
5937                                                       SelectionDAG &DAG) {
5938   SDLoc DL(Op);
5939   SDValue OpLHS = Op.getOperand(0);
5940   EVT VT = OpLHS.getValueType();
5941
5942   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
5943          "Expect an v8i16/v16i8 type");
5944   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
5945   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
5946   // extract the first 8 bytes into the top double word and the last 8 bytes
5947   // into the bottom double word. The v8i16 case is similar.
5948   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
5949   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
5950                      DAG.getConstant(ExtractNum, DL, MVT::i32));
5951 }
5952
5953 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
5954   SDValue V1 = Op.getOperand(0);
5955   SDValue V2 = Op.getOperand(1);
5956   SDLoc dl(Op);
5957   EVT VT = Op.getValueType();
5958   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
5959
5960   // Convert shuffles that are directly supported on NEON to target-specific
5961   // DAG nodes, instead of keeping them as shuffles and matching them again
5962   // during code selection.  This is more efficient and avoids the possibility
5963   // of inconsistencies between legalization and selection.
5964   // FIXME: floating-point vectors should be canonicalized to integer vectors
5965   // of the same time so that they get CSEd properly.
5966   ArrayRef<int> ShuffleMask = SVN->getMask();
5967
5968   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5969   if (EltSize <= 32) {
5970     if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
5971       int Lane = SVN->getSplatIndex();
5972       // If this is undef splat, generate it via "just" vdup, if possible.
5973       if (Lane == -1) Lane = 0;
5974
5975       // Test if V1 is a SCALAR_TO_VECTOR.
5976       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5977         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
5978       }
5979       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
5980       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
5981       // reaches it).
5982       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
5983           !isa<ConstantSDNode>(V1.getOperand(0))) {
5984         bool IsScalarToVector = true;
5985         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
5986           if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
5987             IsScalarToVector = false;
5988             break;
5989           }
5990         if (IsScalarToVector)
5991           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
5992       }
5993       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
5994                          DAG.getConstant(Lane, dl, MVT::i32));
5995     }
5996
5997     bool ReverseVEXT;
5998     unsigned Imm;
5999     if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
6000       if (ReverseVEXT)
6001         std::swap(V1, V2);
6002       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
6003                          DAG.getConstant(Imm, dl, MVT::i32));
6004     }
6005
6006     if (isVREVMask(ShuffleMask, VT, 64))
6007       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
6008     if (isVREVMask(ShuffleMask, VT, 32))
6009       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
6010     if (isVREVMask(ShuffleMask, VT, 16))
6011       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
6012
6013     if (V2->getOpcode() == ISD::UNDEF &&
6014         isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
6015       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
6016                          DAG.getConstant(Imm, dl, MVT::i32));
6017     }
6018
6019     // Check for Neon shuffles that modify both input vectors in place.
6020     // If both results are used, i.e., if there are two shuffles with the same
6021     // source operands and with masks corresponding to both results of one of
6022     // these operations, DAG memoization will ensure that a single node is
6023     // used for both shuffles.
6024     unsigned WhichResult;
6025     bool isV_UNDEF;
6026     if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
6027             ShuffleMask, VT, WhichResult, isV_UNDEF)) {
6028       if (isV_UNDEF)
6029         V2 = V1;
6030       return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
6031           .getValue(WhichResult);
6032     }
6033
6034     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
6035     // shuffles that produce a result larger than their operands with:
6036     //   shuffle(concat(v1, undef), concat(v2, undef))
6037     // ->
6038     //   shuffle(concat(v1, v2), undef)
6039     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
6040     //
6041     // This is useful in the general case, but there are special cases where
6042     // native shuffles produce larger results: the two-result ops.
6043     //
6044     // Look through the concat when lowering them:
6045     //   shuffle(concat(v1, v2), undef)
6046     // ->
6047     //   concat(VZIP(v1, v2):0, :1)
6048     //
6049     if (V1->getOpcode() == ISD::CONCAT_VECTORS &&
6050         V2->getOpcode() == ISD::UNDEF) {
6051       SDValue SubV1 = V1->getOperand(0);
6052       SDValue SubV2 = V1->getOperand(1);
6053       EVT SubVT = SubV1.getValueType();
6054
6055       // We expect these to have been canonicalized to -1.
6056       assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) {
6057         return i < (int)VT.getVectorNumElements();
6058       }) && "Unexpected shuffle index into UNDEF operand!");
6059
6060       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
6061               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
6062         if (isV_UNDEF)
6063           SubV2 = SubV1;
6064         assert((WhichResult == 0) &&
6065                "In-place shuffle of concat can only have one result!");
6066         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
6067                                   SubV1, SubV2);
6068         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
6069                            Res.getValue(1));
6070       }
6071     }
6072   }
6073
6074   // If the shuffle is not directly supported and it has 4 elements, use
6075   // the PerfectShuffle-generated table to synthesize it from other shuffles.
6076   unsigned NumElts = VT.getVectorNumElements();
6077   if (NumElts == 4) {
6078     unsigned PFIndexes[4];
6079     for (unsigned i = 0; i != 4; ++i) {
6080       if (ShuffleMask[i] < 0)
6081         PFIndexes[i] = 8;
6082       else
6083         PFIndexes[i] = ShuffleMask[i];
6084     }
6085
6086     // Compute the index in the perfect shuffle table.
6087     unsigned PFTableIndex =
6088       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
6089     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6090     unsigned Cost = (PFEntry >> 30);
6091
6092     if (Cost <= 4)
6093       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
6094   }
6095
6096   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
6097   if (EltSize >= 32) {
6098     // Do the expansion with floating-point types, since that is what the VFP
6099     // registers are defined to use, and since i64 is not legal.
6100     EVT EltVT = EVT::getFloatingPointVT(EltSize);
6101     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
6102     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
6103     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
6104     SmallVector<SDValue, 8> Ops;
6105     for (unsigned i = 0; i < NumElts; ++i) {
6106       if (ShuffleMask[i] < 0)
6107         Ops.push_back(DAG.getUNDEF(EltVT));
6108       else
6109         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6110                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
6111                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
6112                                                   dl, MVT::i32)));
6113     }
6114     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
6115     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6116   }
6117
6118   if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
6119     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
6120
6121   if (VT == MVT::v8i8) {
6122     SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
6123     if (NewOp.getNode())
6124       return NewOp;
6125   }
6126
6127   return SDValue();
6128 }
6129
6130 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6131   // INSERT_VECTOR_ELT is legal only for immediate indexes.
6132   SDValue Lane = Op.getOperand(2);
6133   if (!isa<ConstantSDNode>(Lane))
6134     return SDValue();
6135
6136   return Op;
6137 }
6138
6139 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6140   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
6141   SDValue Lane = Op.getOperand(1);
6142   if (!isa<ConstantSDNode>(Lane))
6143     return SDValue();
6144
6145   SDValue Vec = Op.getOperand(0);
6146   if (Op.getValueType() == MVT::i32 &&
6147       Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
6148     SDLoc dl(Op);
6149     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
6150   }
6151
6152   return Op;
6153 }
6154
6155 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6156   // The only time a CONCAT_VECTORS operation can have legal types is when
6157   // two 64-bit vectors are concatenated to a 128-bit vector.
6158   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
6159          "unexpected CONCAT_VECTORS");
6160   SDLoc dl(Op);
6161   SDValue Val = DAG.getUNDEF(MVT::v2f64);
6162   SDValue Op0 = Op.getOperand(0);
6163   SDValue Op1 = Op.getOperand(1);
6164   if (Op0.getOpcode() != ISD::UNDEF)
6165     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6166                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
6167                       DAG.getIntPtrConstant(0, dl));
6168   if (Op1.getOpcode() != ISD::UNDEF)
6169     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6170                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
6171                       DAG.getIntPtrConstant(1, dl));
6172   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
6173 }
6174
6175 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
6176 /// element has been zero/sign-extended, depending on the isSigned parameter,
6177 /// from an integer type half its size.
6178 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
6179                                    bool isSigned) {
6180   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
6181   EVT VT = N->getValueType(0);
6182   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
6183     SDNode *BVN = N->getOperand(0).getNode();
6184     if (BVN->getValueType(0) != MVT::v4i32 ||
6185         BVN->getOpcode() != ISD::BUILD_VECTOR)
6186       return false;
6187     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6188     unsigned HiElt = 1 - LoElt;
6189     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
6190     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
6191     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
6192     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
6193     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
6194       return false;
6195     if (isSigned) {
6196       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
6197           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
6198         return true;
6199     } else {
6200       if (Hi0->isNullValue() && Hi1->isNullValue())
6201         return true;
6202     }
6203     return false;
6204   }
6205
6206   if (N->getOpcode() != ISD::BUILD_VECTOR)
6207     return false;
6208
6209   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
6210     SDNode *Elt = N->getOperand(i).getNode();
6211     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
6212       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
6213       unsigned HalfSize = EltSize / 2;
6214       if (isSigned) {
6215         if (!isIntN(HalfSize, C->getSExtValue()))
6216           return false;
6217       } else {
6218         if (!isUIntN(HalfSize, C->getZExtValue()))
6219           return false;
6220       }
6221       continue;
6222     }
6223     return false;
6224   }
6225
6226   return true;
6227 }
6228
6229 /// isSignExtended - Check if a node is a vector value that is sign-extended
6230 /// or a constant BUILD_VECTOR with sign-extended elements.
6231 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
6232   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
6233     return true;
6234   if (isExtendedBUILD_VECTOR(N, DAG, true))
6235     return true;
6236   return false;
6237 }
6238
6239 /// isZeroExtended - Check if a node is a vector value that is zero-extended
6240 /// or a constant BUILD_VECTOR with zero-extended elements.
6241 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
6242   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
6243     return true;
6244   if (isExtendedBUILD_VECTOR(N, DAG, false))
6245     return true;
6246   return false;
6247 }
6248
6249 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
6250   if (OrigVT.getSizeInBits() >= 64)
6251     return OrigVT;
6252
6253   assert(OrigVT.isSimple() && "Expecting a simple value type");
6254
6255   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
6256   switch (OrigSimpleTy) {
6257   default: llvm_unreachable("Unexpected Vector Type");
6258   case MVT::v2i8:
6259   case MVT::v2i16:
6260      return MVT::v2i32;
6261   case MVT::v4i8:
6262     return  MVT::v4i16;
6263   }
6264 }
6265
6266 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
6267 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
6268 /// We insert the required extension here to get the vector to fill a D register.
6269 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
6270                                             const EVT &OrigTy,
6271                                             const EVT &ExtTy,
6272                                             unsigned ExtOpcode) {
6273   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
6274   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
6275   // 64-bits we need to insert a new extension so that it will be 64-bits.
6276   assert(ExtTy.is128BitVector() && "Unexpected extension size");
6277   if (OrigTy.getSizeInBits() >= 64)
6278     return N;
6279
6280   // Must extend size to at least 64 bits to be used as an operand for VMULL.
6281   EVT NewVT = getExtensionTo64Bits(OrigTy);
6282
6283   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
6284 }
6285
6286 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
6287 /// does not do any sign/zero extension. If the original vector is less
6288 /// than 64 bits, an appropriate extension will be added after the load to
6289 /// reach a total size of 64 bits. We have to add the extension separately
6290 /// because ARM does not have a sign/zero extending load for vectors.
6291 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
6292   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
6293
6294   // The load already has the right type.
6295   if (ExtendedTy == LD->getMemoryVT())
6296     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
6297                 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
6298                 LD->isNonTemporal(), LD->isInvariant(),
6299                 LD->getAlignment());
6300
6301   // We need to create a zextload/sextload. We cannot just create a load
6302   // followed by a zext/zext node because LowerMUL is also run during normal
6303   // operation legalization where we can't create illegal types.
6304   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
6305                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
6306                         LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(),
6307                         LD->isNonTemporal(), LD->getAlignment());
6308 }
6309
6310 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
6311 /// extending load, or BUILD_VECTOR with extended elements, return the
6312 /// unextended value. The unextended vector should be 64 bits so that it can
6313 /// be used as an operand to a VMULL instruction. If the original vector size
6314 /// before extension is less than 64 bits we add a an extension to resize
6315 /// the vector to 64 bits.
6316 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
6317   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
6318     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
6319                                         N->getOperand(0)->getValueType(0),
6320                                         N->getValueType(0),
6321                                         N->getOpcode());
6322
6323   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
6324     return SkipLoadExtensionForVMULL(LD, DAG);
6325
6326   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
6327   // have been legalized as a BITCAST from v4i32.
6328   if (N->getOpcode() == ISD::BITCAST) {
6329     SDNode *BVN = N->getOperand(0).getNode();
6330     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
6331            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
6332     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6333     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
6334                        BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
6335   }
6336   // Construct a new BUILD_VECTOR with elements truncated to half the size.
6337   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
6338   EVT VT = N->getValueType(0);
6339   unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
6340   unsigned NumElts = VT.getVectorNumElements();
6341   MVT TruncVT = MVT::getIntegerVT(EltSize);
6342   SmallVector<SDValue, 8> Ops;
6343   SDLoc dl(N);
6344   for (unsigned i = 0; i != NumElts; ++i) {
6345     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
6346     const APInt &CInt = C->getAPIntValue();
6347     // Element types smaller than 32 bits are not legal, so use i32 elements.
6348     // The values are implicitly truncated so sext vs. zext doesn't matter.
6349     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
6350   }
6351   return DAG.getNode(ISD::BUILD_VECTOR, dl,
6352                      MVT::getVectorVT(TruncVT, NumElts), Ops);
6353 }
6354
6355 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
6356   unsigned Opcode = N->getOpcode();
6357   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6358     SDNode *N0 = N->getOperand(0).getNode();
6359     SDNode *N1 = N->getOperand(1).getNode();
6360     return N0->hasOneUse() && N1->hasOneUse() &&
6361       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
6362   }
6363   return false;
6364 }
6365
6366 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
6367   unsigned Opcode = N->getOpcode();
6368   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6369     SDNode *N0 = N->getOperand(0).getNode();
6370     SDNode *N1 = N->getOperand(1).getNode();
6371     return N0->hasOneUse() && N1->hasOneUse() &&
6372       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
6373   }
6374   return false;
6375 }
6376
6377 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
6378   // Multiplications are only custom-lowered for 128-bit vectors so that
6379   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
6380   EVT VT = Op.getValueType();
6381   assert(VT.is128BitVector() && VT.isInteger() &&
6382          "unexpected type for custom-lowering ISD::MUL");
6383   SDNode *N0 = Op.getOperand(0).getNode();
6384   SDNode *N1 = Op.getOperand(1).getNode();
6385   unsigned NewOpc = 0;
6386   bool isMLA = false;
6387   bool isN0SExt = isSignExtended(N0, DAG);
6388   bool isN1SExt = isSignExtended(N1, DAG);
6389   if (isN0SExt && isN1SExt)
6390     NewOpc = ARMISD::VMULLs;
6391   else {
6392     bool isN0ZExt = isZeroExtended(N0, DAG);
6393     bool isN1ZExt = isZeroExtended(N1, DAG);
6394     if (isN0ZExt && isN1ZExt)
6395       NewOpc = ARMISD::VMULLu;
6396     else if (isN1SExt || isN1ZExt) {
6397       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
6398       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
6399       if (isN1SExt && isAddSubSExt(N0, DAG)) {
6400         NewOpc = ARMISD::VMULLs;
6401         isMLA = true;
6402       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
6403         NewOpc = ARMISD::VMULLu;
6404         isMLA = true;
6405       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
6406         std::swap(N0, N1);
6407         NewOpc = ARMISD::VMULLu;
6408         isMLA = true;
6409       }
6410     }
6411
6412     if (!NewOpc) {
6413       if (VT == MVT::v2i64)
6414         // Fall through to expand this.  It is not legal.
6415         return SDValue();
6416       else
6417         // Other vector multiplications are legal.
6418         return Op;
6419     }
6420   }
6421
6422   // Legalize to a VMULL instruction.
6423   SDLoc DL(Op);
6424   SDValue Op0;
6425   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
6426   if (!isMLA) {
6427     Op0 = SkipExtensionForVMULL(N0, DAG);
6428     assert(Op0.getValueType().is64BitVector() &&
6429            Op1.getValueType().is64BitVector() &&
6430            "unexpected types for extended operands to VMULL");
6431     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
6432   }
6433
6434   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
6435   // isel lowering to take advantage of no-stall back to back vmul + vmla.
6436   //   vmull q0, d4, d6
6437   //   vmlal q0, d5, d6
6438   // is faster than
6439   //   vaddl q0, d4, d5
6440   //   vmovl q1, d6
6441   //   vmul  q0, q0, q1
6442   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
6443   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
6444   EVT Op1VT = Op1.getValueType();
6445   return DAG.getNode(N0->getOpcode(), DL, VT,
6446                      DAG.getNode(NewOpc, DL, VT,
6447                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
6448                      DAG.getNode(NewOpc, DL, VT,
6449                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
6450 }
6451
6452 static SDValue
6453 LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
6454   // Convert to float
6455   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
6456   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
6457   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
6458   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
6459   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
6460   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
6461   // Get reciprocal estimate.
6462   // float4 recip = vrecpeq_f32(yf);
6463   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
6464                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
6465                    Y);
6466   // Because char has a smaller range than uchar, we can actually get away
6467   // without any newton steps.  This requires that we use a weird bias
6468   // of 0xb000, however (again, this has been exhaustively tested).
6469   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
6470   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
6471   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
6472   Y = DAG.getConstant(0xb000, dl, MVT::i32);
6473   Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
6474   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
6475   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
6476   // Convert back to short.
6477   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
6478   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
6479   return X;
6480 }
6481
6482 static SDValue
6483 LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
6484   SDValue N2;
6485   // Convert to float.
6486   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
6487   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
6488   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
6489   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
6490   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
6491   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
6492
6493   // Use reciprocal estimate and one refinement step.
6494   // float4 recip = vrecpeq_f32(yf);
6495   // recip *= vrecpsq_f32(yf, recip);
6496   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
6497                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
6498                    N1);
6499   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
6500                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
6501                    N1, N2);
6502   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
6503   // Because short has a smaller range than ushort, we can actually get away
6504   // with only a single newton step.  This requires that we use a weird bias
6505   // of 89, however (again, this has been exhaustively tested).
6506   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
6507   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
6508   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
6509   N1 = DAG.getConstant(0x89, dl, MVT::i32);
6510   N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
6511   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
6512   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
6513   // Convert back to integer and return.
6514   // return vmovn_s32(vcvt_s32_f32(result));
6515   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
6516   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
6517   return N0;
6518 }
6519
6520 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
6521   EVT VT = Op.getValueType();
6522   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
6523          "unexpected type for custom-lowering ISD::SDIV");
6524
6525   SDLoc dl(Op);
6526   SDValue N0 = Op.getOperand(0);
6527   SDValue N1 = Op.getOperand(1);
6528   SDValue N2, N3;
6529
6530   if (VT == MVT::v8i8) {
6531     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
6532     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
6533
6534     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
6535                      DAG.getIntPtrConstant(4, dl));
6536     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
6537                      DAG.getIntPtrConstant(4, dl));
6538     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
6539                      DAG.getIntPtrConstant(0, dl));
6540     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
6541                      DAG.getIntPtrConstant(0, dl));
6542
6543     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
6544     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
6545
6546     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
6547     N0 = LowerCONCAT_VECTORS(N0, DAG);
6548
6549     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
6550     return N0;
6551   }
6552   return LowerSDIV_v4i16(N0, N1, dl, DAG);
6553 }
6554
6555 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
6556   EVT VT = Op.getValueType();
6557   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
6558          "unexpected type for custom-lowering ISD::UDIV");
6559
6560   SDLoc dl(Op);
6561   SDValue N0 = Op.getOperand(0);
6562   SDValue N1 = Op.getOperand(1);
6563   SDValue N2, N3;
6564
6565   if (VT == MVT::v8i8) {
6566     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
6567     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
6568
6569     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
6570                      DAG.getIntPtrConstant(4, dl));
6571     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
6572                      DAG.getIntPtrConstant(4, dl));
6573     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
6574                      DAG.getIntPtrConstant(0, dl));
6575     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
6576                      DAG.getIntPtrConstant(0, dl));
6577
6578     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
6579     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
6580
6581     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
6582     N0 = LowerCONCAT_VECTORS(N0, DAG);
6583
6584     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
6585                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
6586                                      MVT::i32),
6587                      N0);
6588     return N0;
6589   }
6590
6591   // v4i16 sdiv ... Convert to float.
6592   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
6593   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
6594   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
6595   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
6596   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
6597   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
6598
6599   // Use reciprocal estimate and two refinement steps.
6600   // float4 recip = vrecpeq_f32(yf);
6601   // recip *= vrecpsq_f32(yf, recip);
6602   // recip *= vrecpsq_f32(yf, recip);
6603   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
6604                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
6605                    BN1);
6606   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
6607                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
6608                    BN1, N2);
6609   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
6610   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
6611                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
6612                    BN1, N2);
6613   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
6614   // Simply multiplying by the reciprocal estimate can leave us a few ulps
6615   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
6616   // and that it will never cause us to return an answer too large).
6617   // float4 result = as_float4(as_int4(xf*recip) + 2);
6618   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
6619   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
6620   N1 = DAG.getConstant(2, dl, MVT::i32);
6621   N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
6622   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
6623   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
6624   // Convert back to integer and return.
6625   // return vmovn_u32(vcvt_s32_f32(result));
6626   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
6627   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
6628   return N0;
6629 }
6630
6631 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
6632   EVT VT = Op.getNode()->getValueType(0);
6633   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
6634
6635   unsigned Opc;
6636   bool ExtraOp = false;
6637   switch (Op.getOpcode()) {
6638   default: llvm_unreachable("Invalid code");
6639   case ISD::ADDC: Opc = ARMISD::ADDC; break;
6640   case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
6641   case ISD::SUBC: Opc = ARMISD::SUBC; break;
6642   case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
6643   }
6644
6645   if (!ExtraOp)
6646     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
6647                        Op.getOperand(1));
6648   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
6649                      Op.getOperand(1), Op.getOperand(2));
6650 }
6651
6652 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
6653   assert(Subtarget->isTargetDarwin());
6654
6655   // For iOS, we want to call an alternative entry point: __sincos_stret,
6656   // return values are passed via sret.
6657   SDLoc dl(Op);
6658   SDValue Arg = Op.getOperand(0);
6659   EVT ArgVT = Arg.getValueType();
6660   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
6661   auto PtrVT = getPointerTy(DAG.getDataLayout());
6662
6663   MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
6664
6665   // Pair of floats / doubles used to pass the result.
6666   StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
6667
6668   // Create stack object for sret.
6669   auto &DL = DAG.getDataLayout();
6670   const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
6671   const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
6672   int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
6673   SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
6674
6675   ArgListTy Args;
6676   ArgListEntry Entry;
6677
6678   Entry.Node = SRet;
6679   Entry.Ty = RetTy->getPointerTo();
6680   Entry.isSExt = false;
6681   Entry.isZExt = false;
6682   Entry.isSRet = true;
6683   Args.push_back(Entry);
6684
6685   Entry.Node = Arg;
6686   Entry.Ty = ArgTy;
6687   Entry.isSExt = false;
6688   Entry.isZExt = false;
6689   Args.push_back(Entry);
6690
6691   const char *LibcallName  = (ArgVT == MVT::f64)
6692   ? "__sincos_stret" : "__sincosf_stret";
6693   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
6694
6695   TargetLowering::CallLoweringInfo CLI(DAG);
6696   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
6697     .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee,
6698                std::move(Args), 0)
6699     .setDiscardResult();
6700
6701   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6702
6703   SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
6704                                 MachinePointerInfo(), false, false, false, 0);
6705
6706   // Address of cos field.
6707   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
6708                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
6709   SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
6710                                 MachinePointerInfo(), false, false, false, 0);
6711
6712   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
6713   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
6714                      LoadSin.getValue(0), LoadCos.getValue(0));
6715 }
6716
6717 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
6718   // Monotonic load/store is legal for all targets
6719   if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
6720     return Op;
6721
6722   // Acquire/Release load/store is not legal for targets without a
6723   // dmb or equivalent available.
6724   return SDValue();
6725 }
6726
6727 static void ReplaceREADCYCLECOUNTER(SDNode *N,
6728                                     SmallVectorImpl<SDValue> &Results,
6729                                     SelectionDAG &DAG,
6730                                     const ARMSubtarget *Subtarget) {
6731   SDLoc DL(N);
6732   SDValue Cycles32, OutChain;
6733
6734   if (Subtarget->hasPerfMon()) {
6735     // Under Power Management extensions, the cycle-count is:
6736     //    mrc p15, #0, <Rt>, c9, c13, #0
6737     SDValue Ops[] = { N->getOperand(0), // Chain
6738                       DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
6739                       DAG.getConstant(15, DL, MVT::i32),
6740                       DAG.getConstant(0, DL, MVT::i32),
6741                       DAG.getConstant(9, DL, MVT::i32),
6742                       DAG.getConstant(13, DL, MVT::i32),
6743                       DAG.getConstant(0, DL, MVT::i32)
6744     };
6745
6746     Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
6747                            DAG.getVTList(MVT::i32, MVT::Other), Ops);
6748     OutChain = Cycles32.getValue(1);
6749   } else {
6750     // Intrinsic is defined to return 0 on unsupported platforms. Technically
6751     // there are older ARM CPUs that have implementation-specific ways of
6752     // obtaining this information (FIXME!).
6753     Cycles32 = DAG.getConstant(0, DL, MVT::i32);
6754     OutChain = DAG.getEntryNode();
6755   }
6756
6757
6758   SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
6759                                  Cycles32, DAG.getConstant(0, DL, MVT::i32));
6760   Results.push_back(Cycles64);
6761   Results.push_back(OutChain);
6762 }
6763
6764 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6765   switch (Op.getOpcode()) {
6766   default: llvm_unreachable("Don't know how to custom lower this!");
6767   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
6768   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
6769   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
6770   case ISD::GlobalAddress:
6771     switch (Subtarget->getTargetTriple().getObjectFormat()) {
6772     default: llvm_unreachable("unknown object format");
6773     case Triple::COFF:
6774       return LowerGlobalAddressWindows(Op, DAG);
6775     case Triple::ELF:
6776       return LowerGlobalAddressELF(Op, DAG);
6777     case Triple::MachO:
6778       return LowerGlobalAddressDarwin(Op, DAG);
6779     }
6780   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
6781   case ISD::SELECT:        return LowerSELECT(Op, DAG);
6782   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
6783   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
6784   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
6785   case ISD::VASTART:       return LowerVASTART(Op, DAG);
6786   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
6787   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
6788   case ISD::SINT_TO_FP:
6789   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
6790   case ISD::FP_TO_SINT:
6791   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
6792   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
6793   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
6794   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
6795   case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
6796   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
6797   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
6798   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
6799   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
6800                                                                Subtarget);
6801   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
6802   case ISD::SHL:
6803   case ISD::SRL:
6804   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
6805   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
6806   case ISD::SRL_PARTS:
6807   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
6808   case ISD::CTTZ:
6809   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
6810   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
6811   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
6812   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
6813   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
6814   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
6815   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
6816   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6817   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
6818   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
6819   case ISD::MUL:           return LowerMUL(Op, DAG);
6820   case ISD::SDIV:          return LowerSDIV(Op, DAG);
6821   case ISD::UDIV:          return LowerUDIV(Op, DAG);
6822   case ISD::ADDC:
6823   case ISD::ADDE:
6824   case ISD::SUBC:
6825   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
6826   case ISD::SADDO:
6827   case ISD::UADDO:
6828   case ISD::SSUBO:
6829   case ISD::USUBO:
6830     return LowerXALUO(Op, DAG);
6831   case ISD::ATOMIC_LOAD:
6832   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
6833   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
6834   case ISD::SDIVREM:
6835   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
6836   case ISD::DYNAMIC_STACKALLOC:
6837     if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
6838       return LowerDYNAMIC_STACKALLOC(Op, DAG);
6839     llvm_unreachable("Don't know how to custom lower this!");
6840   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
6841   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
6842   }
6843 }
6844
6845 /// ReplaceNodeResults - Replace the results of node with an illegal result
6846 /// type with new values built out of custom code.
6847 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
6848                                            SmallVectorImpl<SDValue>&Results,
6849                                            SelectionDAG &DAG) const {
6850   SDValue Res;
6851   switch (N->getOpcode()) {
6852   default:
6853     llvm_unreachable("Don't know how to custom expand this!");
6854   case ISD::READ_REGISTER:
6855     ExpandREAD_REGISTER(N, Results, DAG);
6856     break;
6857   case ISD::BITCAST:
6858     Res = ExpandBITCAST(N, DAG);
6859     break;
6860   case ISD::SRL:
6861   case ISD::SRA:
6862     Res = Expand64BitShift(N, DAG, Subtarget);
6863     break;
6864   case ISD::READCYCLECOUNTER:
6865     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
6866     return;
6867   }
6868   if (Res.getNode())
6869     Results.push_back(Res);
6870 }
6871
6872 //===----------------------------------------------------------------------===//
6873 //                           ARM Scheduler Hooks
6874 //===----------------------------------------------------------------------===//
6875
6876 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
6877 /// registers the function context.
6878 void ARMTargetLowering::
6879 SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
6880                        MachineBasicBlock *DispatchBB, int FI) const {
6881   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
6882   DebugLoc dl = MI->getDebugLoc();
6883   MachineFunction *MF = MBB->getParent();
6884   MachineRegisterInfo *MRI = &MF->getRegInfo();
6885   MachineConstantPool *MCP = MF->getConstantPool();
6886   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
6887   const Function *F = MF->getFunction();
6888
6889   bool isThumb = Subtarget->isThumb();
6890   bool isThumb2 = Subtarget->isThumb2();
6891
6892   unsigned PCLabelId = AFI->createPICLabelUId();
6893   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
6894   ARMConstantPoolValue *CPV =
6895     ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
6896   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
6897
6898   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
6899                                            : &ARM::GPRRegClass;
6900
6901   // Grab constant pool and fixed stack memory operands.
6902   MachineMemOperand *CPMMO =
6903       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
6904                                MachineMemOperand::MOLoad, 4, 4);
6905
6906   MachineMemOperand *FIMMOSt =
6907       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
6908                                MachineMemOperand::MOStore, 4, 4);
6909
6910   // Load the address of the dispatch MBB into the jump buffer.
6911   if (isThumb2) {
6912     // Incoming value: jbuf
6913     //   ldr.n  r5, LCPI1_1
6914     //   orr    r5, r5, #1
6915     //   add    r5, pc
6916     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
6917     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6918     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
6919                    .addConstantPoolIndex(CPI)
6920                    .addMemOperand(CPMMO));
6921     // Set the low bit because of thumb mode.
6922     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6923     AddDefaultCC(
6924       AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
6925                      .addReg(NewVReg1, RegState::Kill)
6926                      .addImm(0x01)));
6927     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6928     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
6929       .addReg(NewVReg2, RegState::Kill)
6930       .addImm(PCLabelId);
6931     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
6932                    .addReg(NewVReg3, RegState::Kill)
6933                    .addFrameIndex(FI)
6934                    .addImm(36)  // &jbuf[1] :: pc
6935                    .addMemOperand(FIMMOSt));
6936   } else if (isThumb) {
6937     // Incoming value: jbuf
6938     //   ldr.n  r1, LCPI1_4
6939     //   add    r1, pc
6940     //   mov    r2, #1
6941     //   orrs   r1, r2
6942     //   add    r2, $jbuf, #+4 ; &jbuf[1]
6943     //   str    r1, [r2]
6944     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6945     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
6946                    .addConstantPoolIndex(CPI)
6947                    .addMemOperand(CPMMO));
6948     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6949     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
6950       .addReg(NewVReg1, RegState::Kill)
6951       .addImm(PCLabelId);
6952     // Set the low bit because of thumb mode.
6953     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6954     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
6955                    .addReg(ARM::CPSR, RegState::Define)
6956                    .addImm(1));
6957     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6958     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
6959                    .addReg(ARM::CPSR, RegState::Define)
6960                    .addReg(NewVReg2, RegState::Kill)
6961                    .addReg(NewVReg3, RegState::Kill));
6962     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6963     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
6964             .addFrameIndex(FI)
6965             .addImm(36); // &jbuf[1] :: pc
6966     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
6967                    .addReg(NewVReg4, RegState::Kill)
6968                    .addReg(NewVReg5, RegState::Kill)
6969                    .addImm(0)
6970                    .addMemOperand(FIMMOSt));
6971   } else {
6972     // Incoming value: jbuf
6973     //   ldr  r1, LCPI1_1
6974     //   add  r1, pc, r1
6975     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
6976     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6977     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
6978                    .addConstantPoolIndex(CPI)
6979                    .addImm(0)
6980                    .addMemOperand(CPMMO));
6981     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6982     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
6983                    .addReg(NewVReg1, RegState::Kill)
6984                    .addImm(PCLabelId));
6985     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
6986                    .addReg(NewVReg2, RegState::Kill)
6987                    .addFrameIndex(FI)
6988                    .addImm(36)  // &jbuf[1] :: pc
6989                    .addMemOperand(FIMMOSt));
6990   }
6991 }
6992
/// Build the single SjLj exception dispatch block for this function and
/// rewire every invoke block to unwind through it.
///
/// Steps, in order:
///  - map each call-site number recorded in MachineModuleInfo to its landing
///    pad(s) and build an inline jump table ordered by call-site number;
///  - append three new blocks to the function: DispatchBB (the new landing
///    pad), DispContBB (the jump-table branch) and TrapBB (a trap
///    instruction);
///  - call SetupEntryBlockForSjLj() to emit the function-context setup;
///  - emit the per-subtarget (Thumb2 / Thumb1 / ARM) sequence that loads an
///    index word from the function-context frame slot, branches to TrapBB
///    on condition HI after comparing it with the number of landing pads,
///    and otherwise branches through the jump table;
///  - replace the landing-pad successors of every invoke block with
///    DispatchBB, add implicit dead defs of callee-saved registers to the
///    last call in each such block, and clear the landing-pad flag on the
///    former landing pads.
///
/// \p MI is erased before returning.
6993 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
6994                                               MachineBasicBlock *MBB) const {
6995   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
6996   DebugLoc dl = MI->getDebugLoc();
6997   MachineFunction *MF = MBB->getParent();
6998   MachineRegisterInfo *MRI = &MF->getRegInfo();
6999   MachineFrameInfo *MFI = MF->getFrameInfo();
7000   int FI = MFI->getFunctionContextIndex();
7001
       // Register class used for every scratch virtual register created below.
7002   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
7003                                                         : &ARM::GPRnopcRegClass;
7004
7005   // Get a mapping of the call site numbers to all of the landing pads they're
7006   // associated with.
7007   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
7008   unsigned MaxCSNum = 0;
7009   MachineModuleInfo &MMI = MF->getMMI();
7010   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
7011        ++BB) {
7012     if (!BB->isLandingPad()) continue;
7013
7014     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
7015     // pad.
7016     for (MachineBasicBlock::iterator
7017            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
7018       if (!II->isEHLabel()) continue;
7019
7020       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
7021       if (!MMI.hasCallSiteLandingPad(Sym)) continue;
7022
7023       SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
7024       for (SmallVectorImpl<unsigned>::iterator
7025              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
7026            CSI != CSE; ++CSI) {
7027         CallSiteNumToLPad[*CSI].push_back(BB);
7028         MaxCSNum = std::max(MaxCSNum, *CSI);
7029       }
           // Only the first EH label that has call-site info is used per pad.
7030       break;
7031     }
7032   }
7033
7034   // Get an ordered list of the machine basic blocks for the jump table.
7035   std::vector<MachineBasicBlock*> LPadList;
7036   SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
7037   LPadList.reserve(CallSiteNumToLPad.size());
       // Walk call-site numbers upward from 1 so LPadList is ordered by number.
       // A number with no recorded landing pad yields an empty list and adds
       // nothing.
7038   for (unsigned I = 1; I <= MaxCSNum; ++I) {
7039     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
7040     for (SmallVectorImpl<MachineBasicBlock*>::iterator
7041            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
7042       LPadList.push_back(*II);
           // Every predecessor of a landing pad is treated as an invoke block.
7043       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
7044     }
7045   }
7046
7047   assert(!LPadList.empty() &&
7048          "No landing pad destinations for the dispatch jump table!");
7049
7050   // Create the jump table and associated information.
7051   MachineJumpTableInfo *JTI =
7052     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
7053   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
7054   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
7055
7056   // Create the MBBs for the dispatch code.
7057
7058   // Shove the dispatch's address into the return slot in the function context.
7059   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
7060   DispatchBB->setIsLandingPad();
7061
       // TrapBB holds a single trap instruction; the range check in DispatchBB
       // branches here.
7062   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7063   unsigned trap_opcode;
7064   if (Subtarget->isThumb())
7065     trap_opcode = ARM::tTRAP;
7066   else
7067     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
7068
7069   BuildMI(TrapBB, dl, TII->get(trap_opcode));
7070   DispatchBB->addSuccessor(TrapBB);
7071
       // DispContBB performs the actual jump-table branch.
7072   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
7073   DispatchBB->addSuccessor(DispContBB);
7074
7075   // Insert the new MBBs at the end of the function.
7076   MF->insert(MF->end(), DispatchBB);
7077   MF->insert(MF->end(), DispContBB);
7078   MF->insert(MF->end(), TrapBB);
7079
7080   // Insert code into the entry block that creates and registers the function
7081   // context.
7082   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
7083
       // Memory operand for the volatile 4-byte, 4-byte-aligned load from the
       // function-context frame slot emitted in each path below.
7084   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
7085       MachinePointerInfo::getFixedStack(*MF, FI),
7086       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
7087
7088   MachineInstrBuilder MIB;
7089   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
7090
7091   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
7092   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
7093
7094   // Add a register mask with no preserved registers.  This results in all
7095   // registers being marked as clobbered.
7096   MIB.addRegMask(RI.getNoPreservedMask());
7097
       // Each path below follows the same outline: load the index word from the
       // function context (presumably the call-site index written at run time;
       // confirm against the function-context layout), compare it with NumLPads,
       // branch to TrapBB on HI, then branch through the jump table in
       // DispContBB.
7098   unsigned NumLPads = LPadList.size();
7099   if (Subtarget->isThumb2()) {
7100     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7101     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
7102                    .addFrameIndex(FI)
7103                    .addImm(4)
7104                    .addMemOperand(FIMMOLd));
7105
7106     if (NumLPads < 256) {
           // LPadList.size() is the same value as NumLPads.
7107       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
7108                      .addReg(NewVReg1)
7109                      .addImm(LPadList.size()));
7110     } else {
           // Materialize NumLPads with MOVW (plus MOVT when the upper half is
           // non-zero) and compare against that register.
7111       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7112       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
7113                      .addImm(NumLPads & 0xFFFF));
7114
7115       unsigned VReg2 = VReg1;
7116       if ((NumLPads & 0xFFFF0000) != 0) {
7117         VReg2 = MRI->createVirtualRegister(TRC);
7118         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
7119                        .addReg(VReg1)
7120                        .addImm(NumLPads >> 16));
7121       }
7122
7123       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
7124                      .addReg(NewVReg1)
7125                      .addReg(VReg2));
7126     }
7127
         // Out-of-range index: go to the trap block.
7128     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
7129       .addMBB(TrapBB)
7130       .addImm(ARMCC::HI)
7131       .addReg(ARM::CPSR);
7132
         // NewVReg3 = address of the jump table.
7133     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7134     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
7135                    .addJumpTableIndex(MJTI));
7136
         // NewVReg4 = table address + (index << 2).
7137     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7138     AddDefaultCC(
7139       AddDefaultPred(
7140         BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
7141         .addReg(NewVReg3, RegState::Kill)
7142         .addReg(NewVReg1)
7143         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
7144
7145     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
7146       .addReg(NewVReg4, RegState::Kill)
7147       .addReg(NewVReg1)
7148       .addJumpTableIndex(MJTI);
7149   } else if (Subtarget->isThumb()) {
         // NOTE(review): the immediate is 1 here but 4 in the Thumb2 and ARM
         // paths; presumably tLDRspi takes a word-scaled offset. Confirm.
7150     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7151     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
7152                    .addFrameIndex(FI)
7153                    .addImm(1)
7154                    .addMemOperand(FIMMOLd));
7155
7156     if (NumLPads < 256) {
7157       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
7158                      .addReg(NewVReg1)
7159                      .addImm(NumLPads));
7160     } else {
           // Too large for the 8-bit immediate compare: load NumLPads from the
           // constant pool and compare register to register.
7161       MachineConstantPool *ConstantPool = MF->getConstantPool();
7162       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
7163       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
7164
7165       // MachineConstantPool wants an explicit alignment.
7166       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
7167       if (Align == 0)
7168         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
7169       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7170
7171       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7172       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
7173                      .addReg(VReg1, RegState::Define)
7174                      .addConstantPoolIndex(Idx));
7175       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
7176                      .addReg(NewVReg1)
7177                      .addReg(VReg1));
7178     }
7179
         // Out-of-range index: go to the trap block.
7180     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
7181       .addMBB(TrapBB)
7182       .addImm(ARMCC::HI)
7183       .addReg(ARM::CPSR);
7184
         // NewVReg2 = index << 2 (byte offset of the table entry).
7185     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7186     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
7187                    .addReg(ARM::CPSR, RegState::Define)
7188                    .addReg(NewVReg1)
7189                    .addImm(2));
7190
         // NewVReg3 = address of the jump table.
7191     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7192     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
7193                    .addJumpTableIndex(MJTI));
7194
         // NewVReg4 = address of the selected table entry.
7195     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7196     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
7197                    .addReg(ARM::CPSR, RegState::Define)
7198                    .addReg(NewVReg2, RegState::Kill)
7199                    .addReg(NewVReg3));
7200
7201     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
7202         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
7203
         // NewVReg5 = the table entry itself.
7204     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7205     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
7206                    .addReg(NewVReg4, RegState::Kill)
7207                    .addImm(0)
7208                    .addMemOperand(JTMMOLd));
7209
         // In PIC mode the loaded entry is added to the table address
         // (NewVReg3) before branching; otherwise it is used as loaded.
7210     unsigned NewVReg6 = NewVReg5;
7211     if (RelocM == Reloc::PIC_) {
7212       NewVReg6 = MRI->createVirtualRegister(TRC);
7213       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
7214                      .addReg(ARM::CPSR, RegState::Define)
7215                      .addReg(NewVReg5, RegState::Kill)
7216                      .addReg(NewVReg3));
7217     }
7218
7219     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
7220       .addReg(NewVReg6, RegState::Kill)
7221       .addJumpTableIndex(MJTI);
7222   } else {
7223     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7224     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
7225                    .addFrameIndex(FI)
7226                    .addImm(4)
7227                    .addMemOperand(FIMMOLd));
7228
7229     if (NumLPads < 256) {
7230       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
7231                      .addReg(NewVReg1)
7232                      .addImm(NumLPads));
7233     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
7234       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7235       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
7236                      .addImm(NumLPads & 0xFFFF));
7237
           // NOTE(review): isUInt<16>(NumLPads) in the enclosing condition
           // implies the upper half is zero, so this MOVT path looks
           // unreachable; larger values take the constant-pool path below.
7238       unsigned VReg2 = VReg1;
7239       if ((NumLPads & 0xFFFF0000) != 0) {
7240         VReg2 = MRI->createVirtualRegister(TRC);
7241         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
7242                        .addReg(VReg1)
7243                        .addImm(NumLPads >> 16));
7244       }
7245
7246       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
7247                      .addReg(NewVReg1)
7248                      .addReg(VReg2));
7249     } else {
           // Load NumLPads from the constant pool and compare register to
           // register.
7250       MachineConstantPool *ConstantPool = MF->getConstantPool();
7251       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
7252       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
7253
7254       // MachineConstantPool wants an explicit alignment.
7255       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
7256       if (Align == 0)
7257         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
7258       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7259
7260       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7261       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
7262                      .addReg(VReg1, RegState::Define)
7263                      .addConstantPoolIndex(Idx)
7264                      .addImm(0));
7265       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
7266                      .addReg(NewVReg1)
7267                      .addReg(VReg1, RegState::Kill));
7268     }
7269
         // Out-of-range index: go to the trap block.
7270     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
7271       .addMBB(TrapBB)
7272       .addImm(ARMCC::HI)
7273       .addReg(ARM::CPSR);
7274
         // NewVReg3 = index << 2 (byte offset of the table entry).
7275     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7276     AddDefaultCC(
7277       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
7278                      .addReg(NewVReg1)
7279                      .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
         // NewVReg4 = address of the jump table.
7280     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7281     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
7282                    .addJumpTableIndex(MJTI));
7283
7284     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
7285         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
         // NewVReg5 = the table entry, loaded using the offset and table address.
7286     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7287     AddDefaultPred(
7288       BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
7289       .addReg(NewVReg3, RegState::Kill)
7290       .addReg(NewVReg4)
7291       .addImm(0)
7292       .addMemOperand(JTMMOLd));
7293
         // The PIC form of the branch also takes the table address (NewVReg4).
7294     if (RelocM == Reloc::PIC_) {
7295       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
7296         .addReg(NewVReg5, RegState::Kill)
7297         .addReg(NewVReg4)
7298         .addJumpTableIndex(MJTI);
7299     } else {
7300       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
7301         .addReg(NewVReg5, RegState::Kill)
7302         .addJumpTableIndex(MJTI);
7303     }
7304   }
7305
7306   // Add the jump table entries as successors to the MBB.
       // LPadList may name the same block more than once; SeenMBBs ensures each
       // block is added as a successor only once.
7307   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
7308   for (std::vector<MachineBasicBlock*>::iterator
7309          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
7310     MachineBasicBlock *CurMBB = *I;
7311     if (SeenMBBs.insert(CurMBB).second)
7312       DispContBB->addSuccessor(CurMBB);
7313   }
7314
7315   // N.B. the order the invoke BBs are processed in doesn't matter here.
7316   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
7317   SmallVector<MachineBasicBlock*, 64> MBBLPads;
7318   for (MachineBasicBlock *BB : InvokeBBs) {
7319
7320     // Remove the landing pad successor from the invoke block and replace it
7321     // with the new dispatch block.
         // Work on a copy of the successor list because removeSuccessor()
         // modifies the list being walked.
7322     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
7323                                                   BB->succ_end());
7324     while (!Successors.empty()) {
7325       MachineBasicBlock *SMBB = Successors.pop_back_val();
7326       if (SMBB->isLandingPad()) {
7327         BB->removeSuccessor(SMBB);
7328         MBBLPads.push_back(SMBB);
7329       }
7330     }
7331
7332     BB->addSuccessor(DispatchBB);
7333
7334     // Find the invoke call and mark all of the callee-saved registers as
7335     // 'implicit defined' so that they're spilled. This prevents code from
7336     // moving instructions to before the EH block, where they will never be
7337     // executed.
         // Scanning backwards means the last call in the block is the one
         // that gets annotated.
7338     for (MachineBasicBlock::reverse_iterator
7339            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
7340       if (!II->isCall()) continue;
7341
           // Record every register that already appears as an operand of the
           // call so it is not added a second time.
7342       DenseMap<unsigned, bool> DefRegs;
7343       for (MachineInstr::mop_iterator
7344              OI = II->operands_begin(), OE = II->operands_end();
7345            OI != OE; ++OI) {
7346         if (!OI->isReg()) continue;
7347         DefRegs[OI->getReg()] = true;
7348       }
7349
7350       MachineInstrBuilder MIB(*MF, &*II);
7351
           // SavedRegs is a zero-terminated list. Only registers in the
           // general-purpose classes valid for the current mode are added.
7352       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
7353         unsigned Reg = SavedRegs[i];
7354         if (Subtarget->isThumb2() &&
7355             !ARM::tGPRRegClass.contains(Reg) &&
7356             !ARM::hGPRRegClass.contains(Reg))
7357           continue;
7358         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
7359           continue;
7360         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
7361           continue;
7362         if (!DefRegs[Reg])
7363           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
7364       }
7365
7366       break;
7367     }
7368   }
7369
7370   // Mark all former landing pads as non-landing pads. The dispatch is the only
7371   // landing pad now.
7372   for (SmallVectorImpl<MachineBasicBlock*>::iterator
7373          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
7374     (*I)->setIsLandingPad(false);
7375
7376   // The instruction is gone now.
7377   MI->eraseFromParent();
7378 }
7379
7380 static
7381 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
7382   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
7383        E = MBB->succ_end(); I != E; ++I)
7384     if (*I != Succ)
7385       return *I;
7386   llvm_unreachable("Expecting a BB with two successors!");
7387 }
7388
7389 /// Return the load opcode for a given load size. If load size >= 8,
7390 /// neon opcode will be returned.
7391 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
7392   if (LdSize >= 8)
7393     return LdSize == 16 ? ARM::VLD1q32wb_fixed
7394                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
7395   if (IsThumb1)
7396     return LdSize == 4 ? ARM::tLDRi
7397                        : LdSize == 2 ? ARM::tLDRHi
7398                                      : LdSize == 1 ? ARM::tLDRBi : 0;
7399   if (IsThumb2)
7400     return LdSize == 4 ? ARM::t2LDR_POST
7401                        : LdSize == 2 ? ARM::t2LDRH_POST
7402                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
7403   return LdSize == 4 ? ARM::LDR_POST_IMM
7404                      : LdSize == 2 ? ARM::LDRH_POST
7405                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
7406 }
7407
7408 /// Return the store opcode for a given store size. If store size >= 8,
7409 /// neon opcode will be returned.
7410 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
7411   if (StSize >= 8)
7412     return StSize == 16 ? ARM::VST1q32wb_fixed
7413                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
7414   if (IsThumb1)
7415     return StSize == 4 ? ARM::tSTRi
7416                        : StSize == 2 ? ARM::tSTRHi
7417                                      : StSize == 1 ? ARM::tSTRBi : 0;
7418   if (IsThumb2)
7419     return StSize == 4 ? ARM::t2STR_POST
7420                        : StSize == 2 ? ARM::t2STRH_POST
7421                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
7422   return StSize == 4 ? ARM::STR_POST_IMM
7423                      : StSize == 2 ? ARM::STRH_POST
7424                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
7425 }
7426
7427 /// Emit a post-increment load operation with given size. The instructions
7428 /// will be added to BB at Pos.
7429 static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
7430                        const TargetInstrInfo *TII, DebugLoc dl,
7431                        unsigned LdSize, unsigned Data, unsigned AddrIn,
7432                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
7433   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
7434   assert(LdOpc != 0 && "Should have a load opcode");
7435   if (LdSize >= 8) {
7436     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7437                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
7438                        .addImm(0));
7439   } else if (IsThumb1) {
7440     // load + update AddrIn
7441     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7442                        .addReg(AddrIn).addImm(0));
7443     MachineInstrBuilder MIB =
7444         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
7445     MIB = AddDefaultT1CC(MIB);
7446     MIB.addReg(AddrIn).addImm(LdSize);
7447     AddDefaultPred(MIB);
7448   } else if (IsThumb2) {
7449     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7450                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
7451                        .addImm(LdSize));
7452   } else { // arm
7453     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7454                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
7455                        .addReg(0).addImm(LdSize));
7456   }
7457 }
7458
7459 /// Emit a post-increment store operation with given size. The instructions
7460 /// will be added to BB at Pos.
7461 static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
7462                        const TargetInstrInfo *TII, DebugLoc dl,
7463                        unsigned StSize, unsigned Data, unsigned AddrIn,
7464                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
7465   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
7466   assert(StOpc != 0 && "Should have a store opcode");
7467   if (StSize >= 8) {
7468     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
7469                        .addReg(AddrIn).addImm(0).addReg(Data));
7470   } else if (IsThumb1) {
7471     // store + update AddrIn
7472     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
7473                        .addReg(AddrIn).addImm(0));
7474     MachineInstrBuilder MIB =
7475         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
7476     MIB = AddDefaultT1CC(MIB);
7477     MIB.addReg(AddrIn).addImm(StSize);
7478     AddDefaultPred(MIB);
7479   } else if (IsThumb2) {
7480     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
7481                        .addReg(Data).addReg(AddrIn).addImm(StSize));
7482   } else { // arm
7483     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
7484                        .addReg(Data).addReg(AddrIn).addReg(0)
7485                        .addImm(StSize));
7486   }
7487 }
7488
/// Expand the struct-byval copy pseudo \p MI into real loads and stores.
///
/// The pseudo's operands are read as: 0 = destination address register,
/// 1 = source address register, 2 = byte count, 3 = alignment. The copy unit
/// is 1 or 2 bytes when the low bits of the alignment require it; otherwise
/// it is 16 or 8 bytes (NEON) when the function allows implicit FP use, the
/// subtarget has NEON and alignment and size permit; otherwise 4 bytes.
///
/// Copies no larger than Subtarget->getMaxInlineSizeThreshold() are fully
/// unrolled in \p BB. Larger copies split \p BB: a new loop block copies one
/// unit per iteration, and a new exit block receives the remainder of \p BB
/// along with the byte-wise copy of any leftover bytes.
///
/// \returns \p BB for the unrolled form, or the new exit block for the loop
/// form. \p MI is erased in both cases.
7489 MachineBasicBlock *
7490 ARMTargetLowering::EmitStructByval(MachineInstr *MI,
7491                                    MachineBasicBlock *BB) const {
7492   // This pseudo instruction has 4 operands: dst, src, size, alignment
7493   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
7494   // Otherwise, we will generate unrolled scalar copies.
7495   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7496   const BasicBlock *LLVM_BB = BB->getBasicBlock();
       // It = the block following BB; new blocks are inserted in front of it.
7497   MachineFunction::iterator It = BB;
7498   ++It;
7499
7500   unsigned dest = MI->getOperand(0).getReg();
7501   unsigned src = MI->getOperand(1).getReg();
7502   unsigned SizeVal = MI->getOperand(2).getImm();
7503   unsigned Align = MI->getOperand(3).getImm();
7504   DebugLoc dl = MI->getDebugLoc();
7505
7506   MachineFunction *MF = BB->getParent();
7507   MachineRegisterInfo &MRI = MF->getRegInfo();
7508   unsigned UnitSize = 0;
7509   const TargetRegisterClass *TRC = nullptr;
7510   const TargetRegisterClass *VecTRC = nullptr;
7511
7512   bool IsThumb1 = Subtarget->isThumb1Only();
7513   bool IsThumb2 = Subtarget->isThumb2();
7514
       // Choose the widest copy unit the alignment allows.
7515   if (Align & 1) {
7516     UnitSize = 1;
7517   } else if (Align & 2) {
7518     UnitSize = 2;
7519   } else {
7520     // Check whether we can use NEON instructions.
7521     if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
7522         Subtarget->hasNEON()) {
7523       if ((Align % 16 == 0) && SizeVal >= 16)
7524         UnitSize = 16;
7525       else if ((Align % 8 == 0) && SizeVal >= 8)
7526         UnitSize = 8;
7527     }
7528     // Can't use NEON instructions.
7529     if (UnitSize == 0)
7530       UnitSize = 4;
7531   }
7532
7533   // Select the correct opcode and register class for unit size load/store
7534   bool IsNeon = UnitSize >= 8;
       // TRC holds addresses, the loop counter and scalar data; VecTRC holds
       // the data register for NEON-sized units.
7535   TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
7536   if (IsNeon)
7537     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
7538                             : UnitSize == 8 ? &ARM::DPRRegClass
7539                                             : nullptr;
7540
       // LoopSize bytes are copied in whole units; BytesLeft bytes are copied
       // one byte at a time afterwards.
7541   unsigned BytesLeft = SizeVal % UnitSize;
7542   unsigned LoopSize = SizeVal - BytesLeft;
7543
7544   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
7545     // Use LDR and STR to copy.
7546     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
7547     // [destOut] = STR_POST(scratch, destIn, UnitSize)
7548     unsigned srcIn = src;
7549     unsigned destIn = dest;
7550     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
7551       unsigned srcOut = MRI.createVirtualRegister(TRC);
7552       unsigned destOut = MRI.createVirtualRegister(TRC);
7553       unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
7554       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
7555                  IsThumb1, IsThumb2);
7556       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
7557                  IsThumb1, IsThumb2);
           // Chain the updated addresses into the next copy.
7558       srcIn = srcOut;
7559       destIn = destOut;
7560     }
7561
7562     // Handle the leftover bytes with LDRB and STRB.
7563     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
7564     // [destOut] = STRB_POST(scratch, destIn, 1)
7565     for (unsigned i = 0; i < BytesLeft; i++) {
7566       unsigned srcOut = MRI.createVirtualRegister(TRC);
7567       unsigned destOut = MRI.createVirtualRegister(TRC);
7568       unsigned scratch = MRI.createVirtualRegister(TRC);
7569       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
7570                  IsThumb1, IsThumb2);
7571       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
7572                  IsThumb1, IsThumb2);
7573       srcIn = srcOut;
7574       destIn = destOut;
7575     }
7576     MI->eraseFromParent();   // The instruction is gone now.
7577     return BB;
7578   }
7579
7580   // Expand the pseudo op to a loop.
7581   // thisMBB:
7582   //   ...
7583   //   movw varEnd, # --> when Subtarget->useMovt()
7584   //   movt varEnd, #
7585   //   ldrcp varEnd, idx --> otherwise (constant-pool load)
7586   //   fallthrough --> loopMBB
7587   // loopMBB:
7588   //   PHI varPhi, varEnd, varLoop
7589   //   PHI srcPhi, src, srcLoop
7590   //   PHI destPhi, dst, destLoop
7591   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7592   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
7593   //   subs varLoop, varPhi, #UnitSize
7594   //   bne loopMBB
7595   //   fallthrough --> exitMBB
7596   // exitMBB:
7597   //   epilogue to handle left-over bytes
7598   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7599   //   [destOut] = STRB_POST(scratch, destLoop, 1)
7600   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
7601   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
7602   MF->insert(It, loopMBB);
7603   MF->insert(It, exitMBB);
7604
7605   // Transfer the remainder of BB and its successor edges to exitMBB.
7606   exitMBB->splice(exitMBB->begin(), BB,
7607                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
7608   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
7609
7610   // Load an immediate to varEnd.
7611   unsigned varEnd = MRI.createVirtualRegister(TRC);
7612   if (Subtarget->useMovt(*MF)) {
         // The MOVW result goes straight into varEnd unless a MOVT is needed
         // for the upper half, in which case it goes through Vtmp.
         // These BuildMI calls take BB rather than an insertion point before
         // MI as the constant-pool path below does; MI is erased at the end
         // either way.
7613     unsigned Vtmp = varEnd;
7614     if ((LoopSize & 0xFFFF0000) != 0)
7615       Vtmp = MRI.createVirtualRegister(TRC);
7616     AddDefaultPred(BuildMI(BB, dl,
7617                            TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16),
7618                            Vtmp).addImm(LoopSize & 0xFFFF));
7619
7620     if ((LoopSize & 0xFFFF0000) != 0)
7621       AddDefaultPred(BuildMI(BB, dl,
7622                              TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16),
7623                              varEnd)
7624                          .addReg(Vtmp)
7625                          .addImm(LoopSize >> 16));
7626   } else {
7627     MachineConstantPool *ConstantPool = MF->getConstantPool();
7628     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
7629     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
7630
7631     // MachineConstantPool wants an explicit alignment.
         // NOTE: this local Align shadows the alignment operand read at the top
         // of the function.
7632     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
7633     if (Align == 0)
7634       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
7635     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7636
7637     if (IsThumb1)
7638       AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
7639           varEnd, RegState::Define).addConstantPoolIndex(Idx));
7640     else
7641       AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
7642           varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
7643   }
7644   BB->addSuccessor(loopMBB);
7645
7646   // Generate the loop body:
7647   //   varPhi = PHI(varLoop, varEnd)
7648   //   srcPhi = PHI(srcLoop, src)
7649   //   destPhi = PHI(destLoop, dst)
7650   MachineBasicBlock *entryBB = BB;
7651   BB = loopMBB;
       // The *Loop registers are the values produced by one iteration; the *Phi
       // registers merge them with the initial values coming from entryBB.
7652   unsigned varLoop = MRI.createVirtualRegister(TRC);
7653   unsigned varPhi = MRI.createVirtualRegister(TRC);
7654   unsigned srcLoop = MRI.createVirtualRegister(TRC);
7655   unsigned srcPhi = MRI.createVirtualRegister(TRC);
7656   unsigned destLoop = MRI.createVirtualRegister(TRC);
7657   unsigned destPhi = MRI.createVirtualRegister(TRC);
7658
7659   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
7660     .addReg(varLoop).addMBB(loopMBB)
7661     .addReg(varEnd).addMBB(entryBB);
7662   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
7663     .addReg(srcLoop).addMBB(loopMBB)
7664     .addReg(src).addMBB(entryBB);
7665   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
7666     .addReg(destLoop).addMBB(loopMBB)
7667     .addReg(dest).addMBB(entryBB);
7668
7669   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7670   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
7671   unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
7672   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
7673              IsThumb1, IsThumb2);
7674   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
7675              IsThumb1, IsThumb2);
7676
7677   // Decrement loop variable by UnitSize.
7678   if (IsThumb1) {
7679     MachineInstrBuilder MIB =
7680         BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
7681     MIB = AddDefaultT1CC(MIB);
7682     MIB.addReg(varPhi).addImm(UnitSize);
7683     AddDefaultPred(MIB);
7684   } else {
7685     MachineInstrBuilder MIB =
7686         BuildMI(*BB, BB->end(), dl,
7687                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
7688     AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
         // Operand 5 is presumably the CC-out slot added by AddDefaultCC;
         // rewriting it to a CPSR def gives the flag-setting "subs" form that
         // the conditional branch below reads. Confirm the operand index.
7689     MIB->getOperand(5).setReg(ARM::CPSR);
7690     MIB->getOperand(5).setIsDef(true);
7691   }
       // Branch back to loopMBB while the counter is non-zero (condition NE).
7692   BuildMI(*BB, BB->end(), dl,
7693           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
7694       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
7695
7696   // loopMBB can loop back to loopMBB or fall through to exitMBB.
7697   BB->addSuccessor(loopMBB);
7698   BB->addSuccessor(exitMBB);
7699
7700   // Add epilogue to handle BytesLeft.
7701   BB = exitMBB;
       // The epilogue is inserted in front of the code moved over from the
       // original block.
7702   MachineInstr *StartOfExit = exitMBB->begin();
7703
7704   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7705   //   [destOut] = STRB_POST(scratch, destLoop, 1)
7706   unsigned srcIn = srcLoop;
7707   unsigned destIn = destLoop;
7708   for (unsigned i = 0; i < BytesLeft; i++) {
7709     unsigned srcOut = MRI.createVirtualRegister(TRC);
7710     unsigned destOut = MRI.createVirtualRegister(TRC);
7711     unsigned scratch = MRI.createVirtualRegister(TRC);
7712     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
7713                IsThumb1, IsThumb2);
7714     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
7715                IsThumb1, IsThumb2);
7716     srcIn = srcOut;
7717     destIn = destOut;
7718   }
7719
7720   MI->eraseFromParent();   // The instruction is gone now.
7721   return BB;
7722 }
7723
7724 MachineBasicBlock *
7725 ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
7726                                        MachineBasicBlock *MBB) const {
7727   const TargetMachine &TM = getTargetMachine();
7728   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
7729   DebugLoc DL = MI->getDebugLoc();
7730
7731   assert(Subtarget->isTargetWindows() &&
7732          "__chkstk is only supported on Windows");
7733   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
7734
7735   // __chkstk takes the number of words to allocate on the stack in R4, and
7736   // returns the stack adjustment in number of bytes in R4.  This will not
7737   // clober any other registers (other than the obvious lr).
7738   //
7739   // Although, technically, IP should be considered a register which may be
7740   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
7741   // thumb-2 environment, so there is no interworking required.  As a result, we
7742   // do not expect a veneer to be emitted by the linker, clobbering IP.
7743   //
7744   // Each module receives its own copy of __chkstk, so no import thunk is
7745   // required, again, ensuring that IP is not clobbered.
7746   //
7747   // Finally, although some linkers may theoretically provide a trampoline for
7748   // out of range calls (which is quite common due to a 32M range limitation of
7749   // branches for Thumb), we can generate the long-call version via
7750   // -mcmodel=large, alleviating the need for the trampoline which may clobber
7751   // IP.
7752
7753   switch (TM.getCodeModel()) {
7754   case CodeModel::Small:
7755   case CodeModel::Medium:
7756   case CodeModel::Default:
7757   case CodeModel::Kernel:
7758     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
7759       .addImm((unsigned)ARMCC::AL).addReg(0)
7760       .addExternalSymbol("__chkstk")
7761       .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
7762       .addReg(ARM::R4, RegState::Implicit | RegState::Define)
7763       .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
7764     break;
7765   case CodeModel::Large:
7766   case CodeModel::JITDefault: {
7767     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7768     unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
7769
7770     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
7771       .addExternalSymbol("__chkstk");
7772     BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
7773       .addImm((unsigned)ARMCC::AL).addReg(0)
7774       .addReg(Reg, RegState::Kill)
7775       .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
7776       .addReg(ARM::R4, RegState::Implicit | RegState::Define)
7777       .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
7778     break;
7779   }
7780   }
7781
7782   AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
7783                                       ARM::SP)
7784                               .addReg(ARM::SP).addReg(ARM::R4)));
7785
7786   MI->eraseFromParent();
7787   return MBB;
7788 }
7789
7790 MachineBasicBlock *
7791 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7792                                                MachineBasicBlock *BB) const {
7793   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7794   DebugLoc dl = MI->getDebugLoc();
7795   bool isThumb2 = Subtarget->isThumb2();
7796   switch (MI->getOpcode()) {
7797   default: {
7798     MI->dump();
7799     llvm_unreachable("Unexpected instr type to insert");
7800   }
7801   // The Thumb2 pre-indexed stores have the same MI operands, they just
7802   // define them differently in the .td files from the isel patterns, so
7803   // they need pseudos.
7804   case ARM::t2STR_preidx:
7805     MI->setDesc(TII->get(ARM::t2STR_PRE));
7806     return BB;
7807   case ARM::t2STRB_preidx:
7808     MI->setDesc(TII->get(ARM::t2STRB_PRE));
7809     return BB;
7810   case ARM::t2STRH_preidx:
7811     MI->setDesc(TII->get(ARM::t2STRH_PRE));
7812     return BB;
7813
7814   case ARM::STRi_preidx:
7815   case ARM::STRBi_preidx: {
7816     unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
7817       ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
7818     // Decode the offset.
7819     unsigned Offset = MI->getOperand(4).getImm();
7820     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
7821     Offset = ARM_AM::getAM2Offset(Offset);
7822     if (isSub)
7823       Offset = -Offset;
7824
7825     MachineMemOperand *MMO = *MI->memoperands_begin();
7826     BuildMI(*BB, MI, dl, TII->get(NewOpc))
7827       .addOperand(MI->getOperand(0))  // Rn_wb
7828       .addOperand(MI->getOperand(1))  // Rt
7829       .addOperand(MI->getOperand(2))  // Rn
7830       .addImm(Offset)                 // offset (skip GPR==zero_reg)
7831       .addOperand(MI->getOperand(5))  // pred
7832       .addOperand(MI->getOperand(6))
7833       .addMemOperand(MMO);
7834     MI->eraseFromParent();
7835     return BB;
7836   }
7837   case ARM::STRr_preidx:
7838   case ARM::STRBr_preidx:
7839   case ARM::STRH_preidx: {
7840     unsigned NewOpc;
7841     switch (MI->getOpcode()) {
7842     default: llvm_unreachable("unexpected opcode!");
7843     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
7844     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
7845     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
7846     }
7847     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
7848     for (unsigned i = 0; i < MI->getNumOperands(); ++i)
7849       MIB.addOperand(MI->getOperand(i));
7850     MI->eraseFromParent();
7851     return BB;
7852   }
7853
7854   case ARM::tMOVCCr_pseudo: {
7855     // To "insert" a SELECT_CC instruction, we actually have to insert the
7856     // diamond control-flow pattern.  The incoming instruction knows the
7857     // destination vreg to set, the condition code register to branch on, the
7858     // true/false values to select between, and a branch opcode to use.
7859     const BasicBlock *LLVM_BB = BB->getBasicBlock();
7860     MachineFunction::iterator It = BB;
7861     ++It;
7862
7863     //  thisMBB:
7864     //  ...
7865     //   TrueVal = ...
7866     //   cmpTY ccX, r1, r2
7867     //   bCC copy1MBB
7868     //   fallthrough --> copy0MBB
7869     MachineBasicBlock *thisMBB  = BB;
7870     MachineFunction *F = BB->getParent();
7871     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7872     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
7873     F->insert(It, copy0MBB);
7874     F->insert(It, sinkMBB);
7875
7876     // Transfer the remainder of BB and its successor edges to sinkMBB.
7877     sinkMBB->splice(sinkMBB->begin(), BB,
7878                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
7879     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
7880
7881     BB->addSuccessor(copy0MBB);
7882     BB->addSuccessor(sinkMBB);
7883
7884     BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
7885       .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
7886
7887     //  copy0MBB:
7888     //   %FalseValue = ...
7889     //   # fallthrough to sinkMBB
7890     BB = copy0MBB;
7891
7892     // Update machine-CFG edges
7893     BB->addSuccessor(sinkMBB);
7894
7895     //  sinkMBB:
7896     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7897     //  ...
7898     BB = sinkMBB;
7899     BuildMI(*BB, BB->begin(), dl,
7900             TII->get(ARM::PHI), MI->getOperand(0).getReg())
7901       .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7902       .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7903
7904     MI->eraseFromParent();   // The pseudo instruction is gone now.
7905     return BB;
7906   }
7907
7908   case ARM::BCCi64:
7909   case ARM::BCCZi64: {
7910     // If there is an unconditional branch to the other successor, remove it.
7911     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
7912
7913     // Compare both parts that make up the double comparison separately for
7914     // equality.
7915     bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
7916
7917     unsigned LHS1 = MI->getOperand(1).getReg();
7918     unsigned LHS2 = MI->getOperand(2).getReg();
7919     if (RHSisZero) {
7920       AddDefaultPred(BuildMI(BB, dl,
7921                              TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7922                      .addReg(LHS1).addImm(0));
7923       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7924         .addReg(LHS2).addImm(0)
7925         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
7926     } else {
7927       unsigned RHS1 = MI->getOperand(3).getReg();
7928       unsigned RHS2 = MI->getOperand(4).getReg();
7929       AddDefaultPred(BuildMI(BB, dl,
7930                              TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
7931                      .addReg(LHS1).addReg(RHS1));
7932       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
7933         .addReg(LHS2).addReg(RHS2)
7934         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
7935     }
7936
7937     MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
7938     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
7939     if (MI->getOperand(0).getImm() == ARMCC::NE)
7940       std::swap(destMBB, exitMBB);
7941
7942     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
7943       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
7944     if (isThumb2)
7945       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
7946     else
7947       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
7948
7949     MI->eraseFromParent();   // The pseudo instruction is gone now.
7950     return BB;
7951   }
7952
7953   case ARM::Int_eh_sjlj_setjmp:
7954   case ARM::Int_eh_sjlj_setjmp_nofp:
7955   case ARM::tInt_eh_sjlj_setjmp:
7956   case ARM::t2Int_eh_sjlj_setjmp:
7957   case ARM::t2Int_eh_sjlj_setjmp_nofp:
7958     return BB;
7959
7960   case ARM::Int_eh_sjlj_setup_dispatch:
7961     EmitSjLjDispatchBlock(MI, BB);
7962     return BB;
7963
7964   case ARM::ABS:
7965   case ARM::t2ABS: {
7966     // To insert an ABS instruction, we have to insert the
7967     // diamond control-flow pattern.  The incoming instruction knows the
7968     // source vreg to test against 0, the destination vreg to set,
7969     // the condition code register to branch on, the
7970     // true/false values to select between, and a branch opcode to use.
7971     // It transforms
7972     //     V1 = ABS V0
7973     // into
7974     //     V2 = MOVS V0
7975     //     BCC                      (branch to SinkBB if V0 >= 0)
7976     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
7977     //     SinkBB: V1 = PHI(V2, V3)
7978     const BasicBlock *LLVM_BB = BB->getBasicBlock();
7979     MachineFunction::iterator BBI = BB;
7980     ++BBI;
7981     MachineFunction *Fn = BB->getParent();
7982     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
7983     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
7984     Fn->insert(BBI, RSBBB);
7985     Fn->insert(BBI, SinkBB);
7986
7987     unsigned int ABSSrcReg = MI->getOperand(1).getReg();
7988     unsigned int ABSDstReg = MI->getOperand(0).getReg();
7989     bool ABSSrcKIll = MI->getOperand(1).isKill();
7990     bool isThumb2 = Subtarget->isThumb2();
7991     MachineRegisterInfo &MRI = Fn->getRegInfo();
7992     // In Thumb mode S must not be specified if source register is the SP or
7993     // PC and if destination register is the SP, so restrict register class
7994     unsigned NewRsbDstReg =
7995       MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
7996
7997     // Transfer the remainder of BB and its successor edges to sinkMBB.
7998     SinkBB->splice(SinkBB->begin(), BB,
7999                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
8000     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
8001
8002     BB->addSuccessor(RSBBB);
8003     BB->addSuccessor(SinkBB);
8004
8005     // fall through to SinkMBB
8006     RSBBB->addSuccessor(SinkBB);
8007
8008     // insert a cmp at the end of BB
8009     AddDefaultPred(BuildMI(BB, dl,
8010                            TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8011                    .addReg(ABSSrcReg).addImm(0));
8012
8013     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
8014     BuildMI(BB, dl,
8015       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
8016       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
8017
8018     // insert rsbri in RSBBB
8019     // Note: BCC and rsbri will be converted into predicated rsbmi
8020     // by if-conversion pass
8021     BuildMI(*RSBBB, RSBBB->begin(), dl,
8022       TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
8023       .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
8024       .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
8025
8026     // insert PHI in SinkBB,
8027     // reuse ABSDstReg to not change uses of ABS instruction
8028     BuildMI(*SinkBB, SinkBB->begin(), dl,
8029       TII->get(ARM::PHI), ABSDstReg)
8030       .addReg(NewRsbDstReg).addMBB(RSBBB)
8031       .addReg(ABSSrcReg).addMBB(BB);
8032
8033     // remove ABS instruction
8034     MI->eraseFromParent();
8035
8036     // return last added BB
8037     return SinkBB;
8038   }
8039   case ARM::COPY_STRUCT_BYVAL_I32:
8040     ++NumLoopByVals;
8041     return EmitStructByval(MI, BB);
8042   case ARM::WIN__CHKSTK:
8043     return EmitLowered__chkstk(MI, BB);
8044   }
8045 }
8046
8047 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
8048                                                       SDNode *Node) const {
8049   const MCInstrDesc *MCID = &MI->getDesc();
8050   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
8051   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
8052   // operand is still set to noreg. If needed, set the optional operand's
8053   // register to CPSR, and remove the redundant implicit def.
8054   //
8055   // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
8056
8057   // Rename pseudo opcodes.
8058   unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
8059   if (NewOpc) {
8060     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
8061     MCID = &TII->get(NewOpc);
8062
8063     assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
8064            "converted opcode should be the same except for cc_out");
8065
8066     MI->setDesc(*MCID);
8067
8068     // Add the optional cc_out operand
8069     MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
8070   }
8071   unsigned ccOutIdx = MCID->getNumOperands() - 1;
8072
8073   // Any ARM instruction that sets the 's' bit should specify an optional
8074   // "cc_out" operand in the last operand position.
8075   if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
8076     assert(!NewOpc && "Optional cc_out operand required");
8077     return;
8078   }
8079   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
8080   // since we already have an optional CPSR def.
8081   bool definesCPSR = false;
8082   bool deadCPSR = false;
8083   for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
8084        i != e; ++i) {
8085     const MachineOperand &MO = MI->getOperand(i);
8086     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
8087       definesCPSR = true;
8088       if (MO.isDead())
8089         deadCPSR = true;
8090       MI->RemoveOperand(i);
8091       break;
8092     }
8093   }
8094   if (!definesCPSR) {
8095     assert(!NewOpc && "Optional cc_out operand required");
8096     return;
8097   }
8098   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
8099   if (deadCPSR) {
8100     assert(!MI->getOperand(ccOutIdx).getReg() &&
8101            "expect uninitialized optional cc_out operand");
8102     return;
8103   }
8104
8105   // If this instruction was defined with an optional CPSR def and its dag node
8106   // had a live implicit CPSR def, then activate the optional CPSR def.
8107   MachineOperand &MO = MI->getOperand(ccOutIdx);
8108   MO.setReg(ARM::CPSR);
8109   MO.setIsDef(true);
8110 }
8111
8112 //===----------------------------------------------------------------------===//
8113 //                           ARM Optimization Hooks
8114 //===----------------------------------------------------------------------===//
8115
8116 // Helper function that checks if N is a null or all ones constant.
8117 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
8118   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
8119   if (!C)
8120     return false;
8121   return AllOnes ? C->isAllOnesValue() : C->isNullValue();
8122 }
8123
8124 // Return true if N is conditionally 0 or all ones.
8125 // Detects these expressions where cc is an i1 value:
8126 //
8127 //   (select cc 0, y)   [AllOnes=0]
8128 //   (select cc y, 0)   [AllOnes=0]
8129 //   (zext cc)          [AllOnes=0]
8130 //   (sext cc)          [AllOnes=0/1]
8131 //   (select cc -1, y)  [AllOnes=1]
8132 //   (select cc y, -1)  [AllOnes=1]
8133 //
8134 // Invert is set when N is the null/all ones constant when CC is false.
8135 // OtherOp is set to the alternative value of N.
8136 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
8137                                        SDValue &CC, bool &Invert,
8138                                        SDValue &OtherOp,
8139                                        SelectionDAG &DAG) {
8140   switch (N->getOpcode()) {
8141   default: return false;
8142   case ISD::SELECT: {
8143     CC = N->getOperand(0);
8144     SDValue N1 = N->getOperand(1);
8145     SDValue N2 = N->getOperand(2);
8146     if (isZeroOrAllOnes(N1, AllOnes)) {
8147       Invert = false;
8148       OtherOp = N2;
8149       return true;
8150     }
8151     if (isZeroOrAllOnes(N2, AllOnes)) {
8152       Invert = true;
8153       OtherOp = N1;
8154       return true;
8155     }
8156     return false;
8157   }
8158   case ISD::ZERO_EXTEND:
8159     // (zext cc) can never be the all ones value.
8160     if (AllOnes)
8161       return false;
8162     // Fall through.
8163   case ISD::SIGN_EXTEND: {
8164     SDLoc dl(N);
8165     EVT VT = N->getValueType(0);
8166     CC = N->getOperand(0);
8167     if (CC.getValueType() != MVT::i1)
8168       return false;
8169     Invert = !AllOnes;
8170     if (AllOnes)
8171       // When looking for an AllOnes constant, N is an sext, and the 'other'
8172       // value is 0.
8173       OtherOp = DAG.getConstant(0, dl, VT);
8174     else if (N->getOpcode() == ISD::ZERO_EXTEND)
8175       // When looking for a 0 constant, N can be zext or sext.
8176       OtherOp = DAG.getConstant(1, dl, VT);
8177     else
8178       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
8179                                 VT);
8180     return true;
8181   }
8182   }
8183 }
8184
8185 // Combine a constant select operand into its use:
8186 //
8187 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
8188 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
8189 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
8190 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
8191 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
8192 //
8193 // The transform is rejected if the select doesn't have a constant operand that
8194 // is null, or all ones when AllOnes is set.
8195 //
8196 // Also recognize sext/zext from i1:
8197 //
8198 //   (add (zext cc), x) -> (select cc (add x, 1), x)
8199 //   (add (sext cc), x) -> (select cc (add x, -1), x)
8200 //
8201 // These transformations eventually create predicated instructions.
8202 //
8203 // @param N       The node to transform.
8204 // @param Slct    The N operand that is a select.
8205 // @param OtherOp The other N operand (x above).
8206 // @param DCI     Context.
8207 // @param AllOnes Require the select constant to be all ones instead of null.
8208 // @returns The new node, or SDValue() on failure.
8209 static
8210 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
8211                             TargetLowering::DAGCombinerInfo &DCI,
8212                             bool AllOnes = false) {
8213   SelectionDAG &DAG = DCI.DAG;
8214   EVT VT = N->getValueType(0);
8215   SDValue NonConstantVal;
8216   SDValue CCOp;
8217   bool SwapSelectOps;
8218   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
8219                                   NonConstantVal, DAG))
8220     return SDValue();
8221
8222   // Slct is now know to be the desired identity constant when CC is true.
8223   SDValue TrueVal = OtherOp;
8224   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
8225                                  OtherOp, NonConstantVal);
8226   // Unless SwapSelectOps says CC should be false.
8227   if (SwapSelectOps)
8228     std::swap(TrueVal, FalseVal);
8229
8230   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
8231                      CCOp, TrueVal, FalseVal);
8232 }
8233
8234 // Attempt combineSelectAndUse on each operand of a commutative operator N.
8235 static
8236 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
8237                                        TargetLowering::DAGCombinerInfo &DCI) {
8238   SDValue N0 = N->getOperand(0);
8239   SDValue N1 = N->getOperand(1);
8240   if (N0.getNode()->hasOneUse()) {
8241     SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
8242     if (Result.getNode())
8243       return Result;
8244   }
8245   if (N1.getNode()->hasOneUse()) {
8246     SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
8247     if (Result.getNode())
8248       return Result;
8249   }
8250   return SDValue();
8251 }
8252
8253 // AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
8254 // (only after legalization).
8255 static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
8256                                  TargetLowering::DAGCombinerInfo &DCI,
8257                                  const ARMSubtarget *Subtarget) {
8258
8259   // Only perform optimization if after legalize, and if NEON is available. We
8260   // also expected both operands to be BUILD_VECTORs.
8261   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
8262       || N0.getOpcode() != ISD::BUILD_VECTOR
8263       || N1.getOpcode() != ISD::BUILD_VECTOR)
8264     return SDValue();
8265
8266   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
8267   EVT VT = N->getValueType(0);
8268   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
8269     return SDValue();
8270
8271   // Check that the vector operands are of the right form.
8272   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
8273   // operands, where N is the size of the formed vector.
8274   // Each EXTRACT_VECTOR should have the same input vector and odd or even
8275   // index such that we have a pair wise add pattern.
8276
8277   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
8278   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8279     return SDValue();
8280   SDValue Vec = N0->getOperand(0)->getOperand(0);
8281   SDNode *V = Vec.getNode();
8282   unsigned nextIndex = 0;
8283
8284   // For each operands to the ADD which are BUILD_VECTORs,
8285   // check to see if each of their operands are an EXTRACT_VECTOR with
8286   // the same vector and appropriate index.
8287   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
8288     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
8289         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
8290
8291       SDValue ExtVec0 = N0->getOperand(i);
8292       SDValue ExtVec1 = N1->getOperand(i);
8293
8294       // First operand is the vector, verify its the same.
8295       if (V != ExtVec0->getOperand(0).getNode() ||
8296           V != ExtVec1->getOperand(0).getNode())
8297         return SDValue();
8298
8299       // Second is the constant, verify its correct.
8300       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
8301       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
8302
8303       // For the constant, we want to see all the even or all the odd.
8304       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
8305           || C1->getZExtValue() != nextIndex+1)
8306         return SDValue();
8307
8308       // Increment index.
8309       nextIndex+=2;
8310     } else
8311       return SDValue();
8312   }
8313
8314   // Create VPADDL node.
8315   SelectionDAG &DAG = DCI.DAG;
8316   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8317
8318   SDLoc dl(N);
8319
8320   // Build operand list.
8321   SmallVector<SDValue, 8> Ops;
8322   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
8323                                 TLI.getPointerTy(DAG.getDataLayout())));
8324
8325   // Input is the vector.
8326   Ops.push_back(Vec);
8327
8328   // Get widened type and narrowed type.
8329   MVT widenType;
8330   unsigned numElem = VT.getVectorNumElements();
8331
8332   EVT inputLaneType = Vec.getValueType().getVectorElementType();
8333   switch (inputLaneType.getSimpleVT().SimpleTy) {
8334     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
8335     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
8336     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
8337     default:
8338       llvm_unreachable("Invalid vector element type for padd optimization.");
8339   }
8340
8341   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
8342   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
8343   return DAG.getNode(ExtOp, dl, VT, tmp);
8344 }
8345
8346 static SDValue findMUL_LOHI(SDValue V) {
8347   if (V->getOpcode() == ISD::UMUL_LOHI ||
8348       V->getOpcode() == ISD::SMUL_LOHI)
8349     return V;
8350   return SDValue();
8351 }
8352
8353 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
8354                                      TargetLowering::DAGCombinerInfo &DCI,
8355                                      const ARMSubtarget *Subtarget) {
8356
8357   if (Subtarget->isThumb1Only()) return SDValue();
8358
8359   // Only perform the checks after legalize when the pattern is available.
8360   if (DCI.isBeforeLegalize()) return SDValue();
8361
8362   // Look for multiply add opportunities.
8363   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
8364   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
8365   // a glue link from the first add to the second add.
8366   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
8367   // a S/UMLAL instruction.
8368   //                  UMUL_LOHI
8369   //                 / :lo    \ :hi
8370   //                /          \          [no multiline comment]
8371   //    loAdd ->  ADDE         |
8372   //                 \ :glue  /
8373   //                  \      /
8374   //                    ADDC   <- hiAdd
8375   //
8376   assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
8377   SDValue AddcOp0 = AddcNode->getOperand(0);
8378   SDValue AddcOp1 = AddcNode->getOperand(1);
8379
8380   // Check if the two operands are from the same mul_lohi node.
8381   if (AddcOp0.getNode() == AddcOp1.getNode())
8382     return SDValue();
8383
8384   assert(AddcNode->getNumValues() == 2 &&
8385          AddcNode->getValueType(0) == MVT::i32 &&
8386          "Expect ADDC with two result values. First: i32");
8387
8388   // Check that we have a glued ADDC node.
8389   if (AddcNode->getValueType(1) != MVT::Glue)
8390     return SDValue();
8391
8392   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
8393   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
8394       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
8395       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
8396       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
8397     return SDValue();
8398
8399   // Look for the glued ADDE.
8400   SDNode* AddeNode = AddcNode->getGluedUser();
8401   if (!AddeNode)
8402     return SDValue();
8403
8404   // Make sure it is really an ADDE.
8405   if (AddeNode->getOpcode() != ISD::ADDE)
8406     return SDValue();
8407
8408   assert(AddeNode->getNumOperands() == 3 &&
8409          AddeNode->getOperand(2).getValueType() == MVT::Glue &&
8410          "ADDE node has the wrong inputs");
8411
8412   // Check for the triangle shape.
8413   SDValue AddeOp0 = AddeNode->getOperand(0);
8414   SDValue AddeOp1 = AddeNode->getOperand(1);
8415
8416   // Make sure that the ADDE operands are not coming from the same node.
8417   if (AddeOp0.getNode() == AddeOp1.getNode())
8418     return SDValue();
8419
8420   // Find the MUL_LOHI node walking up ADDE's operands.
8421   bool IsLeftOperandMUL = false;
8422   SDValue MULOp = findMUL_LOHI(AddeOp0);
8423   if (MULOp == SDValue())
8424    MULOp = findMUL_LOHI(AddeOp1);
8425   else
8426     IsLeftOperandMUL = true;
8427   if (MULOp == SDValue())
8428     return SDValue();
8429
8430   // Figure out the right opcode.
8431   unsigned Opc = MULOp->getOpcode();
8432   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
8433
8434   // Figure out the high and low input values to the MLAL node.
8435   SDValue* HiAdd = nullptr;
8436   SDValue* LoMul = nullptr;
8437   SDValue* LowAdd = nullptr;
8438
8439   // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
8440   if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
8441     return SDValue();
8442
8443   if (IsLeftOperandMUL)
8444     HiAdd = &AddeOp1;
8445   else
8446     HiAdd = &AddeOp0;
8447
8448
8449   // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
8450   // whose low result is fed to the ADDC we are checking.
8451
8452   if (AddcOp0 == MULOp.getValue(0)) {
8453     LoMul = &AddcOp0;
8454     LowAdd = &AddcOp1;
8455   }
8456   if (AddcOp1 == MULOp.getValue(0)) {
8457     LoMul = &AddcOp1;
8458     LowAdd = &AddcOp0;
8459   }
8460
8461   if (!LoMul)
8462     return SDValue();
8463
8464   // Create the merged node.
8465   SelectionDAG &DAG = DCI.DAG;
8466
8467   // Build operand list.
8468   SmallVector<SDValue, 8> Ops;
8469   Ops.push_back(LoMul->getOperand(0));
8470   Ops.push_back(LoMul->getOperand(1));
8471   Ops.push_back(*LowAdd);
8472   Ops.push_back(*HiAdd);
8473
8474   SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
8475                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
8476
8477   // Replace the ADDs' nodes uses by the MLA node's values.
8478   SDValue HiMLALResult(MLALNode.getNode(), 1);
8479   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
8480
8481   SDValue LoMLALResult(MLALNode.getNode(), 0);
8482   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
8483
8484   // Return original node to notify the driver to stop replacing.
8485   SDValue resNode(AddcNode, 0);
8486   return resNode;
8487 }
8488
8489 /// PerformADDCCombine - Target-specific dag combine transform from
8490 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
8491 static SDValue PerformADDCCombine(SDNode *N,
8492                                  TargetLowering::DAGCombinerInfo &DCI,
8493                                  const ARMSubtarget *Subtarget) {
8494
8495   return AddCombineTo64bitMLAL(N, DCI, Subtarget);
8496
8497 }
8498
8499 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
8500 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
8501 /// called with the default operands, and if that fails, with commuted
8502 /// operands.
8503 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
8504                                           TargetLowering::DAGCombinerInfo &DCI,
8505                                           const ARMSubtarget *Subtarget){
8506
8507   // Attempt to create vpaddl for this add.
8508   SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
8509   if (Result.getNode())
8510     return Result;
8511
8512   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
8513   if (N0.getNode()->hasOneUse()) {
8514     SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
8515     if (Result.getNode()) return Result;
8516   }
8517   return SDValue();
8518 }
8519
8520 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
8521 ///
8522 static SDValue PerformADDCombine(SDNode *N,
8523                                  TargetLowering::DAGCombinerInfo &DCI,
8524                                  const ARMSubtarget *Subtarget) {
8525   SDValue N0 = N->getOperand(0);
8526   SDValue N1 = N->getOperand(1);
8527
8528   // First try with the default operand order.
8529   SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
8530   if (Result.getNode())
8531     return Result;
8532
8533   // If that didn't work, try again with the operands commuted.
8534   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
8535 }
8536
8537 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
8538 ///
8539 static SDValue PerformSUBCombine(SDNode *N,
8540                                  TargetLowering::DAGCombinerInfo &DCI) {
8541   SDValue N0 = N->getOperand(0);
8542   SDValue N1 = N->getOperand(1);
8543
8544   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
8545   if (N1.getNode()->hasOneUse()) {
8546     SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
8547     if (Result.getNode()) return Result;
8548   }
8549
8550   return SDValue();
8551 }
8552
8553 /// PerformVMULCombine
8554 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
8555 /// special multiplier accumulator forwarding.
8556 ///   vmul d3, d0, d2
8557 ///   vmla d3, d1, d2
8558 /// is faster than
8559 ///   vadd d3, d0, d1
8560 ///   vmul d3, d3, d2
8561 //  However, for (A + B) * (A + B),
8562 //    vadd d2, d0, d1
8563 //    vmul d3, d0, d2
8564 //    vmla d3, d1, d2
8565 //  is slower than
8566 //    vadd d2, d0, d1
8567 //    vmul d3, d2, d2
8568 static SDValue PerformVMULCombine(SDNode *N,
8569                                   TargetLowering::DAGCombinerInfo &DCI,
8570                                   const ARMSubtarget *Subtarget) {
8571   if (!Subtarget->hasVMLxForwarding())
8572     return SDValue();
8573
8574   SelectionDAG &DAG = DCI.DAG;
8575   SDValue N0 = N->getOperand(0);
8576   SDValue N1 = N->getOperand(1);
8577   unsigned Opcode = N0.getOpcode();
8578   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8579       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
8580     Opcode = N1.getOpcode();
8581     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8582         Opcode != ISD::FADD && Opcode != ISD::FSUB)
8583       return SDValue();
8584     std::swap(N0, N1);
8585   }
8586
8587   if (N0 == N1)
8588     return SDValue();
8589
8590   EVT VT = N->getValueType(0);
8591   SDLoc DL(N);
8592   SDValue N00 = N0->getOperand(0);
8593   SDValue N01 = N0->getOperand(1);
8594   return DAG.getNode(Opcode, DL, VT,
8595                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
8596                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
8597 }
8598
8599 static SDValue PerformMULCombine(SDNode *N,
8600                                  TargetLowering::DAGCombinerInfo &DCI,
8601                                  const ARMSubtarget *Subtarget) {
8602   SelectionDAG &DAG = DCI.DAG;
8603
8604   if (Subtarget->isThumb1Only())
8605     return SDValue();
8606
8607   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8608     return SDValue();
8609
8610   EVT VT = N->getValueType(0);
8611   if (VT.is64BitVector() || VT.is128BitVector())
8612     return PerformVMULCombine(N, DCI, Subtarget);
8613   if (VT != MVT::i32)
8614     return SDValue();
8615
8616   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8617   if (!C)
8618     return SDValue();
8619
8620   int64_t MulAmt = C->getSExtValue();
8621   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
8622
8623   ShiftAmt = ShiftAmt & (32 - 1);
8624   SDValue V = N->getOperand(0);
8625   SDLoc DL(N);
8626
8627   SDValue Res;
8628   MulAmt >>= ShiftAmt;
8629
8630   if (MulAmt >= 0) {
8631     if (isPowerOf2_32(MulAmt - 1)) {
8632       // (mul x, 2^N + 1) => (add (shl x, N), x)
8633       Res = DAG.getNode(ISD::ADD, DL, VT,
8634                         V,
8635                         DAG.getNode(ISD::SHL, DL, VT,
8636                                     V,
8637                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
8638                                                     MVT::i32)));
8639     } else if (isPowerOf2_32(MulAmt + 1)) {
8640       // (mul x, 2^N - 1) => (sub (shl x, N), x)
8641       Res = DAG.getNode(ISD::SUB, DL, VT,
8642                         DAG.getNode(ISD::SHL, DL, VT,
8643                                     V,
8644                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
8645                                                     MVT::i32)),
8646                         V);
8647     } else
8648       return SDValue();
8649   } else {
8650     uint64_t MulAmtAbs = -MulAmt;
8651     if (isPowerOf2_32(MulAmtAbs + 1)) {
8652       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
8653       Res = DAG.getNode(ISD::SUB, DL, VT,
8654                         V,
8655                         DAG.getNode(ISD::SHL, DL, VT,
8656                                     V,
8657                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
8658                                                     MVT::i32)));
8659     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
8660       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
8661       Res = DAG.getNode(ISD::ADD, DL, VT,
8662                         V,
8663                         DAG.getNode(ISD::SHL, DL, VT,
8664                                     V,
8665                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
8666                                                     MVT::i32)));
8667       Res = DAG.getNode(ISD::SUB, DL, VT,
8668                         DAG.getConstant(0, DL, MVT::i32), Res);
8669
8670     } else
8671       return SDValue();
8672   }
8673
8674   if (ShiftAmt != 0)
8675     Res = DAG.getNode(ISD::SHL, DL, VT,
8676                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
8677
8678   // Do not add new nodes to DAG combiner worklist.
8679   DCI.CombineTo(N, Res, false);
8680   return SDValue();
8681 }
8682
8683 static SDValue PerformANDCombine(SDNode *N,
8684                                  TargetLowering::DAGCombinerInfo &DCI,
8685                                  const ARMSubtarget *Subtarget) {
8686
8687   // Attempt to use immediate-form VBIC
8688   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
8689   SDLoc dl(N);
8690   EVT VT = N->getValueType(0);
8691   SelectionDAG &DAG = DCI.DAG;
8692
8693   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8694     return SDValue();
8695
8696   APInt SplatBits, SplatUndef;
8697   unsigned SplatBitSize;
8698   bool HasAnyUndefs;
8699   if (BVN &&
8700       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
8701     if (SplatBitSize <= 64) {
8702       EVT VbicVT;
8703       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
8704                                       SplatUndef.getZExtValue(), SplatBitSize,
8705                                       DAG, dl, VbicVT, VT.is128BitVector(),
8706                                       OtherModImm);
8707       if (Val.getNode()) {
8708         SDValue Input =
8709           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
8710         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
8711         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
8712       }
8713     }
8714   }
8715
8716   if (!Subtarget->isThumb1Only()) {
8717     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
8718     SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
8719     if (Result.getNode())
8720       return Result;
8721   }
8722
8723   return SDValue();
8724 }
8725
8726 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
8727 static SDValue PerformORCombine(SDNode *N,
8728                                 TargetLowering::DAGCombinerInfo &DCI,
8729                                 const ARMSubtarget *Subtarget) {
8730   // Attempt to use immediate-form VORR
8731   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
8732   SDLoc dl(N);
8733   EVT VT = N->getValueType(0);
8734   SelectionDAG &DAG = DCI.DAG;
8735
8736   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8737     return SDValue();
8738
8739   APInt SplatBits, SplatUndef;
8740   unsigned SplatBitSize;
8741   bool HasAnyUndefs;
8742   if (BVN && Subtarget->hasNEON() &&
8743       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
8744     if (SplatBitSize <= 64) {
8745       EVT VorrVT;
8746       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
8747                                       SplatUndef.getZExtValue(), SplatBitSize,
8748                                       DAG, dl, VorrVT, VT.is128BitVector(),
8749                                       OtherModImm);
8750       if (Val.getNode()) {
8751         SDValue Input =
8752           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
8753         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
8754         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
8755       }
8756     }
8757   }
8758
8759   if (!Subtarget->isThumb1Only()) {
8760     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
8761     SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
8762     if (Result.getNode())
8763       return Result;
8764   }
8765
8766   // The code below optimizes (or (and X, Y), Z).
8767   // The AND operand needs to have a single user to make these optimizations
8768   // profitable.
8769   SDValue N0 = N->getOperand(0);
8770   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
8771     return SDValue();
8772   SDValue N1 = N->getOperand(1);
8773
8774   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
8775   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
8776       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
8777     APInt SplatUndef;
8778     unsigned SplatBitSize;
8779     bool HasAnyUndefs;
8780
8781     APInt SplatBits0, SplatBits1;
8782     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
8783     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
8784     // Ensure that the second operand of both ands are constants
8785     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
8786                                       HasAnyUndefs) && !HasAnyUndefs) {
8787         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
8788                                           HasAnyUndefs) && !HasAnyUndefs) {
8789             // Ensure that the bit width of the constants are the same and that
8790             // the splat arguments are logical inverses as per the pattern we
8791             // are trying to simplify.
8792             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
8793                 SplatBits0 == ~SplatBits1) {
8794                 // Canonicalize the vector type to make instruction selection
8795                 // simpler.
8796                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
8797                 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
8798                                              N0->getOperand(1),
8799                                              N0->getOperand(0),
8800                                              N1->getOperand(0));
8801                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
8802             }
8803         }
8804     }
8805   }
8806
8807   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
8808   // reasonable.
8809
8810   // BFI is only available on V6T2+
8811   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
8812     return SDValue();
8813
8814   SDLoc DL(N);
8815   // 1) or (and A, mask), val => ARMbfi A, val, mask
8816   //      iff (val & mask) == val
8817   //
8818   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
8819   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
8820   //          && mask == ~mask2
8821   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
8822   //          && ~mask == mask2
8823   //  (i.e., copy a bitfield value into another bitfield of the same width)
8824
8825   if (VT != MVT::i32)
8826     return SDValue();
8827
8828   SDValue N00 = N0.getOperand(0);
8829
8830   // The value and the mask need to be constants so we can verify this is
8831   // actually a bitfield set. If the mask is 0xffff, we can do better
8832   // via a movt instruction, so don't use BFI in that case.
8833   SDValue MaskOp = N0.getOperand(1);
8834   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
8835   if (!MaskC)
8836     return SDValue();
8837   unsigned Mask = MaskC->getZExtValue();
8838   if (Mask == 0xffff)
8839     return SDValue();
8840   SDValue Res;
8841   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
8842   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
8843   if (N1C) {
8844     unsigned Val = N1C->getZExtValue();
8845     if ((Val & ~Mask) != Val)
8846       return SDValue();
8847
8848     if (ARM::isBitFieldInvertedMask(Mask)) {
8849       Val >>= countTrailingZeros(~Mask);
8850
8851       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
8852                         DAG.getConstant(Val, DL, MVT::i32),
8853                         DAG.getConstant(Mask, DL, MVT::i32));
8854
8855       // Do not add new nodes to DAG combiner worklist.
8856       DCI.CombineTo(N, Res, false);
8857       return SDValue();
8858     }
8859   } else if (N1.getOpcode() == ISD::AND) {
8860     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
8861     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
8862     if (!N11C)
8863       return SDValue();
8864     unsigned Mask2 = N11C->getZExtValue();
8865
8866     // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
8867     // as is to match.
8868     if (ARM::isBitFieldInvertedMask(Mask) &&
8869         (Mask == ~Mask2)) {
8870       // The pack halfword instruction works better for masks that fit it,
8871       // so use that when it's available.
8872       if (Subtarget->hasT2ExtractPack() &&
8873           (Mask == 0xffff || Mask == 0xffff0000))
8874         return SDValue();
8875       // 2a
8876       unsigned amt = countTrailingZeros(Mask2);
8877       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
8878                         DAG.getConstant(amt, DL, MVT::i32));
8879       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
8880                         DAG.getConstant(Mask, DL, MVT::i32));
8881       // Do not add new nodes to DAG combiner worklist.
8882       DCI.CombineTo(N, Res, false);
8883       return SDValue();
8884     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
8885                (~Mask == Mask2)) {
8886       // The pack halfword instruction works better for masks that fit it,
8887       // so use that when it's available.
8888       if (Subtarget->hasT2ExtractPack() &&
8889           (Mask2 == 0xffff || Mask2 == 0xffff0000))
8890         return SDValue();
8891       // 2b
8892       unsigned lsb = countTrailingZeros(Mask);
8893       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
8894                         DAG.getConstant(lsb, DL, MVT::i32));
8895       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
8896                         DAG.getConstant(Mask2, DL, MVT::i32));
8897       // Do not add new nodes to DAG combiner worklist.
8898       DCI.CombineTo(N, Res, false);
8899       return SDValue();
8900     }
8901   }
8902
8903   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
8904       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
8905       ARM::isBitFieldInvertedMask(~Mask)) {
8906     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
8907     // where lsb(mask) == #shamt and masked bits of B are known zero.
8908     SDValue ShAmt = N00.getOperand(1);
8909     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
8910     unsigned LSB = countTrailingZeros(Mask);
8911     if (ShAmtC != LSB)
8912       return SDValue();
8913
8914     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
8915                       DAG.getConstant(~Mask, DL, MVT::i32));
8916
8917     // Do not add new nodes to DAG combiner worklist.
8918     DCI.CombineTo(N, Res, false);
8919   }
8920
8921   return SDValue();
8922 }
8923
8924 static SDValue PerformXORCombine(SDNode *N,
8925                                  TargetLowering::DAGCombinerInfo &DCI,
8926                                  const ARMSubtarget *Subtarget) {
8927   EVT VT = N->getValueType(0);
8928   SelectionDAG &DAG = DCI.DAG;
8929
8930   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8931     return SDValue();
8932
8933   if (!Subtarget->isThumb1Only()) {
8934     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
8935     SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
8936     if (Result.getNode())
8937       return Result;
8938   }
8939
8940   return SDValue();
8941 }
8942
8943 /// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
8944 /// the bits being cleared by the AND are not demanded by the BFI.
8945 static SDValue PerformBFICombine(SDNode *N,
8946                                  TargetLowering::DAGCombinerInfo &DCI) {
8947   SDValue N1 = N->getOperand(1);
8948   if (N1.getOpcode() == ISD::AND) {
8949     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
8950     if (!N11C)
8951       return SDValue();
8952     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
8953     unsigned LSB = countTrailingZeros(~InvMask);
8954     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
8955     assert(Width <
8956                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
8957            "undefined behavior");
8958     unsigned Mask = (1u << Width) - 1;
8959     unsigned Mask2 = N11C->getZExtValue();
8960     if ((Mask & (~Mask2)) == 0)
8961       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
8962                              N->getOperand(0), N1.getOperand(0),
8963                              N->getOperand(2));
8964   }
8965   return SDValue();
8966 }
8967
8968 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
8969 /// ARMISD::VMOVRRD.
8970 static SDValue PerformVMOVRRDCombine(SDNode *N,
8971                                      TargetLowering::DAGCombinerInfo &DCI,
8972                                      const ARMSubtarget *Subtarget) {
8973   // vmovrrd(vmovdrr x, y) -> x,y
8974   SDValue InDouble = N->getOperand(0);
8975   if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
8976     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
8977
8978   // vmovrrd(load f64) -> (load i32), (load i32)
8979   SDNode *InNode = InDouble.getNode();
8980   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
8981       InNode->getValueType(0) == MVT::f64 &&
8982       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
8983       !cast<LoadSDNode>(InNode)->isVolatile()) {
8984     // TODO: Should this be done for non-FrameIndex operands?
8985     LoadSDNode *LD = cast<LoadSDNode>(InNode);
8986
8987     SelectionDAG &DAG = DCI.DAG;
8988     SDLoc DL(LD);
8989     SDValue BasePtr = LD->getBasePtr();
8990     SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
8991                                  LD->getPointerInfo(), LD->isVolatile(),
8992                                  LD->isNonTemporal(), LD->isInvariant(),
8993                                  LD->getAlignment());
8994
8995     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
8996                                     DAG.getConstant(4, DL, MVT::i32));
8997     SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
8998                                  LD->getPointerInfo(), LD->isVolatile(),
8999                                  LD->isNonTemporal(), LD->isInvariant(),
9000                                  std::min(4U, LD->getAlignment() / 2));
9001
9002     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
9003     if (DCI.DAG.getDataLayout().isBigEndian())
9004       std::swap (NewLD1, NewLD2);
9005     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
9006     return Result;
9007   }
9008
9009   return SDValue();
9010 }
9011
9012 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
9013 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
9014 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
9015   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
9016   SDValue Op0 = N->getOperand(0);
9017   SDValue Op1 = N->getOperand(1);
9018   if (Op0.getOpcode() == ISD::BITCAST)
9019     Op0 = Op0.getOperand(0);
9020   if (Op1.getOpcode() == ISD::BITCAST)
9021     Op1 = Op1.getOperand(0);
9022   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
9023       Op0.getNode() == Op1.getNode() &&
9024       Op0.getResNo() == 0 && Op1.getResNo() == 1)
9025     return DAG.getNode(ISD::BITCAST, SDLoc(N),
9026                        N->getValueType(0), Op0.getOperand(0));
9027   return SDValue();
9028 }
9029
9030 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
9031 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
9032 /// i64 vector to have f64 elements, since the value can then be loaded
9033 /// directly into a VFP register.
9034 static bool hasNormalLoadOperand(SDNode *N) {
9035   unsigned NumElts = N->getValueType(0).getVectorNumElements();
9036   for (unsigned i = 0; i < NumElts; ++i) {
9037     SDNode *Elt = N->getOperand(i).getNode();
9038     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
9039       return true;
9040   }
9041   return false;
9042 }
9043
9044 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
9045 /// ISD::BUILD_VECTOR.
9046 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
9047                                           TargetLowering::DAGCombinerInfo &DCI,
9048                                           const ARMSubtarget *Subtarget) {
9049   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
9050   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
9051   // into a pair of GPRs, which is fine when the value is used as a scalar,
9052   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
9053   SelectionDAG &DAG = DCI.DAG;
9054   if (N->getNumOperands() == 2) {
9055     SDValue RV = PerformVMOVDRRCombine(N, DAG);
9056     if (RV.getNode())
9057       return RV;
9058   }
9059
9060   // Load i64 elements as f64 values so that type legalization does not split
9061   // them up into i32 values.
9062   EVT VT = N->getValueType(0);
9063   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
9064     return SDValue();
9065   SDLoc dl(N);
9066   SmallVector<SDValue, 8> Ops;
9067   unsigned NumElts = VT.getVectorNumElements();
9068   for (unsigned i = 0; i < NumElts; ++i) {
9069     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
9070     Ops.push_back(V);
9071     // Make the DAGCombiner fold the bitcast.
9072     DCI.AddToWorklist(V.getNode());
9073   }
9074   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
9075   SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops);
9076   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
9077 }
9078
9079 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
9080 static SDValue
9081 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
9082   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
9083   // At that time, we may have inserted bitcasts from integer to float.
9084   // If these bitcasts have survived DAGCombine, change the lowering of this
9085   // BUILD_VECTOR in something more vector friendly, i.e., that does not
9086   // force to use floating point types.
9087
9088   // Make sure we can change the type of the vector.
9089   // This is possible iff:
9090   // 1. The vector is only used in a bitcast to a integer type. I.e.,
9091   //    1.1. Vector is used only once.
9092   //    1.2. Use is a bit convert to an integer type.
9093   // 2. The size of its operands are 32-bits (64-bits are not legal).
9094   EVT VT = N->getValueType(0);
9095   EVT EltVT = VT.getVectorElementType();
9096
9097   // Check 1.1. and 2.
9098   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
9099     return SDValue();
9100
9101   // By construction, the input type must be float.
9102   assert(EltVT == MVT::f32 && "Unexpected type!");
9103
9104   // Check 1.2.
9105   SDNode *Use = *N->use_begin();
9106   if (Use->getOpcode() != ISD::BITCAST ||
9107       Use->getValueType(0).isFloatingPoint())
9108     return SDValue();
9109
9110   // Check profitability.
9111   // Model is, if more than half of the relevant operands are bitcast from
9112   // i32, turn the build_vector into a sequence of insert_vector_elt.
9113   // Relevant operands are everything that is not statically
9114   // (i.e., at compile time) bitcasted.
9115   unsigned NumOfBitCastedElts = 0;
9116   unsigned NumElts = VT.getVectorNumElements();
9117   unsigned NumOfRelevantElts = NumElts;
9118   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
9119     SDValue Elt = N->getOperand(Idx);
9120     if (Elt->getOpcode() == ISD::BITCAST) {
9121       // Assume only bit cast to i32 will go away.
9122       if (Elt->getOperand(0).getValueType() == MVT::i32)
9123         ++NumOfBitCastedElts;
9124     } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
9125       // Constants are statically casted, thus do not count them as
9126       // relevant operands.
9127       --NumOfRelevantElts;
9128   }
9129
9130   // Check if more than half of the elements require a non-free bitcast.
9131   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
9132     return SDValue();
9133
9134   SelectionDAG &DAG = DCI.DAG;
9135   // Create the new vector type.
9136   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
9137   // Check if the type is legal.
9138   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9139   if (!TLI.isTypeLegal(VecVT))
9140     return SDValue();
9141
9142   // Combine:
9143   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
9144   // => BITCAST INSERT_VECTOR_ELT
9145   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
9146   //                      (BITCAST EN), N.
9147   SDValue Vec = DAG.getUNDEF(VecVT);
9148   SDLoc dl(N);
9149   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
9150     SDValue V = N->getOperand(Idx);
9151     if (V.getOpcode() == ISD::UNDEF)
9152       continue;
9153     if (V.getOpcode() == ISD::BITCAST &&
9154         V->getOperand(0).getValueType() == MVT::i32)
9155       // Fold obvious case.
9156       V = V.getOperand(0);
9157     else {
9158       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
9159       // Make the DAGCombiner fold the bitcasts.
9160       DCI.AddToWorklist(V.getNode());
9161     }
9162     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
9163     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
9164   }
9165   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
9166   // Make the DAGCombiner fold the bitcasts.
9167   DCI.AddToWorklist(Vec.getNode());
9168   return Vec;
9169 }
9170
9171 /// PerformInsertEltCombine - Target-specific dag combine xforms for
9172 /// ISD::INSERT_VECTOR_ELT.
9173 static SDValue PerformInsertEltCombine(SDNode *N,
9174                                        TargetLowering::DAGCombinerInfo &DCI) {
9175   // Bitcast an i64 load inserted into a vector to f64.
9176   // Otherwise, the i64 value will be legalized to a pair of i32 values.
9177   EVT VT = N->getValueType(0);
9178   SDNode *Elt = N->getOperand(1).getNode();
9179   if (VT.getVectorElementType() != MVT::i64 ||
9180       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
9181     return SDValue();
9182
9183   SelectionDAG &DAG = DCI.DAG;
9184   SDLoc dl(N);
9185   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
9186                                  VT.getVectorNumElements());
9187   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
9188   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
9189   // Make the DAGCombiner fold the bitcasts.
9190   DCI.AddToWorklist(Vec.getNode());
9191   DCI.AddToWorklist(V.getNode());
9192   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
9193                                Vec, V, N->getOperand(2));
9194   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
9195 }
9196
9197 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
9198 /// ISD::VECTOR_SHUFFLE.
9199 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
9200   // The LLVM shufflevector instruction does not require the shuffle mask
9201   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
9202   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
9203   // operands do not match the mask length, they are extended by concatenating
9204   // them with undef vectors.  That is probably the right thing for other
9205   // targets, but for NEON it is better to concatenate two double-register
9206   // size vector operands into a single quad-register size vector.  Do that
9207   // transformation here:
9208   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
9209   //   shuffle(concat(v1, v2), undef)
9210   SDValue Op0 = N->getOperand(0);
9211   SDValue Op1 = N->getOperand(1);
9212   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
9213       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
9214       Op0.getNumOperands() != 2 ||
9215       Op1.getNumOperands() != 2)
9216     return SDValue();
9217   SDValue Concat0Op1 = Op0.getOperand(1);
9218   SDValue Concat1Op1 = Op1.getOperand(1);
9219   if (Concat0Op1.getOpcode() != ISD::UNDEF ||
9220       Concat1Op1.getOpcode() != ISD::UNDEF)
9221     return SDValue();
9222   // Skip the transformation if any of the types are illegal.
9223   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9224   EVT VT = N->getValueType(0);
9225   if (!TLI.isTypeLegal(VT) ||
9226       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
9227       !TLI.isTypeLegal(Concat1Op1.getValueType()))
9228     return SDValue();
9229
9230   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
9231                                   Op0.getOperand(0), Op1.getOperand(0));
9232   // Translate the shuffle mask.
9233   SmallVector<int, 16> NewMask;
9234   unsigned NumElts = VT.getVectorNumElements();
9235   unsigned HalfElts = NumElts/2;
9236   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
9237   for (unsigned n = 0; n < NumElts; ++n) {
9238     int MaskElt = SVN->getMaskElt(n);
9239     int NewElt = -1;
9240     if (MaskElt < (int)HalfElts)
9241       NewElt = MaskElt;
9242     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
9243       NewElt = HalfElts + MaskElt - NumElts;
9244     NewMask.push_back(NewElt);
9245   }
9246   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
9247                               DAG.getUNDEF(VT), NewMask.data());
9248 }
9249
9250 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
9251 /// NEON load/store intrinsics, and generic vector load/stores, to merge
9252 /// base address updates.
9253 /// For generic load/stores, the memory type is assumed to be a vector.
9254 /// The caller is assumed to have checked legality.
     ///
     /// Scans the users of N's address operand for an ISD::ADD that is independent
     /// of N.  If one qualifies, N and that ADD are both replaced (through
     /// DCI.CombineTo) by a single ARMISD::*_UPD node that also produces the
     /// incremented address.  At most one ADD is folded.
     ///
     /// This function always returns a null SDValue; when a fold happened, the DAG
     /// has already been updated by the time it returns.
9255 static SDValue CombineBaseUpdate(SDNode *N,
9256                                  TargetLowering::DAGCombinerInfo &DCI) {
9257   SelectionDAG &DAG = DCI.DAG;
9258   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
9259                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
9260   const bool isStore = N->getOpcode() == ISD::STORE;
       // Intrinsic nodes carry the intrinsic ID in operand 1 and stores carry the
       // stored value there (both are read from operand 1 further down), so their
       // address is operand 2.  Every other node handled here has it in operand 1.
9261   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
9262   SDValue Addr = N->getOperand(AddrOpIdx);
9263   MemSDNode *MemN = cast<MemSDNode>(N);
9264   SDLoc dl(N);
9265
9266   // Search for a use of the address operand that is an increment.
9267   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
9268          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
9269     SDNode *User = *UI;
         // Only ADDs that consume the same result number as the address qualify.
9270     if (User->getOpcode() != ISD::ADD ||
9271         UI.getUse().getResNo() != Addr.getResNo())
9272       continue;
9273
9274     // Check that the add is independent of the load/store.  Otherwise, folding
9275     // it would create a cycle.
9276     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
9277       continue;
9278
9279     // Find the new opcode for the updating load/store.
9280     bool isLoadOp = true;
9281     bool isLaneOp = false;
9282     unsigned NewOpc = 0;
9283     unsigned NumVecs = 0;
9284     if (isIntrinsic) {
9285       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
9286       switch (IntNo) {
9287       default: llvm_unreachable("unexpected intrinsic for Neon base update");
9288       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
9289         NumVecs = 1; break;
9290       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
9291         NumVecs = 2; break;
9292       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
9293         NumVecs = 3; break;
9294       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
9295         NumVecs = 4; break;
9296       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
9297         NumVecs = 2; isLaneOp = true; break;
9298       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
9299         NumVecs = 3; isLaneOp = true; break;
9300       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
9301         NumVecs = 4; isLaneOp = true; break;
9302       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
9303         NumVecs = 1; isLoadOp = false; break;
9304       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
9305         NumVecs = 2; isLoadOp = false; break;
9306       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
9307         NumVecs = 3; isLoadOp = false; break;
9308       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
9309         NumVecs = 4; isLoadOp = false; break;
9310       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
9311         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
9312       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
9313         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
9314       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
9315         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
9316       }
9317     } else {
           // The VLDxDUP nodes are sized like lane operations; the generic
           // load/store cases below reset the flag.
9318       isLaneOp = true;
9319       switch (N->getOpcode()) {
9320       default: llvm_unreachable("unexpected opcode for Neon base update");
9321       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
9322       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
9323       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
9324       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
9325         NumVecs = 1; isLaneOp = false; break;
9326       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
9327         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
9328       }
9329     }
9330
9331     // Find the size of memory referenced by the load/store.
9332     EVT VecTy;
9333     if (isLoadOp) {
9334       VecTy = N->getValueType(0);
9335     } else if (isIntrinsic) {
           // For store intrinsics the first stored vector follows the address.
9336       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
9337     } else {
9338       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
9339       VecTy = N->getOperand(1).getValueType();
9340     }
9341
9342     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
         // For lane/dup operations the size is one element per vector rather than
         // one whole vector each.
9343     if (isLaneOp)
9344       NumBytes /= VecTy.getVectorNumElements();
9345
9346     // If the increment is a constant, it must match the memory ref size.
         // The increment is whichever ADD operand is not the address.
9347     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
9348     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
9349       uint64_t IncVal = CInc->getZExtValue();
9350       if (IncVal != NumBytes)
9351         continue;
9352     } else if (NumBytes >= 3 * 16) {
9353       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
9354       // separate instructions that make it harder to use a non-constant update.
9355       continue;
9356     }
9357
9358     // OK, we found an ADD we can fold into the base update.
9359     // Now, create a _UPD node, taking care of not breaking alignment.
9360
9361     EVT AlignedVecTy = VecTy;
9362     unsigned Alignment = MemN->getAlignment();
9363
9364     // If this is a less-than-standard-aligned load/store, change the type to
9365     // match the standard alignment.
9366     // The alignment is overlooked when selecting _UPD variants; and it's
9367     // easier to introduce bitcasts here than fix that.
9368     // There are 3 ways to get to this base-update combine:
9369     // - intrinsics: they are assumed to be properly aligned (to the standard
9370     //   alignment of the memory type), so we don't need to do anything.
9371     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
9372     //   intrinsics, so, likewise, there's nothing to do.
9373     // - generic load/store instructions: the alignment is specified as an
9374     //   explicit operand, rather than implicitly as the standard alignment
9375     //   of the memory type (like the intrinsics).  We need to change the
9376     //   memory type to match the explicit alignment.  That way, we don't
9377     //   generate non-standard-aligned ARMISD::VLDx nodes.
9378     if (isa<LSBaseSDNode>(N)) {
9379       if (Alignment == 0)
9380         Alignment = 1;
9381       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
             // Re-express the access as a vector of Alignment-sized integer
             // elements covering the same number of bytes.
9382         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
9383         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
9384         assert(!isLaneOp && "Unexpected generic load/store lane.");
9385         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
9386         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
9387       }
9388       // Don't set an explicit alignment on regular load/stores that we want
9389       // to transform to VLD/VST 1_UPD nodes.
9390       // This matches the behavior of regular load/stores, which only get an
9391       // explicit alignment if the MMO alignment is larger than the standard
9392       // alignment of the memory type.
9393       // Intrinsics, however, always get an explicit alignment, set to the
9394       // alignment of the MMO.
9395       Alignment = 1;
9396     }
9397
9398     // Create the new updating load/store node.
9399     // First, create an SDVTList for the new updating node's results.
         // Result layout: NumResultVecs vectors (loads only), then the updated
         // address as i32, then the chain.  With NumVecs <= 4 that is at most 6.
9400     EVT Tys[6];
9401     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
9402     unsigned n;
9403     for (n = 0; n < NumResultVecs; ++n)
9404       Tys[n] = AlignedVecTy;
9405     Tys[n++] = MVT::i32;
9406     Tys[n] = MVT::Other;
9407     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
9408
9409     // Then, gather the new node's operands.
9410     SmallVector<SDValue, 8> Ops;
9411     Ops.push_back(N->getOperand(0)); // incoming chain
9412     Ops.push_back(N->getOperand(AddrOpIdx));
9413     Ops.push_back(Inc);
9414
9415     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
9416       // Try to match the intrinsic's signature
9417       Ops.push_back(StN->getValue());
9418     } else {
9419       // Loads (and of course intrinsics) match the intrinsics' signature,
9420       // so just add all but the alignment operand.
9421       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
9422         Ops.push_back(N->getOperand(i));
9423     }
9424
9425     // For all node types, the alignment operand is always the last one.
9426     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
9427
9428     // If this is a non-standard-aligned STORE, the penultimate operand is the
9429     // stored value.  Bitcast it to the aligned type.
9430     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
9431       SDValue &StVal = Ops[Ops.size()-2];
9432       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
9433     }
9434
         // The new node reuses the original node's memory operand.
9435     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
9436                                            Ops, AlignedVecTy,
9437                                            MemN->getMemOperand());
9438
9439     // Update the uses.
9440     SmallVector<SDValue, 5> NewResults;
9441     for (unsigned i = 0; i < NumResultVecs; ++i)
9442       NewResults.push_back(SDValue(UpdN.getNode(), i));
9443
9444     // If this is an non-standard-aligned LOAD, the first result is the loaded
9445     // value.  Bitcast it to the expected result type.
9446     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
9447       SDValue &LdVal = NewResults[0];
9448       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
9449     }
9450
         // N's results map onto the new node's vector results plus its chain; the
         // ADD is replaced by the i32 updated-address result in between.
9451     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
9452     DCI.CombineTo(N, NewResults);
9453     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
9454
         // Only one increment is folded per node.
9455     break;
9456   }
       // Any replacement was already applied through DCI.CombineTo above.
9457   return SDValue();
9458 }
9459
9460 static SDValue PerformVLDCombine(SDNode *N,
9461                                  TargetLowering::DAGCombinerInfo &DCI) {
9462   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9463     return SDValue();
9464
9465   return CombineBaseUpdate(N, DCI);
9466 }
9467
9468 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
9469 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
9470 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
9471 /// return true.
9472 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
9473   SelectionDAG &DAG = DCI.DAG;
9474   EVT VT = N->getValueType(0);
9475   // vldN-dup instructions only support 64-bit vectors for N > 1.
9476   if (!VT.is64BitVector())
9477     return false;
9478
9479   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
9480   SDNode *VLD = N->getOperand(0).getNode();
9481   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
9482     return false;
9483   unsigned NumVecs = 0;
9484   unsigned NewOpc = 0;
9485   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
9486   if (IntNo == Intrinsic::arm_neon_vld2lane) {
9487     NumVecs = 2;
9488     NewOpc = ARMISD::VLD2DUP;
9489   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
9490     NumVecs = 3;
9491     NewOpc = ARMISD::VLD3DUP;
9492   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
9493     NumVecs = 4;
9494     NewOpc = ARMISD::VLD4DUP;
9495   } else {
9496     return false;
9497   }
9498
9499   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
9500   // numbers match the load.
9501   unsigned VLDLaneNo =
9502     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
9503   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
9504        UI != UE; ++UI) {
9505     // Ignore uses of the chain result.
9506     if (UI.getUse().getResNo() == NumVecs)
9507       continue;
9508     SDNode *User = *UI;
9509     if (User->getOpcode() != ARMISD::VDUPLANE ||
9510         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
9511       return false;
9512   }
9513
9514   // Create the vldN-dup node.
9515   EVT Tys[5];
9516   unsigned n;
9517   for (n = 0; n < NumVecs; ++n)
9518     Tys[n] = VT;
9519   Tys[n] = MVT::Other;
9520   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
9521   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
9522   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
9523   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
9524                                            Ops, VLDMemInt->getMemoryVT(),
9525                                            VLDMemInt->getMemOperand());
9526
9527   // Update the uses.
9528   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
9529        UI != UE; ++UI) {
9530     unsigned ResNo = UI.getUse().getResNo();
9531     // Ignore uses of the chain result.
9532     if (ResNo == NumVecs)
9533       continue;
9534     SDNode *User = *UI;
9535     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
9536   }
9537
9538   // Now the vldN-lane intrinsic is dead except for its chain result.
9539   // Update uses of the chain.
9540   std::vector<SDValue> VLDDupResults;
9541   for (unsigned n = 0; n < NumVecs; ++n)
9542     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
9543   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
9544   DCI.CombineTo(VLD, VLDDupResults);
9545
9546   return true;
9547 }
9548
9549 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
9550 /// ARMISD::VDUPLANE.
9551 static SDValue PerformVDUPLANECombine(SDNode *N,
9552                                       TargetLowering::DAGCombinerInfo &DCI) {
9553   SDValue Op = N->getOperand(0);
9554
9555   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
9556   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
9557   if (CombineVLDDUP(N, DCI))
9558     return SDValue(N, 0);
9559
9560   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
9561   // redundant.  Ignore bit_converts for now; element sizes are checked below.
9562   while (Op.getOpcode() == ISD::BITCAST)
9563     Op = Op.getOperand(0);
9564   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
9565     return SDValue();
9566
9567   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
9568   unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
9569   // The canonical VMOV for a zero vector uses a 32-bit element size.
9570   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9571   unsigned EltBits;
9572   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
9573     EltSize = 8;
9574   EVT VT = N->getValueType(0);
9575   if (EltSize > VT.getVectorElementType().getSizeInBits())
9576     return SDValue();
9577
9578   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
9579 }
9580
9581 static SDValue PerformLOADCombine(SDNode *N,
9582                                   TargetLowering::DAGCombinerInfo &DCI) {
9583   EVT VT = N->getValueType(0);
9584
9585   // If this is a legal vector load, try to combine it into a VLD1_UPD.
9586   if (ISD::isNormalLoad(N) && VT.isVector() &&
9587       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
9588     return CombineBaseUpdate(N, DCI);
9589
9590   return SDValue();
9591 }
9592
9593 /// PerformSTORECombine - Target-specific dag combine xforms for
9594 /// ISD::STORE.
9595 static SDValue PerformSTORECombine(SDNode *N,
9596                                    TargetLowering::DAGCombinerInfo &DCI) {
9597   StoreSDNode *St = cast<StoreSDNode>(N);
9598   if (St->isVolatile())
9599     return SDValue();
9600
9601   // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
9602   // pack all of the elements in one place.  Next, store to memory in fewer
9603   // chunks.
9604   SDValue StVal = St->getValue();
9605   EVT VT = StVal.getValueType();
9606   if (St->isTruncatingStore() && VT.isVector()) {
9607     SelectionDAG &DAG = DCI.DAG;
9608     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9609     EVT StVT = St->getMemoryVT();
9610     unsigned NumElems = VT.getVectorNumElements();
9611     assert(StVT != VT && "Cannot truncate to the same type");
9612     unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
9613     unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
9614
9615     // From, To sizes and ElemCount must be pow of two
9616     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
9617
9618     // We are going to use the original vector elt for storing.
9619     // Accumulated smaller vector elements must be a multiple of the store size.
9620     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
9621
9622     unsigned SizeRatio  = FromEltSz / ToEltSz;
9623     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
9624
9625     // Create a type on which we perform the shuffle.
9626     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
9627                                      NumElems*SizeRatio);
9628     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
9629
9630     SDLoc DL(St);
9631     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
9632     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
9633     for (unsigned i = 0; i < NumElems; ++i)
9634       ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
9635                           ? (i + 1) * SizeRatio - 1
9636                           : i * SizeRatio;
9637
9638     // Can't shuffle using an illegal type.
9639     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
9640
9641     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
9642                                 DAG.getUNDEF(WideVec.getValueType()),
9643                                 ShuffleVec.data());
9644     // At this point all of the data is stored at the bottom of the
9645     // register. We now need to save it to mem.
9646
9647     // Find the largest store unit
9648     MVT StoreType = MVT::i8;
9649     for (MVT Tp : MVT::integer_valuetypes()) {
9650       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
9651         StoreType = Tp;
9652     }
9653     // Didn't find a legal store type.
9654     if (!TLI.isTypeLegal(StoreType))
9655       return SDValue();
9656
9657     // Bitcast the original vector into a vector of store-size units
9658     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
9659             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
9660     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
9661     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
9662     SmallVector<SDValue, 8> Chains;
9663     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
9664                                         TLI.getPointerTy(DAG.getDataLayout()));
9665     SDValue BasePtr = St->getBasePtr();
9666
9667     // Perform one or more big stores into memory.
9668     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
9669     for (unsigned I = 0; I < E; I++) {
9670       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
9671                                    StoreType, ShuffWide,
9672                                    DAG.getIntPtrConstant(I, DL));
9673       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
9674                                 St->getPointerInfo(), St->isVolatile(),
9675                                 St->isNonTemporal(), St->getAlignment());
9676       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
9677                             Increment);
9678       Chains.push_back(Ch);
9679     }
9680     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
9681   }
9682
9683   if (!ISD::isNormalStore(St))
9684     return SDValue();
9685
9686   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
9687   // ARM stores of arguments in the same cache line.
9688   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
9689       StVal.getNode()->hasOneUse()) {
9690     SelectionDAG  &DAG = DCI.DAG;
9691     bool isBigEndian = DAG.getDataLayout().isBigEndian();
9692     SDLoc DL(St);
9693     SDValue BasePtr = St->getBasePtr();
9694     SDValue NewST1 = DAG.getStore(St->getChain(), DL,
9695                                   StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
9696                                   BasePtr, St->getPointerInfo(), St->isVolatile(),
9697                                   St->isNonTemporal(), St->getAlignment());
9698
9699     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
9700                                     DAG.getConstant(4, DL, MVT::i32));
9701     return DAG.getStore(NewST1.getValue(0), DL,
9702                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
9703                         OffsetPtr, St->getPointerInfo(), St->isVolatile(),
9704                         St->isNonTemporal(),
9705                         std::min(4U, St->getAlignment() / 2));
9706   }
9707
9708   if (StVal.getValueType() == MVT::i64 &&
9709       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9710
9711     // Bitcast an i64 store extracted from a vector to f64.
9712     // Otherwise, the i64 value will be legalized to a pair of i32 values.
9713     SelectionDAG &DAG = DCI.DAG;
9714     SDLoc dl(StVal);
9715     SDValue IntVec = StVal.getOperand(0);
9716     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
9717                                    IntVec.getValueType().getVectorNumElements());
9718     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
9719     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
9720                                  Vec, StVal.getOperand(1));
9721     dl = SDLoc(N);
9722     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
9723     // Make the DAGCombiner fold the bitcasts.
9724     DCI.AddToWorklist(Vec.getNode());
9725     DCI.AddToWorklist(ExtElt.getNode());
9726     DCI.AddToWorklist(V.getNode());
9727     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
9728                         St->getPointerInfo(), St->isVolatile(),
9729                         St->isNonTemporal(), St->getAlignment(),
9730                         St->getAAInfo());
9731   }
9732
9733   // If this is a legal vector store, try to combine it into a VST1_UPD.
9734   if (ISD::isNormalStore(N) && VT.isVector() &&
9735       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
9736     return CombineBaseUpdate(N, DCI);
9737
9738   return SDValue();
9739 }
9740
9741 // isConstVecPow2 - Return true if each vector element is a power of 2, all
9742 // elements are the same constant, C, and Log2(C) ranges from 1 to 32.
9743 static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
9744 {
9745   integerPart cN;
9746   integerPart c0 = 0;
9747   for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
9748        I != E; I++) {
9749     ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
9750     if (!C)
9751       return false;
9752
9753     bool isExact;
9754     APFloat APF = C->getValueAPF();
9755     if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
9756         != APFloat::opOK || !isExact)
9757       return false;
9758
9759     c0 = (I == 0) ? cN : c0;
9760     if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
9761       return false;
9762   }
9763   C = c0;
9764   return true;
9765 }
9766
9767 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
9768 /// can replace combinations of VMUL and VCVT (floating-point to integer)
9769 /// when the VMUL has a constant operand that is a power of 2.
9770 ///
9771 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
9772 ///  vmul.f32        d16, d17, d16
9773 ///  vcvt.s32.f32    d16, d16
9774 /// becomes:
9775 ///  vcvt.s32.f32    d16, d16, #3
9776 static SDValue PerformVCVTCombine(SDNode *N,
9777                                   TargetLowering::DAGCombinerInfo &DCI,
9778                                   const ARMSubtarget *Subtarget) {
9779   SelectionDAG &DAG = DCI.DAG;
9780   SDValue Op = N->getOperand(0);
9781
9782   if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
9783       Op.getOpcode() != ISD::FMUL)
9784     return SDValue();
9785
9786   uint64_t C;
9787   SDValue N0 = Op->getOperand(0);
9788   SDValue ConstVec = Op->getOperand(1);
9789   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
9790
9791   if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
9792       !isConstVecPow2(ConstVec, isSigned, C))
9793     return SDValue();
9794
9795   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
9796   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
9797   unsigned NumLanes = Op.getValueType().getVectorNumElements();
9798   if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 ||
9799       NumLanes > 4) {
9800     // These instructions only exist converting from f32 to i32. We can handle
9801     // smaller integers by generating an extra truncate, but larger ones would
9802     // be lossy. We also can't handle more then 4 lanes, since these intructions
9803     // only support v2i32/v4i32 types.
9804     return SDValue();
9805   }
9806
9807   SDLoc dl(N);
9808   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
9809     Intrinsic::arm_neon_vcvtfp2fxu;
9810   SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
9811                                  NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
9812                                  DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
9813                                  N0,
9814                                  DAG.getConstant(Log2_64(C), dl, MVT::i32));
9815
9816   if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
9817     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
9818
9819   return FixConv;
9820 }
9821
9822 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
9823 /// can replace combinations of VCVT (integer to floating-point) and VDIV
9824 /// when the VDIV has a constant operand that is a power of 2.
9825 ///
9826 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
9827 ///  vcvt.f32.s32    d16, d16
9828 ///  vdiv.f32        d16, d17, d16
9829 /// becomes:
9830 ///  vcvt.f32.s32    d16, d16, #3
9831 static SDValue PerformVDIVCombine(SDNode *N,
9832                                   TargetLowering::DAGCombinerInfo &DCI,
9833                                   const ARMSubtarget *Subtarget) {
9834   SelectionDAG &DAG = DCI.DAG;
9835   SDValue Op = N->getOperand(0);
9836   unsigned OpOpcode = Op.getNode()->getOpcode();
9837
9838   if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
9839       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
9840     return SDValue();
9841
9842   uint64_t C;
9843   SDValue ConstVec = N->getOperand(1);
9844   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
9845
9846   if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
9847       !isConstVecPow2(ConstVec, isSigned, C))
9848     return SDValue();
9849
9850   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
9851   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
9852   if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
9853     // These instructions only exist converting from i32 to f32. We can handle
9854     // smaller integers by generating an extra extend, but larger ones would
9855     // be lossy.
9856     return SDValue();
9857   }
9858
9859   SDLoc dl(N);
9860   SDValue ConvInput = Op.getOperand(0);
9861   unsigned NumLanes = Op.getValueType().getVectorNumElements();
9862   if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
9863     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
9864                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
9865                             ConvInput);
9866
9867   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
9868     Intrinsic::arm_neon_vcvtfxu2fp;
9869   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
9870                      Op.getValueType(),
9871                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
9872                      ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32));
9873 }
9874
9875 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
9876 /// operand of a vector shift operation, where all the elements of the
9877 /// build_vector must have the same constant integer value.
9878 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
9879   // Ignore bit_converts.
9880   while (Op.getOpcode() == ISD::BITCAST)
9881     Op = Op.getOperand(0);
9882   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
9883   APInt SplatBits, SplatUndef;
9884   unsigned SplatBitSize;
9885   bool HasAnyUndefs;
9886   if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
9887                                       HasAnyUndefs, ElementBits) ||
9888       SplatBitSize > ElementBits)
9889     return false;
9890   Cnt = SplatBits.getSExtValue();
9891   return true;
9892 }
9893
9894 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
9895 /// operand of a vector shift left operation.  That value must be in the range:
9896 ///   0 <= Value < ElementBits for a left shift; or
9897 ///   0 <= Value <= ElementBits for a long left shift.
9898 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
9899   assert(VT.isVector() && "vector shift count is not a vector type");
9900   int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
9901   if (! getVShiftImm(Op, ElementBits, Cnt))
9902     return false;
9903   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
9904 }
9905
9906 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
9907 /// operand of a vector shift right operation.  For a shift opcode, the value
9908 /// is positive, but for an intrinsic the value count must be negative. The
9909 /// absolute value must be in the range:
9910 ///   1 <= |Value| <= ElementBits for a right shift; or
9911 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
9912 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
9913                          int64_t &Cnt) {
9914   assert(VT.isVector() && "vector shift count is not a vector type");
9915   int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
9916   if (! getVShiftImm(Op, ElementBits, Cnt))
9917     return false;
9918   if (!isIntrinsic)
9919     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
9920   if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
9921     Cnt = -Cnt;
9922     return true;
9923   }
9924   return false;
9925 }
9926
9927 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
9928 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
9929   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9930   switch (IntNo) {
9931   default:
9932     // Don't do anything for most intrinsics.
9933     break;
9934
9935   case Intrinsic::arm_neon_vabds:
9936     if (!N->getValueType(0).isInteger())
9937       return SDValue();
9938     return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0),
9939                        N->getOperand(1), N->getOperand(2));
9940   case Intrinsic::arm_neon_vabdu:
9941     return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0),
9942                        N->getOperand(1), N->getOperand(2));
9943
9944   // Vector shifts: check for immediate versions and lower them.
9945   // Note: This is done during DAG combining instead of DAG legalizing because
9946   // the build_vectors for 64-bit vector element shift counts are generally
9947   // not legal, and it is hard to see their values after they get legalized to
9948   // loads from a constant pool.
9949   case Intrinsic::arm_neon_vshifts:
9950   case Intrinsic::arm_neon_vshiftu:
9951   case Intrinsic::arm_neon_vrshifts:
9952   case Intrinsic::arm_neon_vrshiftu:
9953   case Intrinsic::arm_neon_vrshiftn:
9954   case Intrinsic::arm_neon_vqshifts:
9955   case Intrinsic::arm_neon_vqshiftu:
9956   case Intrinsic::arm_neon_vqshiftsu:
9957   case Intrinsic::arm_neon_vqshiftns:
9958   case Intrinsic::arm_neon_vqshiftnu:
9959   case Intrinsic::arm_neon_vqshiftnsu:
9960   case Intrinsic::arm_neon_vqrshiftns:
9961   case Intrinsic::arm_neon_vqrshiftnu:
9962   case Intrinsic::arm_neon_vqrshiftnsu: {
9963     EVT VT = N->getOperand(1).getValueType();
9964     int64_t Cnt;
9965     unsigned VShiftOpc = 0;
9966
9967     switch (IntNo) {
9968     case Intrinsic::arm_neon_vshifts:
9969     case Intrinsic::arm_neon_vshiftu:
9970       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
9971         VShiftOpc = ARMISD::VSHL;
9972         break;
9973       }
9974       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
9975         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
9976                      ARMISD::VSHRs : ARMISD::VSHRu);
9977         break;
9978       }
9979       return SDValue();
9980
9981     case Intrinsic::arm_neon_vrshifts:
9982     case Intrinsic::arm_neon_vrshiftu:
9983       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
9984         break;
9985       return SDValue();
9986
9987     case Intrinsic::arm_neon_vqshifts:
9988     case Intrinsic::arm_neon_vqshiftu:
9989       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
9990         break;
9991       return SDValue();
9992
9993     case Intrinsic::arm_neon_vqshiftsu:
9994       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
9995         break;
9996       llvm_unreachable("invalid shift count for vqshlu intrinsic");
9997
9998     case Intrinsic::arm_neon_vrshiftn:
9999     case Intrinsic::arm_neon_vqshiftns:
10000     case Intrinsic::arm_neon_vqshiftnu:
10001     case Intrinsic::arm_neon_vqshiftnsu:
10002     case Intrinsic::arm_neon_vqrshiftns:
10003     case Intrinsic::arm_neon_vqrshiftnu:
10004     case Intrinsic::arm_neon_vqrshiftnsu:
10005       // Narrowing shifts require an immediate right shift.
10006       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
10007         break;
10008       llvm_unreachable("invalid shift count for narrowing vector shift "
10009                        "intrinsic");
10010
10011     default:
10012       llvm_unreachable("unhandled vector shift");
10013     }
10014
10015     switch (IntNo) {
10016     case Intrinsic::arm_neon_vshifts:
10017     case Intrinsic::arm_neon_vshiftu:
10018       // Opcode already set above.
10019       break;
10020     case Intrinsic::arm_neon_vrshifts:
10021       VShiftOpc = ARMISD::VRSHRs; break;
10022     case Intrinsic::arm_neon_vrshiftu:
10023       VShiftOpc = ARMISD::VRSHRu; break;
10024     case Intrinsic::arm_neon_vrshiftn:
10025       VShiftOpc = ARMISD::VRSHRN; break;
10026     case Intrinsic::arm_neon_vqshifts:
10027       VShiftOpc = ARMISD::VQSHLs; break;
10028     case Intrinsic::arm_neon_vqshiftu:
10029       VShiftOpc = ARMISD::VQSHLu; break;
10030     case Intrinsic::arm_neon_vqshiftsu:
10031       VShiftOpc = ARMISD::VQSHLsu; break;
10032     case Intrinsic::arm_neon_vqshiftns:
10033       VShiftOpc = ARMISD::VQSHRNs; break;
10034     case Intrinsic::arm_neon_vqshiftnu:
10035       VShiftOpc = ARMISD::VQSHRNu; break;
10036     case Intrinsic::arm_neon_vqshiftnsu:
10037       VShiftOpc = ARMISD::VQSHRNsu; break;
10038     case Intrinsic::arm_neon_vqrshiftns:
10039       VShiftOpc = ARMISD::VQRSHRNs; break;
10040     case Intrinsic::arm_neon_vqrshiftnu:
10041       VShiftOpc = ARMISD::VQRSHRNu; break;
10042     case Intrinsic::arm_neon_vqrshiftnsu:
10043       VShiftOpc = ARMISD::VQRSHRNsu; break;
10044     }
10045
10046     SDLoc dl(N);
10047     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
10048                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
10049   }
10050
10051   case Intrinsic::arm_neon_vshiftins: {
10052     EVT VT = N->getOperand(1).getValueType();
10053     int64_t Cnt;
10054     unsigned VShiftOpc = 0;
10055
10056     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
10057       VShiftOpc = ARMISD::VSLI;
10058     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
10059       VShiftOpc = ARMISD::VSRI;
10060     else {
10061       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
10062     }
10063
10064     SDLoc dl(N);
10065     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
10066                        N->getOperand(1), N->getOperand(2),
10067                        DAG.getConstant(Cnt, dl, MVT::i32));
10068   }
10069
10070   case Intrinsic::arm_neon_vqrshifts:
10071   case Intrinsic::arm_neon_vqrshiftu:
10072     // No immediate versions of these to check for.
10073     break;
10074   }
10075
10076   return SDValue();
10077 }
10078
10079 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
10080 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
10081 /// combining instead of DAG legalizing because the build_vectors for 64-bit
10082 /// vector element shift counts are generally not legal, and it is hard to see
10083 /// their values after they get legalized to loads from a constant pool.
10084 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
10085                                    const ARMSubtarget *ST) {
10086   EVT VT = N->getValueType(0);
10087   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
10088     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
10089     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
10090     SDValue N1 = N->getOperand(1);
10091     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
10092       SDValue N0 = N->getOperand(0);
10093       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
10094           DAG.MaskedValueIsZero(N0.getOperand(0),
10095                                 APInt::getHighBitsSet(32, 16)))
10096         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
10097     }
10098   }
10099
10100   // Nothing to be done for scalar shifts.
10101   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10102   if (!VT.isVector() || !TLI.isTypeLegal(VT))
10103     return SDValue();
10104
10105   assert(ST->hasNEON() && "unexpected vector shift");
10106   int64_t Cnt;
10107
10108   switch (N->getOpcode()) {
10109   default: llvm_unreachable("unexpected shift opcode");
10110
10111   case ISD::SHL:
10112     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
10113       SDLoc dl(N);
10114       return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
10115                          DAG.getConstant(Cnt, dl, MVT::i32));
10116     }
10117     break;
10118
10119   case ISD::SRA:
10120   case ISD::SRL:
10121     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
10122       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
10123                             ARMISD::VSHRs : ARMISD::VSHRu);
10124       SDLoc dl(N);
10125       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
10126                          DAG.getConstant(Cnt, dl, MVT::i32));
10127     }
10128   }
10129   return SDValue();
10130 }
10131
10132 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
10133 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
10134 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
10135                                     const ARMSubtarget *ST) {
10136   SDValue N0 = N->getOperand(0);
10137
10138   // Check for sign- and zero-extensions of vector extract operations of 8-
10139   // and 16-bit vector elements.  NEON supports these directly.  They are
10140   // handled during DAG combining because type legalization will promote them
10141   // to 32-bit types and it is messy to recognize the operations after that.
10142   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10143     SDValue Vec = N0.getOperand(0);
10144     SDValue Lane = N0.getOperand(1);
10145     EVT VT = N->getValueType(0);
10146     EVT EltVT = N0.getValueType();
10147     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10148
10149     if (VT == MVT::i32 &&
10150         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
10151         TLI.isTypeLegal(Vec.getValueType()) &&
10152         isa<ConstantSDNode>(Lane)) {
10153
10154       unsigned Opc = 0;
10155       switch (N->getOpcode()) {
10156       default: llvm_unreachable("unexpected opcode");
10157       case ISD::SIGN_EXTEND:
10158         Opc = ARMISD::VGETLANEs;
10159         break;
10160       case ISD::ZERO_EXTEND:
10161       case ISD::ANY_EXTEND:
10162         Opc = ARMISD::VGETLANEu;
10163         break;
10164       }
10165       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
10166     }
10167   }
10168
10169   return SDValue();
10170 }
10171
10172 /// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
10173 /// to match f32 max/min patterns to use NEON vmax/vmin instructions.
10174 static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
10175                                        const ARMSubtarget *ST) {
10176   // If the target supports NEON, try to use vmax/vmin instructions for f32
10177   // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
10178   // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
10179   // a NaN; only do the transformation when it matches that behavior.
10180
10181   // For now only do this when using NEON for FP operations; if using VFP, it
10182   // is not obvious that the benefit outweighs the cost of switching to the
10183   // NEON pipeline.
10184   if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
10185       N->getValueType(0) != MVT::f32)
10186     return SDValue();
10187
10188   SDValue CondLHS = N->getOperand(0);
10189   SDValue CondRHS = N->getOperand(1);
10190   SDValue LHS = N->getOperand(2);
10191   SDValue RHS = N->getOperand(3);
10192   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
10193
10194   unsigned Opcode = 0;
10195   bool IsReversed;
10196   if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
10197     IsReversed = false; // x CC y ? x : y
10198   } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
10199     IsReversed = true ; // x CC y ? y : x
10200   } else {
10201     return SDValue();
10202   }
10203
10204   bool IsUnordered;
10205   switch (CC) {
10206   default: break;
10207   case ISD::SETOLT:
10208   case ISD::SETOLE:
10209   case ISD::SETLT:
10210   case ISD::SETLE:
10211   case ISD::SETULT:
10212   case ISD::SETULE:
10213     // If LHS is NaN, an ordered comparison will be false and the result will
10214     // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
10215     // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
10216     IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
10217     if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
10218       break;
10219     // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
10220     // will return -0, so vmin can only be used for unsafe math or if one of
10221     // the operands is known to be nonzero.
10222     if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
10223         !DAG.getTarget().Options.UnsafeFPMath &&
10224         !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10225       break;
10226     Opcode = IsReversed ? ISD::FMAXNAN : ISD::FMINNAN;
10227     break;
10228
10229   case ISD::SETOGT:
10230   case ISD::SETOGE:
10231   case ISD::SETGT:
10232   case ISD::SETGE:
10233   case ISD::SETUGT:
10234   case ISD::SETUGE:
10235     // If LHS is NaN, an ordered comparison will be false and the result will
10236     // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
10237     // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
10238     IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
10239     if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
10240       break;
10241     // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
10242     // will return +0, so vmax can only be used for unsafe math or if one of
10243     // the operands is known to be nonzero.
10244     if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
10245         !DAG.getTarget().Options.UnsafeFPMath &&
10246         !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10247       break;
10248     Opcode = IsReversed ? ISD::FMINNAN : ISD::FMAXNAN;
10249     break;
10250   }
10251
10252   if (!Opcode)
10253     return SDValue();
10254   return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
10255 }
10256
10257 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
10258 SDValue
10259 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
10260   SDValue Cmp = N->getOperand(4);
10261   if (Cmp.getOpcode() != ARMISD::CMPZ)
10262     // Only looking at EQ and NE cases.
10263     return SDValue();
10264
10265   EVT VT = N->getValueType(0);
10266   SDLoc dl(N);
10267   SDValue LHS = Cmp.getOperand(0);
10268   SDValue RHS = Cmp.getOperand(1);
10269   SDValue FalseVal = N->getOperand(0);
10270   SDValue TrueVal = N->getOperand(1);
10271   SDValue ARMcc = N->getOperand(2);
10272   ARMCC::CondCodes CC =
10273     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
10274
10275   // Simplify
10276   //   mov     r1, r0
10277   //   cmp     r1, x
10278   //   mov     r0, y
10279   //   moveq   r0, x
10280   // to
10281   //   cmp     r0, x
10282   //   movne   r0, y
10283   //
10284   //   mov     r1, r0
10285   //   cmp     r1, x
10286   //   mov     r0, x
10287   //   movne   r0, y
10288   // to
10289   //   cmp     r0, x
10290   //   movne   r0, y
10291   /// FIXME: Turn this into a target neutral optimization?
10292   SDValue Res;
10293   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
10294     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
10295                       N->getOperand(3), Cmp);
10296   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
10297     SDValue ARMcc;
10298     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
10299     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
10300                       N->getOperand(3), NewCmp);
10301   }
10302
10303   if (Res.getNode()) {
10304     APInt KnownZero, KnownOne;
10305     DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
10306     // Capture demanded bits information that would be otherwise lost.
10307     if (KnownZero == 0xfffffffe)
10308       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
10309                         DAG.getValueType(MVT::i1));
10310     else if (KnownZero == 0xffffff00)
10311       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
10312                         DAG.getValueType(MVT::i8));
10313     else if (KnownZero == 0xffff0000)
10314       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
10315                         DAG.getValueType(MVT::i16));
10316   }
10317
10318   return Res;
10319 }
10320
10321 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
10322                                              DAGCombinerInfo &DCI) const {
10323   switch (N->getOpcode()) {
10324   default: break;
10325   case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
10326   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
10327   case ISD::SUB:        return PerformSUBCombine(N, DCI);
10328   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
10329   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
10330   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
10331   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
10332   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
10333   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
10334   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
10335   case ISD::STORE:      return PerformSTORECombine(N, DCI);
10336   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
10337   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
10338   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
10339   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
10340   case ISD::FP_TO_SINT:
10341   case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
10342   case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
10343   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
10344   case ISD::SHL:
10345   case ISD::SRA:
10346   case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
10347   case ISD::SIGN_EXTEND:
10348   case ISD::ZERO_EXTEND:
10349   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
10350   case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
10351   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
10352   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
10353   case ARMISD::VLD2DUP:
10354   case ARMISD::VLD3DUP:
10355   case ARMISD::VLD4DUP:
10356     return PerformVLDCombine(N, DCI);
10357   case ARMISD::BUILD_VECTOR:
10358     return PerformARMBUILD_VECTORCombine(N, DCI);
10359   case ISD::INTRINSIC_VOID:
10360   case ISD::INTRINSIC_W_CHAIN:
10361     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
10362     case Intrinsic::arm_neon_vld1:
10363     case Intrinsic::arm_neon_vld2:
10364     case Intrinsic::arm_neon_vld3:
10365     case Intrinsic::arm_neon_vld4:
10366     case Intrinsic::arm_neon_vld2lane:
10367     case Intrinsic::arm_neon_vld3lane:
10368     case Intrinsic::arm_neon_vld4lane:
10369     case Intrinsic::arm_neon_vst1:
10370     case Intrinsic::arm_neon_vst2:
10371     case Intrinsic::arm_neon_vst3:
10372     case Intrinsic::arm_neon_vst4:
10373     case Intrinsic::arm_neon_vst2lane:
10374     case Intrinsic::arm_neon_vst3lane:
10375     case Intrinsic::arm_neon_vst4lane:
10376       return PerformVLDCombine(N, DCI);
10377     default: break;
10378     }
10379     break;
10380   }
10381   return SDValue();
10382 }
10383
10384 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
10385                                                           EVT VT) const {
10386   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
10387 }
10388
10389 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
10390                                                        unsigned,
10391                                                        unsigned,
10392                                                        bool *Fast) const {
10393   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
10394   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
10395
10396   switch (VT.getSimpleVT().SimpleTy) {
10397   default:
10398     return false;
10399   case MVT::i8:
10400   case MVT::i16:
10401   case MVT::i32: {
10402     // Unaligned access can use (for example) LRDB, LRDH, LDR
10403     if (AllowsUnaligned) {
10404       if (Fast)
10405         *Fast = Subtarget->hasV7Ops();
10406       return true;
10407     }
10408     return false;
10409   }
10410   case MVT::f64:
10411   case MVT::v2f64: {
10412     // For any little-endian targets with neon, we can support unaligned ld/st
10413     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
10414     // A big-endian target may also explicitly support unaligned accesses
10415     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
10416       if (Fast)
10417         *Fast = true;
10418       return true;
10419     }
10420     return false;
10421   }
10422   }
10423 }
10424
/// memOpAlign - Return true if both alignments satisfy AlignCheck.  An
/// alignment of 0 is always accepted; any other value must be a multiple of
/// AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  // The source is tested first, matching the evaluation order callers see.
  if (SrcAlign != 0 && SrcAlign % AlignCheck != 0)
    return false;
  if (DstAlign != 0 && DstAlign % AlignCheck != 0)
    return false;
  return true;
}
10430
10431 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
10432                                            unsigned DstAlign, unsigned SrcAlign,
10433                                            bool IsMemset, bool ZeroMemset,
10434                                            bool MemcpyStrSrc,
10435                                            MachineFunction &MF) const {
10436   const Function *F = MF.getFunction();
10437
10438   // See if we can use NEON instructions for this...
10439   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
10440       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10441     bool Fast;
10442     if (Size >= 16 &&
10443         (memOpAlign(SrcAlign, DstAlign, 16) ||
10444          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
10445       return MVT::v2f64;
10446     } else if (Size >= 8 &&
10447                (memOpAlign(SrcAlign, DstAlign, 8) ||
10448                 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
10449                  Fast))) {
10450       return MVT::f64;
10451     }
10452   }
10453
10454   // Lowering to i32/i16 if the size permits.
10455   if (Size >= 4)
10456     return MVT::i32;
10457   else if (Size >= 2)
10458     return MVT::i16;
10459
10460   // Let the target-independent logic figure it out.
10461   return MVT::Other;
10462 }
10463
10464 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
10465   if (Val.getOpcode() != ISD::LOAD)
10466     return false;
10467
10468   EVT VT1 = Val.getValueType();
10469   if (!VT1.isSimple() || !VT1.isInteger() ||
10470       !VT2.isSimple() || !VT2.isInteger())
10471     return false;
10472
10473   switch (VT1.getSimpleVT().SimpleTy) {
10474   default: break;
10475   case MVT::i1:
10476   case MVT::i8:
10477   case MVT::i16:
10478     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
10479     return true;
10480   }
10481
10482   return false;
10483 }
10484
10485 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
10486   EVT VT = ExtVal.getValueType();
10487
10488   if (!isTypeLegal(VT))
10489     return false;
10490
10491   // Don't create a loadext if we can fold the extension into a wide/long
10492   // instruction.
10493   // If there's more than one user instruction, the loadext is desirable no
10494   // matter what.  There can be two uses by the same instruction.
10495   if (ExtVal->use_empty() ||
10496       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
10497     return true;
10498
10499   SDNode *U = *ExtVal->use_begin();
10500   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
10501        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
10502     return false;
10503
10504   return true;
10505 }
10506
10507 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
10508   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10509     return false;
10510
10511   if (!isTypeLegal(EVT::getEVT(Ty1)))
10512     return false;
10513
10514   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
10515
10516   // Assuming the caller doesn't have a zeroext or signext return parameter,
10517   // truncation all the way down to i1 is valid.
10518   return true;
10519 }
10520
10521
10522 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
10523   if (V < 0)
10524     return false;
10525
10526   unsigned Scale = 1;
10527   switch (VT.getSimpleVT().SimpleTy) {
10528   default: return false;
10529   case MVT::i1:
10530   case MVT::i8:
10531     // Scale == 1;
10532     break;
10533   case MVT::i16:
10534     // Scale == 2;
10535     Scale = 2;
10536     break;
10537   case MVT::i32:
10538     // Scale == 4;
10539     Scale = 4;
10540     break;
10541   }
10542
10543   if ((V & (Scale - 1)) != 0)
10544     return false;
10545   V /= Scale;
10546   return V == (V & ((1LL << 5) - 1));
10547 }
10548
10549 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
10550                                       const ARMSubtarget *Subtarget) {
10551   bool isNeg = false;
10552   if (V < 0) {
10553     isNeg = true;
10554     V = - V;
10555   }
10556
10557   switch (VT.getSimpleVT().SimpleTy) {
10558   default: return false;
10559   case MVT::i1:
10560   case MVT::i8:
10561   case MVT::i16:
10562   case MVT::i32:
10563     // + imm12 or - imm8
10564     if (isNeg)
10565       return V == (V & ((1LL << 8) - 1));
10566     return V == (V & ((1LL << 12) - 1));
10567   case MVT::f32:
10568   case MVT::f64:
10569     // Same as ARM mode. FIXME: NEON?
10570     if (!Subtarget->hasVFP2())
10571       return false;
10572     if ((V & 3) != 0)
10573       return false;
10574     V >>= 2;
10575     return V == (V & ((1LL << 8) - 1));
10576   }
10577 }
10578
10579 /// isLegalAddressImmediate - Return true if the integer value can be used
10580 /// as the offset of the target addressing mode for load / store of the
10581 /// given type.
10582 static bool isLegalAddressImmediate(int64_t V, EVT VT,
10583                                     const ARMSubtarget *Subtarget) {
10584   if (V == 0)
10585     return true;
10586
10587   if (!VT.isSimple())
10588     return false;
10589
10590   if (Subtarget->isThumb1Only())
10591     return isLegalT1AddressImmediate(V, VT);
10592   else if (Subtarget->isThumb2())
10593     return isLegalT2AddressImmediate(V, VT, Subtarget);
10594
10595   // ARM mode.
10596   if (V < 0)
10597     V = - V;
10598   switch (VT.getSimpleVT().SimpleTy) {
10599   default: return false;
10600   case MVT::i1:
10601   case MVT::i8:
10602   case MVT::i32:
10603     // +- imm12
10604     return V == (V & ((1LL << 12) - 1));
10605   case MVT::i16:
10606     // +- imm8
10607     return V == (V & ((1LL << 8) - 1));
10608   case MVT::f32:
10609   case MVT::f64:
10610     if (!Subtarget->hasVFP2()) // FIXME: NEON?
10611       return false;
10612     if ((V & 3) != 0)
10613       return false;
10614     V >>= 2;
10615     return V == (V & ((1LL << 8) - 1));
10616   }
10617 }
10618
10619 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
10620                                                       EVT VT) const {
10621   int Scale = AM.Scale;
10622   if (Scale < 0)
10623     return false;
10624
10625   switch (VT.getSimpleVT().SimpleTy) {
10626   default: return false;
10627   case MVT::i1:
10628   case MVT::i8:
10629   case MVT::i16:
10630   case MVT::i32:
10631     if (Scale == 1)
10632       return true;
10633     // r + r << imm
10634     Scale = Scale & ~1;
10635     return Scale == 2 || Scale == 4 || Scale == 8;
10636   case MVT::i64:
10637     // r + r
10638     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
10639       return true;
10640     return false;
10641   case MVT::isVoid:
10642     // Note, we allow "void" uses (basically, uses that aren't loads or
10643     // stores), because arm allows folding a scale into many arithmetic
10644     // operations.  This should be made more precise and revisited later.
10645
10646     // Allow r << imm, but the imm has to be a multiple of two.
10647     if (Scale & 1) return false;
10648     return isPowerOf2_32(Scale);
10649   }
10650 }
10651
10652 /// isLegalAddressingMode - Return true if the addressing mode represented
10653 /// by AM is legal for this target, for a load/store of the specified type.
10654 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
10655                                               const AddrMode &AM, Type *Ty,
10656                                               unsigned AS) const {
10657   EVT VT = getValueType(DL, Ty, true);
10658   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
10659     return false;
10660
10661   // Can never fold addr of global into load/store.
10662   if (AM.BaseGV)
10663     return false;
10664
10665   switch (AM.Scale) {
10666   case 0:  // no scale reg, must be "r+i" or "r", or "i".
10667     break;
10668   case 1:
10669     if (Subtarget->isThumb1Only())
10670       return false;
10671     // FALL THROUGH.
10672   default:
10673     // ARM doesn't support any R+R*scale+imm addr modes.
10674     if (AM.BaseOffs)
10675       return false;
10676
10677     if (!VT.isSimple())
10678       return false;
10679
10680     if (Subtarget->isThumb2())
10681       return isLegalT2ScaledAddressingMode(AM, VT);
10682
10683     int Scale = AM.Scale;
10684     switch (VT.getSimpleVT().SimpleTy) {
10685     default: return false;
10686     case MVT::i1:
10687     case MVT::i8:
10688     case MVT::i32:
10689       if (Scale < 0) Scale = -Scale;
10690       if (Scale == 1)
10691         return true;
10692       // r + r << imm
10693       return isPowerOf2_32(Scale & ~1);
10694     case MVT::i16:
10695     case MVT::i64:
10696       // r + r
10697       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
10698         return true;
10699       return false;
10700
10701     case MVT::isVoid:
10702       // Note, we allow "void" uses (basically, uses that aren't loads or
10703       // stores), because arm allows folding a scale into many arithmetic
10704       // operations.  This should be made more precise and revisited later.
10705
10706       // Allow r << imm, but the imm has to be a multiple of two.
10707       if (Scale & 1) return false;
10708       return isPowerOf2_32(Scale);
10709     }
10710   }
10711   return true;
10712 }
10713
10714 /// isLegalICmpImmediate - Return true if the specified immediate is legal
10715 /// icmp immediate, that is the target has icmp instructions which can compare
10716 /// a register against the immediate without having to materialize the
10717 /// immediate into a register.
10718 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
10719   // Thumb2 and ARM modes can use cmn for negative immediates.
10720   if (!Subtarget->isThumb())
10721     return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
10722   if (Subtarget->isThumb2())
10723     return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
10724   // Thumb1 doesn't have cmn, and only 8-bit immediates.
10725   return Imm >= 0 && Imm <= 255;
10726 }
10727
10728 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
10729 /// *or sub* immediate, that is the target has add or sub instructions which can
10730 /// add a register with the immediate without having to materialize the
10731 /// immediate into a register.
10732 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
10733   // Same encoding for add/sub, just flip the sign.
10734   int64_t AbsImm = std::abs(Imm);
10735   if (!Subtarget->isThumb())
10736     return ARM_AM::getSOImmVal(AbsImm) != -1;
10737   if (Subtarget->isThumb2())
10738     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
10739   // Thumb1 only has 8-bit unsigned immediate.
10740   return AbsImm >= 0 && AbsImm <= 255;
10741 }
10742
10743 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
10744                                       bool isSEXTLoad, SDValue &Base,
10745                                       SDValue &Offset, bool &isInc,
10746                                       SelectionDAG &DAG) {
10747   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
10748     return false;
10749
10750   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
10751     // AddressingMode 3
10752     Base = Ptr->getOperand(0);
10753     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10754       int RHSC = (int)RHS->getZExtValue();
10755       if (RHSC < 0 && RHSC > -256) {
10756         assert(Ptr->getOpcode() == ISD::ADD);
10757         isInc = false;
10758         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
10759         return true;
10760       }
10761     }
10762     isInc = (Ptr->getOpcode() == ISD::ADD);
10763     Offset = Ptr->getOperand(1);
10764     return true;
10765   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
10766     // AddressingMode 2
10767     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10768       int RHSC = (int)RHS->getZExtValue();
10769       if (RHSC < 0 && RHSC > -0x1000) {
10770         assert(Ptr->getOpcode() == ISD::ADD);
10771         isInc = false;
10772         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
10773         Base = Ptr->getOperand(0);
10774         return true;
10775       }
10776     }
10777
10778     if (Ptr->getOpcode() == ISD::ADD) {
10779       isInc = true;
10780       ARM_AM::ShiftOpc ShOpcVal=
10781         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
10782       if (ShOpcVal != ARM_AM::no_shift) {
10783         Base = Ptr->getOperand(1);
10784         Offset = Ptr->getOperand(0);
10785       } else {
10786         Base = Ptr->getOperand(0);
10787         Offset = Ptr->getOperand(1);
10788       }
10789       return true;
10790     }
10791
10792     isInc = (Ptr->getOpcode() == ISD::ADD);
10793     Base = Ptr->getOperand(0);
10794     Offset = Ptr->getOperand(1);
10795     return true;
10796   }
10797
10798   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
10799   return false;
10800 }
10801
10802 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
10803                                      bool isSEXTLoad, SDValue &Base,
10804                                      SDValue &Offset, bool &isInc,
10805                                      SelectionDAG &DAG) {
10806   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
10807     return false;
10808
10809   Base = Ptr->getOperand(0);
10810   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10811     int RHSC = (int)RHS->getZExtValue();
10812     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
10813       assert(Ptr->getOpcode() == ISD::ADD);
10814       isInc = false;
10815       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
10816       return true;
10817     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
10818       isInc = Ptr->getOpcode() == ISD::ADD;
10819       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
10820       return true;
10821     }
10822   }
10823
10824   return false;
10825 }
10826
10827 /// getPreIndexedAddressParts - returns true by value, base pointer and
10828 /// offset pointer and addressing mode by reference if the node's address
10829 /// can be legally represented as pre-indexed load / store address.
10830 bool
10831 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
10832                                              SDValue &Offset,
10833                                              ISD::MemIndexedMode &AM,
10834                                              SelectionDAG &DAG) const {
10835   if (Subtarget->isThumb1Only())
10836     return false;
10837
10838   EVT VT;
10839   SDValue Ptr;
10840   bool isSEXTLoad = false;
10841   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10842     Ptr = LD->getBasePtr();
10843     VT  = LD->getMemoryVT();
10844     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
10845   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10846     Ptr = ST->getBasePtr();
10847     VT  = ST->getMemoryVT();
10848   } else
10849     return false;
10850
10851   bool isInc;
10852   bool isLegal = false;
10853   if (Subtarget->isThumb2())
10854     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
10855                                        Offset, isInc, DAG);
10856   else
10857     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
10858                                         Offset, isInc, DAG);
10859   if (!isLegal)
10860     return false;
10861
10862   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
10863   return true;
10864 }
10865
10866 /// getPostIndexedAddressParts - returns true by value, base pointer and
10867 /// offset pointer and addressing mode by reference if this node can be
10868 /// combined with a load / store to form a post-indexed load / store.
10869 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
10870                                                    SDValue &Base,
10871                                                    SDValue &Offset,
10872                                                    ISD::MemIndexedMode &AM,
10873                                                    SelectionDAG &DAG) const {
10874   if (Subtarget->isThumb1Only())
10875     return false;
10876
10877   EVT VT;
10878   SDValue Ptr;
10879   bool isSEXTLoad = false;
10880   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10881     VT  = LD->getMemoryVT();
10882     Ptr = LD->getBasePtr();
10883     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
10884   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10885     VT  = ST->getMemoryVT();
10886     Ptr = ST->getBasePtr();
10887   } else
10888     return false;
10889
10890   bool isInc;
10891   bool isLegal = false;
10892   if (Subtarget->isThumb2())
10893     isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
10894                                        isInc, DAG);
10895   else
10896     isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
10897                                         isInc, DAG);
10898   if (!isLegal)
10899     return false;
10900
10901   if (Ptr != Base) {
10902     // Swap base ptr and offset to catch more post-index load / store when
10903     // it's legal. In Thumb2 mode, offset must be an immediate.
10904     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
10905         !Subtarget->isThumb2())
10906       std::swap(Base, Offset);
10907
10908     // Post-indexed load / store update the base pointer.
10909     if (Ptr != Base)
10910       return false;
10911   }
10912
10913   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
10914   return true;
10915 }
10916
10917 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
10918                                                       APInt &KnownZero,
10919                                                       APInt &KnownOne,
10920                                                       const SelectionDAG &DAG,
10921                                                       unsigned Depth) const {
10922   unsigned BitWidth = KnownOne.getBitWidth();
10923   KnownZero = KnownOne = APInt(BitWidth, 0);
10924   switch (Op.getOpcode()) {
10925   default: break;
10926   case ARMISD::ADDC:
10927   case ARMISD::ADDE:
10928   case ARMISD::SUBC:
10929   case ARMISD::SUBE:
10930     // These nodes' second result is a boolean
10931     if (Op.getResNo() == 0)
10932       break;
10933     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
10934     break;
10935   case ARMISD::CMOV: {
10936     // Bits are known zero/one if known on the LHS and RHS.
10937     DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
10938     if (KnownZero == 0 && KnownOne == 0) return;
10939
10940     APInt KnownZeroRHS, KnownOneRHS;
10941     DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
10942     KnownZero &= KnownZeroRHS;
10943     KnownOne  &= KnownOneRHS;
10944     return;
10945   }
10946   case ISD::INTRINSIC_W_CHAIN: {
10947     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
10948     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
10949     switch (IntID) {
10950     default: return;
10951     case Intrinsic::arm_ldaex:
10952     case Intrinsic::arm_ldrex: {
10953       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
10954       unsigned MemBits = VT.getScalarType().getSizeInBits();
10955       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
10956       return;
10957     }
10958     }
10959   }
10960   }
10961 }
10962
10963 //===----------------------------------------------------------------------===//
10964 //                           ARM Inline Assembly Support
10965 //===----------------------------------------------------------------------===//
10966
10967 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
10968   // Looking for "rev" which is V6+.
10969   if (!Subtarget->hasV6Ops())
10970     return false;
10971
10972   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
10973   std::string AsmStr = IA->getAsmString();
10974   SmallVector<StringRef, 4> AsmPieces;
10975   SplitString(AsmStr, AsmPieces, ";\n");
10976
10977   switch (AsmPieces.size()) {
10978   default: return false;
10979   case 1:
10980     AsmStr = AsmPieces[0];
10981     AsmPieces.clear();
10982     SplitString(AsmStr, AsmPieces, " \t,");
10983
10984     // rev $0, $1
10985     if (AsmPieces.size() == 3 &&
10986         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
10987         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
10988       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
10989       if (Ty && Ty->getBitWidth() == 32)
10990         return IntrinsicLowering::LowerToByteSwap(CI);
10991     }
10992     break;
10993   }
10994
10995   return false;
10996 }
10997
10998 /// getConstraintType - Given a constraint letter, return the type of
10999 /// constraint it is for this target.
11000 ARMTargetLowering::ConstraintType
11001 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
11002   if (Constraint.size() == 1) {
11003     switch (Constraint[0]) {
11004     default:  break;
11005     case 'l': return C_RegisterClass;
11006     case 'w': return C_RegisterClass;
11007     case 'h': return C_RegisterClass;
11008     case 'x': return C_RegisterClass;
11009     case 't': return C_RegisterClass;
11010     case 'j': return C_Other; // Constant for movw.
11011       // An address with a single base register. Due to the way we
11012       // currently handle addresses it is the same as an 'r' memory constraint.
11013     case 'Q': return C_Memory;
11014     }
11015   } else if (Constraint.size() == 2) {
11016     switch (Constraint[0]) {
11017     default: break;
11018     // All 'U+' constraints are addresses.
11019     case 'U': return C_Memory;
11020     }
11021   }
11022   return TargetLowering::getConstraintType(Constraint);
11023 }
11024
11025 /// Examine constraint type and operand type and determine a weight value.
11026 /// This object must already have been set up with the operand type
11027 /// and the current alternative constraint selected.
11028 TargetLowering::ConstraintWeight
11029 ARMTargetLowering::getSingleConstraintMatchWeight(
11030     AsmOperandInfo &info, const char *constraint) const {
11031   ConstraintWeight weight = CW_Invalid;
11032   Value *CallOperandVal = info.CallOperandVal;
11033     // If we don't have a value, we can't do a match,
11034     // but allow it at the lowest weight.
11035   if (!CallOperandVal)
11036     return CW_Default;
11037   Type *type = CallOperandVal->getType();
11038   // Look at the constraint type.
11039   switch (*constraint) {
11040   default:
11041     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11042     break;
11043   case 'l':
11044     if (type->isIntegerTy()) {
11045       if (Subtarget->isThumb())
11046         weight = CW_SpecificReg;
11047       else
11048         weight = CW_Register;
11049     }
11050     break;
11051   case 'w':
11052     if (type->isFloatingPointTy())
11053       weight = CW_Register;
11054     break;
11055   }
11056   return weight;
11057 }
11058
11059 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
11060 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
11061     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11062   if (Constraint.size() == 1) {
11063     // GCC ARM Constraint Letters
11064     switch (Constraint[0]) {
11065     case 'l': // Low regs or general regs.
11066       if (Subtarget->isThumb())
11067         return RCPair(0U, &ARM::tGPRRegClass);
11068       return RCPair(0U, &ARM::GPRRegClass);
11069     case 'h': // High regs or no regs.
11070       if (Subtarget->isThumb())
11071         return RCPair(0U, &ARM::hGPRRegClass);
11072       break;
11073     case 'r':
11074       if (Subtarget->isThumb1Only())
11075         return RCPair(0U, &ARM::tGPRRegClass);
11076       return RCPair(0U, &ARM::GPRRegClass);
11077     case 'w':
11078       if (VT == MVT::Other)
11079         break;
11080       if (VT == MVT::f32)
11081         return RCPair(0U, &ARM::SPRRegClass);
11082       if (VT.getSizeInBits() == 64)
11083         return RCPair(0U, &ARM::DPRRegClass);
11084       if (VT.getSizeInBits() == 128)
11085         return RCPair(0U, &ARM::QPRRegClass);
11086       break;
11087     case 'x':
11088       if (VT == MVT::Other)
11089         break;
11090       if (VT == MVT::f32)
11091         return RCPair(0U, &ARM::SPR_8RegClass);
11092       if (VT.getSizeInBits() == 64)
11093         return RCPair(0U, &ARM::DPR_8RegClass);
11094       if (VT.getSizeInBits() == 128)
11095         return RCPair(0U, &ARM::QPR_8RegClass);
11096       break;
11097     case 't':
11098       if (VT == MVT::f32)
11099         return RCPair(0U, &ARM::SPRRegClass);
11100       break;
11101     }
11102   }
11103   if (StringRef("{cc}").equals_lower(Constraint))
11104     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
11105
11106   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11107 }
11108
11109 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11110 /// vector.  If it is invalid, don't add anything to Ops.
11111 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
11112                                                      std::string &Constraint,
11113                                                      std::vector<SDValue>&Ops,
11114                                                      SelectionDAG &DAG) const {
11115   SDValue Result;
11116
11117   // Currently only support length 1 constraints.
11118   if (Constraint.length() != 1) return;
11119
11120   char ConstraintLetter = Constraint[0];
11121   switch (ConstraintLetter) {
11122   default: break;
11123   case 'j':
11124   case 'I': case 'J': case 'K': case 'L':
11125   case 'M': case 'N': case 'O':
11126     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11127     if (!C)
11128       return;
11129
11130     int64_t CVal64 = C->getSExtValue();
11131     int CVal = (int) CVal64;
11132     // None of these constraints allow values larger than 32 bits.  Check
11133     // that the value fits in an int.
11134     if (CVal != CVal64)
11135       return;
11136
11137     switch (ConstraintLetter) {
11138       case 'j':
11139         // Constant suitable for movw, must be between 0 and
11140         // 65535.
11141         if (Subtarget->hasV6T2Ops())
11142           if (CVal >= 0 && CVal <= 65535)
11143             break;
11144         return;
11145       case 'I':
11146         if (Subtarget->isThumb1Only()) {
11147           // This must be a constant between 0 and 255, for ADD
11148           // immediates.
11149           if (CVal >= 0 && CVal <= 255)
11150             break;
11151         } else if (Subtarget->isThumb2()) {
11152           // A constant that can be used as an immediate value in a
11153           // data-processing instruction.
11154           if (ARM_AM::getT2SOImmVal(CVal) != -1)
11155             break;
11156         } else {
11157           // A constant that can be used as an immediate value in a
11158           // data-processing instruction.
11159           if (ARM_AM::getSOImmVal(CVal) != -1)
11160             break;
11161         }
11162         return;
11163
11164       case 'J':
11165         if (Subtarget->isThumb()) {  // FIXME thumb2
11166           // This must be a constant between -255 and -1, for negated ADD
11167           // immediates. This can be used in GCC with an "n" modifier that
11168           // prints the negated value, for use with SUB instructions. It is
11169           // not useful otherwise but is implemented for compatibility.
11170           if (CVal >= -255 && CVal <= -1)
11171             break;
11172         } else {
11173           // This must be a constant between -4095 and 4095. It is not clear
11174           // what this constraint is intended for. Implemented for
11175           // compatibility with GCC.
11176           if (CVal >= -4095 && CVal <= 4095)
11177             break;
11178         }
11179         return;
11180
11181       case 'K':
11182         if (Subtarget->isThumb1Only()) {
11183           // A 32-bit value where only one byte has a nonzero value. Exclude
11184           // zero to match GCC. This constraint is used by GCC internally for
11185           // constants that can be loaded with a move/shift combination.
11186           // It is not useful otherwise but is implemented for compatibility.
11187           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
11188             break;
11189         } else if (Subtarget->isThumb2()) {
11190           // A constant whose bitwise inverse can be used as an immediate
11191           // value in a data-processing instruction. This can be used in GCC
11192           // with a "B" modifier that prints the inverted value, for use with
11193           // BIC and MVN instructions. It is not useful otherwise but is
11194           // implemented for compatibility.
11195           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
11196             break;
11197         } else {
11198           // A constant whose bitwise inverse can be used as an immediate
11199           // value in a data-processing instruction. This can be used in GCC
11200           // with a "B" modifier that prints the inverted value, for use with
11201           // BIC and MVN instructions. It is not useful otherwise but is
11202           // implemented for compatibility.
11203           if (ARM_AM::getSOImmVal(~CVal) != -1)
11204             break;
11205         }
11206         return;
11207
11208       case 'L':
11209         if (Subtarget->isThumb1Only()) {
11210           // This must be a constant between -7 and 7,
11211           // for 3-operand ADD/SUB immediate instructions.
11212           if (CVal >= -7 && CVal < 7)
11213             break;
11214         } else if (Subtarget->isThumb2()) {
11215           // A constant whose negation can be used as an immediate value in a
11216           // data-processing instruction. This can be used in GCC with an "n"
11217           // modifier that prints the negated value, for use with SUB
11218           // instructions. It is not useful otherwise but is implemented for
11219           // compatibility.
11220           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
11221             break;
11222         } else {
11223           // A constant whose negation can be used as an immediate value in a
11224           // data-processing instruction. This can be used in GCC with an "n"
11225           // modifier that prints the negated value, for use with SUB
11226           // instructions. It is not useful otherwise but is implemented for
11227           // compatibility.
11228           if (ARM_AM::getSOImmVal(-CVal) != -1)
11229             break;
11230         }
11231         return;
11232
11233       case 'M':
11234         if (Subtarget->isThumb()) { // FIXME thumb2
11235           // This must be a multiple of 4 between 0 and 1020, for
11236           // ADD sp + immediate.
11237           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
11238             break;
11239         } else {
11240           // A power of two or a constant between 0 and 32.  This is used in
11241           // GCC for the shift amount on shifted register operands, but it is
11242           // useful in general for any shift amounts.
11243           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
11244             break;
11245         }
11246         return;
11247
11248       case 'N':
11249         if (Subtarget->isThumb()) {  // FIXME thumb2
11250           // This must be a constant between 0 and 31, for shift amounts.
11251           if (CVal >= 0 && CVal <= 31)
11252             break;
11253         }
11254         return;
11255
11256       case 'O':
11257         if (Subtarget->isThumb()) {  // FIXME thumb2
11258           // This must be a multiple of 4 between -508 and 508, for
11259           // ADD/SUB sp = sp + immediate.
11260           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
11261             break;
11262         }
11263         return;
11264     }
11265     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
11266     break;
11267   }
11268
11269   if (Result.getNode()) {
11270     Ops.push_back(Result);
11271     return;
11272   }
11273   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11274 }
11275
11276 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
11277   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) &&
11278          "Register-based DivRem lowering only");
11279   unsigned Opcode = Op->getOpcode();
11280   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
11281          "Invalid opcode for Div/Rem lowering");
11282   bool isSigned = (Opcode == ISD::SDIVREM);
11283   EVT VT = Op->getValueType(0);
11284   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
11285
11286   RTLIB::Libcall LC;
11287   switch (VT.getSimpleVT().SimpleTy) {
11288   default: llvm_unreachable("Unexpected request for libcall!");
11289   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
11290   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
11291   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
11292   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
11293   }
11294
11295   SDValue InChain = DAG.getEntryNode();
11296
11297   TargetLowering::ArgListTy Args;
11298   TargetLowering::ArgListEntry Entry;
11299   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
11300     EVT ArgVT = Op->getOperand(i).getValueType();
11301     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
11302     Entry.Node = Op->getOperand(i);
11303     Entry.Ty = ArgTy;
11304     Entry.isSExt = isSigned;
11305     Entry.isZExt = !isSigned;
11306     Args.push_back(Entry);
11307   }
11308
11309   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
11310                                          getPointerTy(DAG.getDataLayout()));
11311
11312   Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
11313
11314   SDLoc dl(Op);
11315   TargetLowering::CallLoweringInfo CLI(DAG);
11316   CLI.setDebugLoc(dl).setChain(InChain)
11317     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
11318     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
11319
11320   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
11321   return CallInfo.first;
11322 }
11323
11324 SDValue
11325 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
11326   assert(Subtarget->isTargetWindows() && "unsupported target platform");
11327   SDLoc DL(Op);
11328
11329   // Get the inputs.
11330   SDValue Chain = Op.getOperand(0);
11331   SDValue Size  = Op.getOperand(1);
11332
11333   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
11334                               DAG.getConstant(2, DL, MVT::i32));
11335
11336   SDValue Flag;
11337   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
11338   Flag = Chain.getValue(1);
11339
11340   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
11341   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
11342
11343   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
11344   Chain = NewSP.getValue(1);
11345
11346   SDValue Ops[2] = { NewSP, Chain };
11347   return DAG.getMergeValues(Ops, DL);
11348 }
11349
11350 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
11351   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
11352          "Unexpected type for custom-lowering FP_EXTEND");
11353
11354   RTLIB::Libcall LC;
11355   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
11356
11357   SDValue SrcVal = Op.getOperand(0);
11358   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
11359                      /*isSigned*/ false, SDLoc(Op)).first;
11360 }
11361
11362 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
11363   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
11364          Subtarget->isFPOnlySP() &&
11365          "Unexpected type for custom-lowering FP_ROUND");
11366
11367   RTLIB::Libcall LC;
11368   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
11369
11370   SDValue SrcVal = Op.getOperand(0);
11371   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
11372                      /*isSigned*/ false, SDLoc(Op)).first;
11373 }
11374
11375 bool
11376 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
11377   // The ARM target isn't yet aware of offsets.
11378   return false;
11379 }
11380
11381 bool ARM::isBitFieldInvertedMask(unsigned v) {
11382   if (v == 0xffffffff)
11383     return false;
11384
11385   // there can be 1's on either or both "outsides", all the "inside"
11386   // bits must be 0's
11387   return isShiftedMask_32(~v);
11388 }
11389
11390 /// isFPImmLegal - Returns true if the target can instruction select the
11391 /// specified FP immediate natively. If false, the legalizer will
11392 /// materialize the FP immediate as a load from a constant pool.
11393 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
11394   if (!Subtarget->hasVFP3())
11395     return false;
11396   if (VT == MVT::f32)
11397     return ARM_AM::getFP32Imm(Imm) != -1;
11398   if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
11399     return ARM_AM::getFP64Imm(Imm) != -1;
11400   return false;
11401 }
11402
11403 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
11404 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
11405 /// specified in the intrinsic calls.
11406 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
11407                                            const CallInst &I,
11408                                            unsigned Intrinsic) const {
11409   switch (Intrinsic) {
11410   case Intrinsic::arm_neon_vld1:
11411   case Intrinsic::arm_neon_vld2:
11412   case Intrinsic::arm_neon_vld3:
11413   case Intrinsic::arm_neon_vld4:
11414   case Intrinsic::arm_neon_vld2lane:
11415   case Intrinsic::arm_neon_vld3lane:
11416   case Intrinsic::arm_neon_vld4lane: {
11417     Info.opc = ISD::INTRINSIC_W_CHAIN;
11418     // Conservatively set memVT to the entire set of vectors loaded.
11419     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
11420     uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
11421     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11422     Info.ptrVal = I.getArgOperand(0);
11423     Info.offset = 0;
11424     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
11425     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
11426     Info.vol = false; // volatile loads with NEON intrinsics not supported
11427     Info.readMem = true;
11428     Info.writeMem = false;
11429     return true;
11430   }
11431   case Intrinsic::arm_neon_vst1:
11432   case Intrinsic::arm_neon_vst2:
11433   case Intrinsic::arm_neon_vst3:
11434   case Intrinsic::arm_neon_vst4:
11435   case Intrinsic::arm_neon_vst2lane:
11436   case Intrinsic::arm_neon_vst3lane:
11437   case Intrinsic::arm_neon_vst4lane: {
11438     Info.opc = ISD::INTRINSIC_VOID;
11439     // Conservatively set memVT to the entire set of vectors stored.
11440     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
11441     unsigned NumElts = 0;
11442     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
11443       Type *ArgTy = I.getArgOperand(ArgI)->getType();
11444       if (!ArgTy->isVectorTy())
11445         break;
11446       NumElts += DL.getTypeAllocSize(ArgTy) / 8;
11447     }
11448     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11449     Info.ptrVal = I.getArgOperand(0);
11450     Info.offset = 0;
11451     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
11452     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
11453     Info.vol = false; // volatile stores with NEON intrinsics not supported
11454     Info.readMem = false;
11455     Info.writeMem = true;
11456     return true;
11457   }
11458   case Intrinsic::arm_ldaex:
11459   case Intrinsic::arm_ldrex: {
11460     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
11461     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
11462     Info.opc = ISD::INTRINSIC_W_CHAIN;
11463     Info.memVT = MVT::getVT(PtrTy->getElementType());
11464     Info.ptrVal = I.getArgOperand(0);
11465     Info.offset = 0;
11466     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
11467     Info.vol = true;
11468     Info.readMem = true;
11469     Info.writeMem = false;
11470     return true;
11471   }
11472   case Intrinsic::arm_stlex:
11473   case Intrinsic::arm_strex: {
11474     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
11475     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11476     Info.opc = ISD::INTRINSIC_W_CHAIN;
11477     Info.memVT = MVT::getVT(PtrTy->getElementType());
11478     Info.ptrVal = I.getArgOperand(1);
11479     Info.offset = 0;
11480     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
11481     Info.vol = true;
11482     Info.readMem = false;
11483     Info.writeMem = true;
11484     return true;
11485   }
11486   case Intrinsic::arm_stlexd:
11487   case Intrinsic::arm_strexd: {
11488     Info.opc = ISD::INTRINSIC_W_CHAIN;
11489     Info.memVT = MVT::i64;
11490     Info.ptrVal = I.getArgOperand(2);
11491     Info.offset = 0;
11492     Info.align = 8;
11493     Info.vol = true;
11494     Info.readMem = false;
11495     Info.writeMem = true;
11496     return true;
11497   }
11498   case Intrinsic::arm_ldaexd:
11499   case Intrinsic::arm_ldrexd: {
11500     Info.opc = ISD::INTRINSIC_W_CHAIN;
11501     Info.memVT = MVT::i64;
11502     Info.ptrVal = I.getArgOperand(0);
11503     Info.offset = 0;
11504     Info.align = 8;
11505     Info.vol = true;
11506     Info.readMem = true;
11507     Info.writeMem = false;
11508     return true;
11509   }
11510   default:
11511     break;
11512   }
11513
11514   return false;
11515 }
11516
11517 /// \brief Returns true if it is beneficial to convert a load of a constant
11518 /// to just the constant itself.
11519 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
11520                                                           Type *Ty) const {
11521   assert(Ty->isIntegerTy());
11522
11523   unsigned Bits = Ty->getPrimitiveSizeInBits();
11524   if (Bits == 0 || Bits > 32)
11525     return false;
11526   return true;
11527 }
11528
11529 bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; }
11530
11531 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
11532                                         ARM_MB::MemBOpt Domain) const {
11533   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
11534
11535   // First, if the target has no DMB, see what fallback we can use.
11536   if (!Subtarget->hasDataBarrier()) {
11537     // Some ARMv6 cpus can support data barriers with an mcr instruction.
11538     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
11539     // here.
11540     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
11541       Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
11542       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
11543                         Builder.getInt32(0), Builder.getInt32(7),
11544                         Builder.getInt32(10), Builder.getInt32(5)};
11545       return Builder.CreateCall(MCR, args);
11546     } else {
11547       // Instead of using barriers, atomic accesses on these subtargets use
11548       // libcalls.
11549       llvm_unreachable("makeDMB on a target so old that it has no barriers");
11550     }
11551   } else {
11552     Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
11553     // Only a full system barrier exists in the M-class architectures.
11554     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
11555     Constant *CDomain = Builder.getInt32(Domain);
11556     return Builder.CreateCall(DMB, CDomain);
11557   }
11558 }
11559
11560 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
11561 Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
11562                                          AtomicOrdering Ord, bool IsStore,
11563                                          bool IsLoad) const {
11564   if (!getInsertFencesForAtomic())
11565     return nullptr;
11566
11567   switch (Ord) {
11568   case NotAtomic:
11569   case Unordered:
11570     llvm_unreachable("Invalid fence: unordered/non-atomic");
11571   case Monotonic:
11572   case Acquire:
11573     return nullptr; // Nothing to do
11574   case SequentiallyConsistent:
11575     if (!IsStore)
11576       return nullptr; // Nothing to do
11577     /*FALLTHROUGH*/
11578   case Release:
11579   case AcquireRelease:
11580     if (Subtarget->isSwift())
11581       return makeDMB(Builder, ARM_MB::ISHST);
11582     // FIXME: add a comment with a link to documentation justifying this.
11583     else
11584       return makeDMB(Builder, ARM_MB::ISH);
11585   }
11586   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
11587 }
11588
11589 Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
11590                                           AtomicOrdering Ord, bool IsStore,
11591                                           bool IsLoad) const {
11592   if (!getInsertFencesForAtomic())
11593     return nullptr;
11594
11595   switch (Ord) {
11596   case NotAtomic:
11597   case Unordered:
11598     llvm_unreachable("Invalid fence: unordered/not-atomic");
11599   case Monotonic:
11600   case Release:
11601     return nullptr; // Nothing to do
11602   case Acquire:
11603   case AcquireRelease:
11604   case SequentiallyConsistent:
11605     return makeDMB(Builder, ARM_MB::ISH);
11606   }
11607   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
11608 }
11609
11610 // Loads and stores less than 64-bits are already atomic; ones above that
11611 // are doomed anyway, so defer to the default libcall and blame the OS when
11612 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
11613 // anything for those.
11614 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
11615   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
11616   return (Size == 64) && !Subtarget->isMClass();
11617 }
11618
11619 // Loads and stores less than 64-bits are already atomic; ones above that
11620 // are doomed anyway, so defer to the default libcall and blame the OS when
11621 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
11622 // anything for those.
11623 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
11624 // guarantee, see DDI0406C ARM architecture reference manual,
11625 // sections A8.8.72-74 LDRD)
11626 bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
11627   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
11628   return (Size == 64) && !Subtarget->isMClass();
11629 }
11630
11631 // For the real atomic operations, we have ldrex/strex up to 32 bits,
11632 // and up to 64 bits on the non-M profiles
11633 TargetLoweringBase::AtomicRMWExpansionKind
11634 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
11635   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
11636   return (Size <= (Subtarget->isMClass() ? 32U : 64U))
11637              ? AtomicRMWExpansionKind::LLSC
11638              : AtomicRMWExpansionKind::None;
11639 }
11640
11641 // This has so far only been implemented for MachO.
11642 bool ARMTargetLowering::useLoadStackGuardNode() const {
11643   return Subtarget->isTargetMachO();
11644 }
11645
11646 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
11647                                                   unsigned &Cost) const {
11648   // If we do not have NEON, vector types are not natively supported.
11649   if (!Subtarget->hasNEON())
11650     return false;
11651
11652   // Floating point values and vector values map to the same register file.
11653   // Therefore, although we could do a store extract of a vector type, this is
11654   // better to leave at float as we have more freedom in the addressing mode for
11655   // those.
11656   if (VectorTy->isFPOrFPVectorTy())
11657     return false;
11658
11659   // If the index is unknown at compile time, this is very expensive to lower
11660   // and it is not possible to combine the store with the extract.
11661   if (!isa<ConstantInt>(Idx))
11662     return false;
11663
11664   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
11665   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
11666   // We can do a store + vector extract on any vector that fits perfectly in a D
11667   // or Q register.
11668   if (BitWidth == 64 || BitWidth == 128) {
11669     Cost = 0;
11670     return true;
11671   }
11672   return false;
11673 }
11674
11675 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
11676                                          AtomicOrdering Ord) const {
11677   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
11678   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
11679   bool IsAcquire = isAtLeastAcquire(Ord);
11680
11681   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
11682   // intrinsic must return {i32, i32} and we have to recombine them into a
11683   // single i64 here.
11684   if (ValTy->getPrimitiveSizeInBits() == 64) {
11685     Intrinsic::ID Int =
11686         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
11687     Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
11688
11689     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
11690     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
11691
11692     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
11693     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
11694     if (!Subtarget->isLittle())
11695       std::swap (Lo, Hi);
11696     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
11697     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
11698     return Builder.CreateOr(
11699         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
11700   }
11701
11702   Type *Tys[] = { Addr->getType() };
11703   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
11704   Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
11705
11706   return Builder.CreateTruncOrBitCast(
11707       Builder.CreateCall(Ldrex, Addr),
11708       cast<PointerType>(Addr->getType())->getElementType());
11709 }
11710
11711 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
11712                                                Value *Addr,
11713                                                AtomicOrdering Ord) const {
11714   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
11715   bool IsRelease = isAtLeastRelease(Ord);
11716
11717   // Since the intrinsics must have legal type, the i64 intrinsics take two
11718   // parameters: "i32, i32". We must marshal Val into the appropriate form
11719   // before the call.
11720   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
11721     Intrinsic::ID Int =
11722         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
11723     Function *Strex = Intrinsic::getDeclaration(M, Int);
11724     Type *Int32Ty = Type::getInt32Ty(M->getContext());
11725
11726     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
11727     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
11728     if (!Subtarget->isLittle())
11729       std::swap (Lo, Hi);
11730     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
11731     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
11732   }
11733
11734   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
11735   Type *Tys[] = { Addr->getType() };
11736   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
11737
11738   return Builder.CreateCall(
11739       Strex, {Builder.CreateZExtOrBitCast(
11740                   Val, Strex->getFunctionType()->getParamType(0)),
11741               Addr});
11742 }
11743
11744 /// \brief Lower an interleaved load into a vldN intrinsic.
11745 ///
11746 /// E.g. Lower an interleaved load (Factor = 2):
11747 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
11748 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
11749 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
11750 ///
11751 ///      Into:
11752 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
11753 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
11754 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
11755 bool ARMTargetLowering::lowerInterleavedLoad(
11756     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
11757     ArrayRef<unsigned> Indices, unsigned Factor) const {
11758   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11759          "Invalid interleave factor");
11760   assert(!Shuffles.empty() && "Empty shufflevector input");
11761   assert(Shuffles.size() == Indices.size() &&
11762          "Unmatched number of shufflevectors and indices");
11763
11764   VectorType *VecTy = Shuffles[0]->getType();
11765   Type *EltTy = VecTy->getVectorElementType();
11766
11767   const DataLayout &DL = LI->getModule()->getDataLayout();
11768   unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
11769   bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
11770
11771   // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
11772   // support i64/f64 element).
11773   if ((VecSize != 64 && VecSize != 128) || EltIs64Bits)
11774     return false;
11775
11776   // A pointer vector can not be the return type of the ldN intrinsics. Need to
11777   // load integer vectors first and then convert to pointer vectors.
11778   if (EltTy->isPointerTy())
11779     VecTy =
11780         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
11781
11782   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
11783                                             Intrinsic::arm_neon_vld3,
11784                                             Intrinsic::arm_neon_vld4};
11785
11786   Function *VldnFunc =
11787       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
11788
11789   IRBuilder<> Builder(LI);
11790   SmallVector<Value *, 2> Ops;
11791
11792   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
11793   Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
11794   Ops.push_back(Builder.getInt32(LI->getAlignment()));
11795
11796   CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
11797
11798   // Replace uses of each shufflevector with the corresponding vector loaded
11799   // by ldN.
11800   for (unsigned i = 0; i < Shuffles.size(); i++) {
11801     ShuffleVectorInst *SV = Shuffles[i];
11802     unsigned Index = Indices[i];
11803
11804     Value *SubVec = Builder.CreateExtractValue(VldN, Index);
11805
11806     // Convert the integer vector to pointer vector if the element is pointer.
11807     if (EltTy->isPointerTy())
11808       SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
11809
11810     SV->replaceAllUsesWith(SubVec);
11811   }
11812
11813   return true;
11814 }
11815
11816 /// \brief Get a mask consisting of sequential integers starting from \p Start.
11817 ///
11818 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
11819 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
11820                                    unsigned NumElts) {
11821   SmallVector<Constant *, 16> Mask;
11822   for (unsigned i = 0; i < NumElts; i++)
11823     Mask.push_back(Builder.getInt32(Start + i));
11824
11825   return ConstantVector::get(Mask);
11826 }
11827
11828 /// \brief Lower an interleaved store into a vstN intrinsic.
11829 ///
11830 /// E.g. Lower an interleaved store (Factor = 3):
11831 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
11832 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
11833 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
11834 ///
11835 ///      Into:
11836 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
11837 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
11838 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
11839 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
11840 ///
11841 /// Note that the new shufflevectors will be removed and we'll only generate one
11842 /// vst3 instruction in CodeGen.
11843 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
11844                                               ShuffleVectorInst *SVI,
11845                                               unsigned Factor) const {
11846   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11847          "Invalid interleave factor");
11848
11849   VectorType *VecTy = SVI->getType();
11850   assert(VecTy->getVectorNumElements() % Factor == 0 &&
11851          "Invalid interleaved store");
11852
11853   unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
11854   Type *EltTy = VecTy->getVectorElementType();
11855   VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
11856
11857   const DataLayout &DL = SI->getModule()->getDataLayout();
11858   unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
11859   bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
11860
11861   // Skip illegal sub vector types and vector types of i64/f64 element (vstN
11862   // doesn't support i64/f64 element).
11863   if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits)
11864     return false;
11865
11866   Value *Op0 = SVI->getOperand(0);
11867   Value *Op1 = SVI->getOperand(1);
11868   IRBuilder<> Builder(SI);
11869
11870   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
11871   // vectors to integer vectors.
11872   if (EltTy->isPointerTy()) {
11873     Type *IntTy = DL.getIntPtrType(EltTy);
11874
11875     // Convert to the corresponding integer vector.
11876     Type *IntVecTy =
11877         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
11878     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
11879     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
11880
11881     SubVecTy = VectorType::get(IntTy, NumSubElts);
11882   }
11883
11884   static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
11885                                        Intrinsic::arm_neon_vst3,
11886                                        Intrinsic::arm_neon_vst4};
11887   Function *VstNFunc = Intrinsic::getDeclaration(
11888       SI->getModule(), StoreInts[Factor - 2], SubVecTy);
11889
11890   SmallVector<Value *, 6> Ops;
11891
11892   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
11893   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
11894
11895   // Split the shufflevector operands into sub vectors for the new vstN call.
11896   for (unsigned i = 0; i < Factor; i++)
11897     Ops.push_back(Builder.CreateShuffleVector(
11898         Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
11899
11900   Ops.push_back(Builder.getInt32(SI->getAlignment()));
11901   Builder.CreateCall(VstNFunc, Ops);
11902   return true;
11903 }
11904
/// Base element kind shared by every member of a homogeneous aggregate.
enum HABaseType {
  HA_UNKNOWN = 0, ///< No base element kind has been recorded yet.
  HA_FLOAT = 1,   ///< Members are float.
  HA_DOUBLE = 2,  ///< Members are double.
  HA_VECT64 = 3,  ///< Members are 64-bit wide vectors.
  HA_VECT128 = 4  ///< Members are 128-bit wide vectors.
};
11912
11913 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
11914                                    uint64_t &Members) {
11915   if (auto *ST = dyn_cast<StructType>(Ty)) {
11916     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
11917       uint64_t SubMembers = 0;
11918       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
11919         return false;
11920       Members += SubMembers;
11921     }
11922   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
11923     uint64_t SubMembers = 0;
11924     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
11925       return false;
11926     Members += SubMembers * AT->getNumElements();
11927   } else if (Ty->isFloatTy()) {
11928     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
11929       return false;
11930     Members = 1;
11931     Base = HA_FLOAT;
11932   } else if (Ty->isDoubleTy()) {
11933     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
11934       return false;
11935     Members = 1;
11936     Base = HA_DOUBLE;
11937   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
11938     Members = 1;
11939     switch (Base) {
11940     case HA_FLOAT:
11941     case HA_DOUBLE:
11942       return false;
11943     case HA_VECT64:
11944       return VT->getBitWidth() == 64;
11945     case HA_VECT128:
11946       return VT->getBitWidth() == 128;
11947     case HA_UNKNOWN:
11948       switch (VT->getBitWidth()) {
11949       case 64:
11950         Base = HA_VECT64;
11951         return true;
11952       case 128:
11953         Base = HA_VECT128;
11954         return true;
11955       default:
11956         return false;
11957       }
11958     }
11959   }
11960
11961   return (Members > 0 && Members <= 4);
11962 }
11963
11964 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
11965 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
11966 /// passing according to AAPCS rules.
11967 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
11968     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
11969   if (getEffectiveCallingConv(CallConv, isVarArg) !=
11970       CallingConv::ARM_AAPCS_VFP)
11971     return false;
11972
11973   HABaseType Base = HA_UNKNOWN;
11974   uint64_t Members = 0;
11975   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
11976   DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
11977
11978   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
11979   return IsHA || IsIntArray;
11980 }