lib/Target/AArch64/AArch64ISelLowering.cpp

   1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that AArch64 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #define DEBUG_TYPE "aarch64-isel"
  16 #include "AArch64.h"
  17 #include "AArch64ISelLowering.h"
  18 #include "AArch64MachineFunctionInfo.h"
  19 #include "AArch64TargetMachine.h"
  20 #include "AArch64TargetObjectFile.h"
  21 #include "Utils/AArch64BaseInfo.h"
  22 #include "llvm/CodeGen/Analysis.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  28 #include "llvm/IR/CallingConv.h"
  29
  30 using namespace llvm;
  31
  32 static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
       // Builds the object-file lowering that the AArch64TargetLowering
       // constructor passes to its TargetLowering base. Only ELF subtargets are
       // handled: anything else trips the assertion below. The result is
       // allocated with `new` and returned as a raw pointer.
  33   assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() &&
  34           "unknown subtarget type");
  35   return new AArch64ElfTargetObjectFile();
  36 }
  37
  38 AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
  39   : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
       // Sets up the lowering tables for AArch64. In order, this constructor:
       //   * maps value types to register classes (FP types only when the
       //     subtarget has FPARMv8, vector types only when it has NEON);
       //   * registers the node kinds for which a target DAG combine is wanted;
       //   * records, per operation and value type, whether the node is Legal,
       //     Custom, Expand or Promote;
       //   * names the stack-pointer and exception pointer/selector registers.
       // TM supplies the subtarget queried below and the itinerary data stored
       // in Itins.
  40
  41   const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
  42
  43   // SIMD compares set the entire lane's bits to 1
  44   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  45
  46   // Scalar register <-> type mapping
  47   addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
  48   addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
  49
  50   if (Subtarget->hasFPARMv8()) {
  51     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
  52     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
  53     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
  54     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  55   }
  56
  57   if (Subtarget->hasNEON()) {
  58     // And the vectors
  59     addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
  60     addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
  61     addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
  62     addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
  63     addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
  64     addRegisterClass(MVT::v8i8,  &AArch64::FPR64RegClass);
  65     addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
  66     addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
         // NOTE(review): v1i64 was already mapped to FPR64 a few lines above;
         // this second registration looks redundant.
  67     addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
  68     addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
  69     addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
  70     addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
  71     addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
  72     addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
  73     addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
  74     addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
  75   }
  76
  77   computeRegisterProperties();
  78
  79   // We combine OR nodes for bitfield and NEON BSL operations.
  80   setTargetDAGCombine(ISD::OR);
  81
  82   setTargetDAGCombine(ISD::AND);
  83   setTargetDAGCombine(ISD::SRA);
  84   setTargetDAGCombine(ISD::SRL);
  85   setTargetDAGCombine(ISD::SHL);
  86
  87   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  88   setTargetDAGCombine(ISD::INTRINSIC_VOID);
  89   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  90
  91   // AArch64 does not have i1 loads, or much of anything for i1 really.
  92   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  93   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
  94   setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
  95
  96   setStackPointerRegisterToSaveRestore(AArch64::XSP);
  97   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  98   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  99   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
 100
 101   // We'll lower globals to wrappers for selection.
 102   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
 103   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 104
 105   // A64 instructions have the comparison predicate attached to the user of the
 106   // result, but having a separate comparison is valuable for matching.
 107   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
 108   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
 109   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
 110   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
 111
 112   setOperationAction(ISD::SELECT, MVT::i32, Custom);
 113   setOperationAction(ISD::SELECT, MVT::i64, Custom);
 114   setOperationAction(ISD::SELECT, MVT::f32, Custom);
 115   setOperationAction(ISD::SELECT, MVT::f64, Custom);
 116
 117   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 118   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
 119   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 120   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
 121
 122   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 123
 124   setOperationAction(ISD::SETCC, MVT::i32, Custom);
 125   setOperationAction(ISD::SETCC, MVT::i64, Custom);
 126   setOperationAction(ISD::SETCC, MVT::f32, Custom);
 127   setOperationAction(ISD::SETCC, MVT::f64, Custom);
 128
 129   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 130   setOperationAction(ISD::JumpTable, MVT::i32, Custom);
 131   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 132
 133   setOperationAction(ISD::VASTART, MVT::Other, Custom);
 134   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
 135   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 136   setOperationAction(ISD::VAARG, MVT::Other, Expand);
 137
 138   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
 139   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
 140
       // Integer operations marked Expand for both i32 and i64: rotate-left,
       // remainders and combined div/rem, the wide LOHI multiplies, and
       // population count.
 141   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 142   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 143
 144   setOperationAction(ISD::UREM, MVT::i32, Expand);
 145   setOperationAction(ISD::UREM, MVT::i64, Expand);
 146   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 147   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
 148
 149   setOperationAction(ISD::SREM, MVT::i32, Expand);
 150   setOperationAction(ISD::SREM, MVT::i64, Expand);
 151   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
 152   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
 153
 154   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 155   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 156   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 157   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 158
 159   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 160   setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 161
 162   // Legal floating-point operations.
 163   setOperationAction(ISD::FABS, MVT::f32, Legal);
 164   setOperationAction(ISD::FABS, MVT::f64, Legal);
 165
 166   setOperationAction(ISD::FCEIL, MVT::f32, Legal);
 167   setOperationAction(ISD::FCEIL, MVT::f64, Legal);
 168
 169   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 170   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
 171
 172   setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
 173   setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
 174
 175   setOperationAction(ISD::FNEG, MVT::f32, Legal);
 176   setOperationAction(ISD::FNEG, MVT::f64, Legal);
 177
 178   setOperationAction(ISD::FRINT, MVT::f32, Legal);
 179   setOperationAction(ISD::FRINT, MVT::f64, Legal);
 180
 181   setOperationAction(ISD::FSQRT, MVT::f32, Legal);
 182   setOperationAction(ISD::FSQRT, MVT::f64, Legal);
 183
 184   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 185   setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
 186
 187   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 188   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 189   setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
 190
 191   // Illegal floating-point operations.
 192   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 193   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 194
 195   setOperationAction(ISD::FCOS, MVT::f32, Expand);
 196   setOperationAction(ISD::FCOS, MVT::f64, Expand);
 197
 198   setOperationAction(ISD::FEXP, MVT::f32, Expand);
 199   setOperationAction(ISD::FEXP, MVT::f64, Expand);
 200
 201   setOperationAction(ISD::FEXP2, MVT::f32, Expand);
 202   setOperationAction(ISD::FEXP2, MVT::f64, Expand);
 203
 204   setOperationAction(ISD::FLOG, MVT::f32, Expand);
 205   setOperationAction(ISD::FLOG, MVT::f64, Expand);
 206
 207   setOperationAction(ISD::FLOG2, MVT::f32, Expand);
 208   setOperationAction(ISD::FLOG2, MVT::f64, Expand);
 209
 210   setOperationAction(ISD::FLOG10, MVT::f32, Expand);
 211   setOperationAction(ISD::FLOG10, MVT::f64, Expand);
 212
 213   setOperationAction(ISD::FPOW, MVT::f32, Expand);
 214   setOperationAction(ISD::FPOW, MVT::f64, Expand);
 215
 216   setOperationAction(ISD::FPOWI, MVT::f32, Expand);
 217   setOperationAction(ISD::FPOWI, MVT::f64, Expand);
 218
 219   setOperationAction(ISD::FREM, MVT::f32, Expand);
 220   setOperationAction(ISD::FREM, MVT::f64, Expand);
 221
 222   setOperationAction(ISD::FSIN, MVT::f32, Expand);
 223   setOperationAction(ISD::FSIN, MVT::f64, Expand);
 224
 225   setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 226   setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 227
 228   // Virtually no operation on f128 is legal, but LLVM can't expand them when
 229   // there's a valid register class, so we need custom operations in most cases.
 230   setOperationAction(ISD::FABS,       MVT::f128, Expand);
 231   setOperationAction(ISD::FADD,       MVT::f128, Custom);
 232   setOperationAction(ISD::FCOPYSIGN,  MVT::f128, Expand);
 233   setOperationAction(ISD::FCOS,       MVT::f128, Expand);
 234   setOperationAction(ISD::FDIV,       MVT::f128, Custom);
 235   setOperationAction(ISD::FMA,        MVT::f128, Expand);
 236   setOperationAction(ISD::FMUL,       MVT::f128, Custom);
 237   setOperationAction(ISD::FNEG,       MVT::f128, Expand);
       // NOTE(review): FP_EXTEND on f128 is marked Expand here and then Custom
       // at the end of this group; presumably the later call is the one that
       // takes effect -- confirm and drop whichever is unintended.
 238   setOperationAction(ISD::FP_EXTEND,  MVT::f128, Expand);
 239   setOperationAction(ISD::FP_ROUND,   MVT::f128, Expand);
 240   setOperationAction(ISD::FPOW,       MVT::f128, Expand);
 241   setOperationAction(ISD::FREM,       MVT::f128, Expand);
 242   setOperationAction(ISD::FRINT,      MVT::f128, Expand);
 243   setOperationAction(ISD::FSIN,       MVT::f128, Expand);
 244   setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
 245   setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
 246   setOperationAction(ISD::FSUB,       MVT::f128, Custom);
 247   setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
 248   setOperationAction(ISD::SETCC,      MVT::f128, Custom);
 249   setOperationAction(ISD::BR_CC,      MVT::f128, Custom);
 250   setOperationAction(ISD::SELECT,     MVT::f128, Expand);
 251   setOperationAction(ISD::SELECT_CC,  MVT::f128, Custom);
 252   setOperationAction(ISD::FP_EXTEND,  MVT::f128, Custom);
 253
 254   // Lowering for many of the conversions is actually specified by the non-f128
 255   // type. The LowerXXX function will be trivial when f128 isn't involved.
 256   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 257   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 258   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
 259   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 260   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 261   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
 262   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 263   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 264   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
 265   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 266   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 267   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
 268   setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
 269   setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
 270
 271   // This prevents LLVM trying to compress double constants into a floating
 272   // constant-pool entry and trying to load from there. It's of doubtful benefit
 273   // for A64: we'd need LDR followed by FCVT, I believe.
 274   setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
 275   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
 276   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
 277
       // Every narrowing floating-point truncating store is expanded.
 278   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
 279   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
 280   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 281   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 282   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 283   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 284
 285   setExceptionPointerRegister(AArch64::X0);
 286   setExceptionSelectorRegister(AArch64::X1);
 287
       // Everything below configures vector operations and only applies when the
       // subtarget has NEON.
 288   if (Subtarget->hasNEON()) {
 289     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
 290     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
 291     setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
 292     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
 293     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
 294     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
 295     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
 296     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
 297     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
 298     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
 299     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
 300     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
 301     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
 302     setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
 303     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 304
 305     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
 306     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
 307     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
 308     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
 309     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
 310     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
 311     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
 312     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
 313     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
 314     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
 315     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
 316     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 317
         // NOTE(review): v8i16, v4i32 and v2i64 each appear twice in this
         // CONCAT_VECTORS group with the same action; the repeats look redundant.
 318     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
 319     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
 320     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
 321     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
 322     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
 323     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
 324     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
 325     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
 326     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
 327
 328     setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
 329     setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
 330     setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
 331     setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
 332     setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
 333     setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
 334     setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
 335     setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
 336     setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
 337     setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
 338     setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
 339     setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
 340
         // Vector rounding operations are Legal for v2f32, v4f32, v1f64, v2f64.
 341     setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
 342     setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
 343     setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
 344     setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
 345
 346     setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
 347     setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
 348     setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
 349     setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
 350
 351     setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
 352     setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
 353     setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
 354     setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
 355
 356     setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
 357     setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
 358     setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
 359     setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
 360
 361     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
 362     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
 363     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
 364     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
 365
 366     setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
 367     setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
 368     setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
 369     setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
 370
 371     // Vector ExtLoad and TruncStore are expanded.
         // The outer loop visits every vector value type VT; the inner loop
         // pairs it with every vector value type VT1 as a candidate narrower
         // store type.
 372     for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
 373          I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
 374       MVT VT = (MVT::SimpleValueType) I;
 375       setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
 376       setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
 377       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
 378       for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE;
 379            II <= MVT::LAST_VECTOR_VALUETYPE; ++II) {
 380         MVT VT1 = (MVT::SimpleValueType) II;
 381         // A TruncStore has two vector types of the same number of elements
 382         // and different element sizes.
 383         if (VT.getVectorNumElements() == VT1.getVectorNumElements() &&
 384             VT.getVectorElementType().getSizeInBits()
 385                 > VT1.getVectorElementType().getSizeInBits())
 386           setTruncStoreAction(VT, VT1, Expand);
 387       }
 388     }
 389
 390     // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
 391     // FIXME: For a v2i64 multiply, we copy VPR to GPR and do 2 i64 multiplies,
 392     // and then copy back to VPR. This solution may be optimized by Following 3
 393     // NEON instructions:
 394     //        pmull  v2.1q, v0.1d, v1.1d
 395     //        pmull2 v3.1q, v0.2d, v1.2d
 396     //        ins    v2.d[1], v3.d[0]
 397     // As currently we can't verify the correctness of such assumption, we can
 398     // do such optimization in the future.
 399     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 400     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
 401   }
 402 }
 403
 404 EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
       // Reports the result type of a SETCC whose operands have type VT.
       // Non-vector VTs always yield i32; vector VTs yield the same vector with
       // its element type changed to an integer type. The LLVMContext argument
       // is unnamed and unused.
 405   // It's reasonably important that this value matches the "natural" legal
 406   // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
 407   // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
 408   if (!VT.isVector()) return MVT::i32;
 409   return VT.changeVectorElementTypeToInteger();
 410 }
 411
 412 static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
 413                                   unsigned &LdrOpc,
 414                                   unsigned &StrOpc) {
       // Picks the exclusive-load and exclusive-store opcodes for an atomic
       // access of Size bytes with memory ordering Ord, returning them through
       // LdrOpc and StrOpc.
       //
       // Each table below holds the byte, hword, word and dword variants in that
       // order, so it is indexed by Log2_32(Size) (Size 1, 2, 4, 8).
 415   static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
 416                                        AArch64::LDXR_word, AArch64::LDXR_dword};
 417   static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
 418                                      AArch64::LDAXR_word, AArch64::LDAXR_dword};
 419   static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
 420                                        AArch64::STXR_word, AArch64::STXR_dword};
 421   static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
 422                                      AArch64::STLXR_word, AArch64::STLXR_dword};
 423
 424   const unsigned *LoadOps, *StoreOps;
       // Acquire-flavoured orderings use the LDAXR loads, everything else LDXR.
 425   if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
 426     LoadOps = LoadAcqs;
 427   else
 428     LoadOps = LoadBares;
 429
       // Release-flavoured orderings use the STLXR stores, everything else STXR.
 430   if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
 431     StoreOps = StoreRels;
 432   else
 433     StoreOps = StoreBares;
 434
       // Guards the table lookups below: only sizes 1, 2, 4 and 8 are valid.
 435   assert(isPowerOf2_32(Size) && Size <= 8 &&
 436          "unsupported size for atomic binary op!");
 437
 438   LdrOpc = LoadOps[Log2_32(Size)];
 439   StrOpc = StoreOps[Log2_32(Size)];
 440 }
 441
 442 // FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really
 443 // have value type mapped, and they are both being defined as MVT::untyped.
 444 // Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost
 445 // would fail to figure out the register pressure correctly.
     //
     // Returns the representative register class and its cost for VT:
     //   * v4i64 -> QPairRegClass, cost 2
     //   * v8i64 -> QQuadRegClass, cost 4
     // Every other type is forwarded to TargetLowering::findRepresentativeClass.
 446 std::pair<const TargetRegisterClass*, uint8_t>
 447 AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
 448   const TargetRegisterClass *RRC = 0;
 449   uint8_t Cost = 1;
 450   switch (VT.SimpleTy) {
 451   default:
 452     return TargetLowering::findRepresentativeClass(VT);
 453   case MVT::v4i64:
 454     RRC = &AArch64::QPairRegClass;
 455     Cost = 2;
 456     break;
 457   case MVT::v8i64:
 458     RRC = &AArch64::QQuadRegClass;
 459     Cost = 4;
 460     break;
 461   }
 462   return std::make_pair(RRC, Cost);
 463 }
 464
 465 MachineBasicBlock *
 466 AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 467                                         unsigned Size,
 468                                         unsigned BinOpcode) const {
       // Expands an atomic read-modify-write pseudo-instruction MI (located in
       // BB) into a load-exclusive / operate / store-exclusive retry loop.
       //
       // MI's operands are read as: 0 = destination register, 1 = pointer
       // register, 2 = the other operand of the operation, 3 = AtomicOrdering
       // immediate. Size selects the access width (see getExclusiveOperation)
       // and BinOpcode is the instruction applied between load and store.
       //
       // BB is split: a new loop block and a new exit block are inserted after
       // it, the instructions following MI move to the exit block, and MI is
       // erased. Returns the exit block.
 469   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
 470   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 471
 472   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 473   MachineFunction *MF = BB->getParent();
 474   MachineFunction::iterator It = BB;
 475   ++It;
 476
 477   unsigned dest = MI->getOperand(0).getReg();
 478   unsigned ptr = MI->getOperand(1).getReg();
 479   unsigned incr = MI->getOperand(2).getReg();
 480   AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
 481   DebugLoc dl = MI->getDebugLoc();
 482
 483   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 484
 485   unsigned ldrOpc, strOpc;
 486   getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
 487
 488   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 489   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 490   MF->insert(It, loopMBB);
 491   MF->insert(It, exitMBB);
 492
 493   // Transfer the remainder of BB and its successor edges to exitMBB.
 494   exitMBB->splice(exitMBB->begin(), BB,
 495                   llvm::next(MachineBasicBlock::iterator(MI)),
 496                   BB->end());
 497   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 498
       // For a swap (BinOpcode == 0) no new value is computed, so incr itself is
       // what gets stored; otherwise a fresh virtual register receives the
       // result of the operation.
 499   const TargetRegisterClass *TRC
 500     = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
 501   unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
 502
 503   //  thisMBB:
 504   //   ...
 505   //   fallthrough --> loopMBB
 506   BB->addSuccessor(loopMBB);
 507
 508   //  loopMBB:
 509   //   ldxr dest, ptr
 510   //   <binop> scratch, dest, incr
 511   //   stxr stxr_status, scratch, ptr
 512   //   cbnz stxr_status, loopMBB
 513   //   fallthrough --> exitMBB
 514   BB = loopMBB;
 515   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
 516   if (BinOpcode) {
 517     // All arithmetic operations we'll be creating are designed to take an extra
 518     // shift or extend operand, which we can conveniently set to zero.
 519
 520     // Operand order needs to go the other way for NAND.
 521     if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
 522       BuildMI(BB, dl, TII->get(BinOpcode), scratch)
 523         .addReg(incr).addReg(dest).addImm(0);
 524     else
 525       BuildMI(BB, dl, TII->get(BinOpcode), scratch)
 526         .addReg(dest).addReg(incr).addImm(0);
 527   }
 528
 529   // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
 530   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
 531   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 532
       // Store-exclusive writes its status to stxr_status; a non-zero status
       // branches back to retry the loop.
 533   BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
 534   BuildMI(BB, dl, TII->get(AArch64::CBNZw))
 535     .addReg(stxr_status).addMBB(loopMBB);
 536
 537   BB->addSuccessor(loopMBB);
 538   BB->addSuccessor(exitMBB);
 539
 540   //  exitMBB:
 541   //   ...
 542   BB = exitMBB;
 543
 544   MI->eraseFromParent();   // The instruction is gone now.
 545
 546   return BB;
 547 }
 548
 549 MachineBasicBlock *
 550 AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
 551                                               MachineBasicBlock *BB,
 552                                               unsigned Size,
 553                                               unsigned CmpOp,
 554                                               A64CC::CondCodes Cond) const {
       // Expands an atomic min/max-style pseudo-instruction MI (located in BB)
       // into a load-exclusive / compare / conditional-select / store-exclusive
       // retry loop.
       //
       // MI's operands are read as: 0 = destination register, 1 = pointer
       // register, 2 = the value compared against memory, 3 = AtomicOrdering
       // immediate. Size selects the access width, CmpOp is the compare opcode
       // to emit and Cond is the condition under which the loaded value is kept
       // rather than replaced by operand 2.
       //
       // BB is split: a new loop block and a new exit block are inserted after
       // it, the instructions following MI move to the exit block, and MI is
       // erased. Returns the exit block.
 555   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 556
 557   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 558   MachineFunction *MF = BB->getParent();
 559   MachineFunction::iterator It = BB;
 560   ++It;
 561
 562   unsigned dest = MI->getOperand(0).getReg();
 563   unsigned ptr = MI->getOperand(1).getReg();
 564   unsigned incr = MI->getOperand(2).getReg();
 565   AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
 566
       // oldval is just another name for dest: the value loaded from memory.
 567   unsigned oldval = dest;
 568   DebugLoc dl = MI->getDebugLoc();
 569
 570   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 571   const TargetRegisterClass *TRC, *TRCsp;
 572   if (Size == 8) {
 573     TRC = &AArch64::GPR64RegClass;
 574     TRCsp = &AArch64::GPR64xspRegClass;
 575   } else {
 576     TRC = &AArch64::GPR32RegClass;
 577     TRCsp = &AArch64::GPR32wspRegClass;
 578   }
 579
 580   unsigned ldrOpc, strOpc;
 581   getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
 582
 583   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 584   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 585   MF->insert(It, loopMBB);
 586   MF->insert(It, exitMBB);
 587
 588   // Transfer the remainder of BB and its successor edges to exitMBB.
 589   exitMBB->splice(exitMBB->begin(), BB,
 590                   llvm::next(MachineBasicBlock::iterator(MI)),
 591                   BB->end());
 592   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 593
 594   unsigned scratch = MRI.createVirtualRegister(TRC);
 595   MRI.constrainRegClass(scratch, TRCsp);
 596
 597   //  thisMBB:
 598   //   ...
 599   //   fallthrough --> loopMBB
 600   BB->addSuccessor(loopMBB);
 601
 602   //  loopMBB:
 603   //   ldxr dest, ptr
 604   //   cmp incr, dest (, sign extend if necessary)
 605   //   csel scratch, dest, incr, cond
 606   //   stxr stxr_status, scratch, ptr
 607   //   cbnz stxr_status, loopMBB
 608   //   fallthrough --> exitMBB
 609   BB = loopMBB;
 610   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
 611
 612   // Build the compare and conditional-select (CSEL) instructions.
 613   MRI.constrainRegClass(incr, TRCsp);
 614   BuildMI(BB, dl, TII->get(CmpOp))
 615     .addReg(incr).addReg(oldval).addImm(0);
 616
 617   BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
 618           scratch)
 619     .addReg(oldval).addReg(incr).addImm(Cond);
 620
 621   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
 622   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 623
       // Store-exclusive writes its status to stxr_status; a non-zero status
       // branches back to retry the loop.
 624   BuildMI(BB, dl, TII->get(strOpc), stxr_status)
 625     .addReg(scratch).addReg(ptr);
 626   BuildMI(BB, dl, TII->get(AArch64::CBNZw))
 627     .addReg(stxr_status).addMBB(loopMBB);
 628
 629   BB->addSuccessor(loopMBB);
 630   BB->addSuccessor(exitMBB);
 631
 632   //  exitMBB:
 633   //   ...
 634   BB = exitMBB;
 635
 636   MI->eraseFromParent();   // The instruction is gone now.
 637
 638   return BB;
 639 }
 640
 641 MachineBasicBlock *
 642 AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
 643                                          MachineBasicBlock *BB,
 644                                          unsigned Size) const {
       // Expands an atomic compare-and-swap pseudo-instruction MI (located in
       // BB) into a two-block load-exclusive / compare / store-exclusive loop.
       //
       // MI's operands are read as: 0 = destination register (receives the
       // loaded value), 1 = pointer register, 2 = expected old value, 3 = new
       // value to store, 4 = AtomicOrdering immediate. Size selects the access
       // width.
       //
       // BB is split: loop1, loop2 and exit blocks are inserted after it, the
       // instructions following MI move to the exit block, and MI is erased.
       // Returns the exit block.
 645   unsigned dest    = MI->getOperand(0).getReg();
 646   unsigned ptr     = MI->getOperand(1).getReg();
 647   unsigned oldval  = MI->getOperand(2).getReg();
 648   unsigned newval  = MI->getOperand(3).getReg();
 649   AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
 650   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 651   DebugLoc dl = MI->getDebugLoc();
 652
 653   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 654   const TargetRegisterClass *TRCsp;
 655   TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
 656
 657   unsigned ldrOpc, strOpc;
 658   getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
 659
 660   MachineFunction *MF = BB->getParent();
 661   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 662   MachineFunction::iterator It = BB;
 663   ++It; // insert the new blocks after the current block
 664
 665   MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
 666   MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
 667   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 668   MF->insert(It, loop1MBB);
 669   MF->insert(It, loop2MBB);
 670   MF->insert(It, exitMBB);
 671
 672   // Transfer the remainder of BB and its successor edges to exitMBB.
 673   exitMBB->splice(exitMBB->begin(), BB,
 674                   llvm::next(MachineBasicBlock::iterator(MI)),
 675                   BB->end());
 676   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 677
 678   //  thisMBB:
 679   //   ...
 680   //   fallthrough --> loop1MBB
 681   BB->addSuccessor(loop1MBB);
 682
 683   // loop1MBB:
 684   //   ldxr dest, [ptr]
 685   //   cmp dest, oldval
 686   //   b.ne exitMBB
 687   BB = loop1MBB;
 688   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
 689
 690   unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
 691   MRI.constrainRegClass(dest, TRCsp);
 692   BuildMI(BB, dl, TII->get(CmpOp))
 693     .addReg(dest).addReg(oldval).addImm(0);
 694   BuildMI(BB, dl, TII->get(AArch64::Bcc))
 695     .addImm(A64CC::NE).addMBB(exitMBB);
 696   BB->addSuccessor(loop2MBB);
 697   BB->addSuccessor(exitMBB);
 698
 699   // loop2MBB:
 700   //   stxr stxr_status, newval, [ptr]
 701   //   cbnz stxr_status, loop1MBB
 702   BB = loop2MBB;
 703   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
 704   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 705
 706   BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
 707   BuildMI(BB, dl, TII->get(AArch64::CBNZw))
 708     .addReg(stxr_status).addMBB(loop1MBB);
 709   BB->addSuccessor(loop1MBB);
 710   BB->addSuccessor(exitMBB);
 711
 712   //  exitMBB:
 713   //   ...
 714   BB = exitMBB;
 715
 716   MI->eraseFromParent();   // The instruction is gone now.
 717
 718   return BB;
 719 }
 720
 721 MachineBasicBlock *
 722 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
 723                                     MachineBasicBlock *MBB) const {
 724   // We materialise the F128CSEL pseudo-instruction using conditional branches
 725   // and loads, giving an instruciton sequence like:
 726   //     str q0, [sp]
 727   //     b.ne IfTrue
 728   //     b Finish
 729   // IfTrue:
 730   //     str q1, [sp]
 731   // Finish:
 732   //     ldr q0, [sp]
 733   //
 734   // Using virtual registers would probably not be beneficial since COPY
 735   // instructions are expensive for f128 (there's no actual instruction to
 736   // implement them).
 737   //
 738   // An alternative would be to do an integer-CSEL on some address. E.g.:
 739   //     mov x0, sp
 740   //     add x1, sp, #16
 741   //     str q0, [x0]
 742   //     str q1, [x1]
 743   //     csel x0, x0, x1, ne
 744   //     ldr q0, [x0]
 745   //
 746   // It's unclear which approach is actually optimal.
 747   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 748   MachineFunction *MF = MBB->getParent();
 749   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
 750   DebugLoc DL = MI->getDebugLoc();
 751   MachineFunction::iterator It = MBB;
 752   ++It;
 753
 754   unsigned DestReg = MI->getOperand(0).getReg();
 755   unsigned IfTrueReg = MI->getOperand(1).getReg();
 756   unsigned IfFalseReg = MI->getOperand(2).getReg();
 757   unsigned CondCode = MI->getOperand(3).getImm();
 758   bool NZCVKilled = MI->getOperand(4).isKill();
 759
 760   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
 761   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
 762   MF->insert(It, TrueBB);
 763   MF->insert(It, EndBB);
 764
 765   // Transfer rest of current basic-block to EndBB
 766   EndBB->splice(EndBB->begin(), MBB,
 767                 llvm::next(MachineBasicBlock::iterator(MI)),
 768                 MBB->end());
 769   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
 770
 771   // We need somewhere to store the f128 value needed.
 772   int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
 773
 774   //     [... start of incoming MBB ...]
 775   //     str qIFFALSE, [sp]
 776   //     b.cc IfTrue
 777   //     b Done
 778   BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
 779     .addReg(IfFalseReg)
 780     .addFrameIndex(ScratchFI)
 781     .addImm(0);
 782   BuildMI(MBB, DL, TII->get(AArch64::Bcc))
 783     .addImm(CondCode)
 784     .addMBB(TrueBB);
 785   BuildMI(MBB, DL, TII->get(AArch64::Bimm))
 786     .addMBB(EndBB);
 787   MBB->addSuccessor(TrueBB);
 788   MBB->addSuccessor(EndBB);
 789
 790   if (!NZCVKilled) {
 791     // NZCV is live-through TrueBB.
 792     TrueBB->addLiveIn(AArch64::NZCV);
 793     EndBB->addLiveIn(AArch64::NZCV);
 794   }
 795
 796   // IfTrue:
 797   //     str qIFTRUE, [sp]
 798   BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
 799     .addReg(IfTrueReg)
 800     .addFrameIndex(ScratchFI)
 801     .addImm(0);
 802
 803   // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
 804   // blocks.
 805   TrueBB->addSuccessor(EndBB);
 806
 807   // Done:
 808   //     ldr qDEST, [sp]
 809   //     [... rest of incoming MBB ...]
 810   MachineInstr *StartOfEnd = EndBB->begin();
 811   BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
 812     .addFrameIndex(ScratchFI)
 813     .addImm(0);
 814
 815   MI->eraseFromParent();
 816   return EndBB;
 817 }
 818
 819 MachineBasicBlock *
 820 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 821                                                  MachineBasicBlock *MBB) const {
 822   switch (MI->getOpcode()) {
 823   default: llvm_unreachable("Unhandled instruction with custom inserter");
 824   case AArch64::F128CSEL:
 825     return EmitF128CSEL(MI, MBB);
 826   case AArch64::ATOMIC_LOAD_ADD_I8:
 827     return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
 828   case AArch64::ATOMIC_LOAD_ADD_I16:
 829     return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
 830   case AArch64::ATOMIC_LOAD_ADD_I32:
 831     return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
 832   case AArch64::ATOMIC_LOAD_ADD_I64:
 833     return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
 834
 835   case AArch64::ATOMIC_LOAD_SUB_I8:
 836     return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
 837   case AArch64::ATOMIC_LOAD_SUB_I16:
 838     return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
 839   case AArch64::ATOMIC_LOAD_SUB_I32:
 840     return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
 841   case AArch64::ATOMIC_LOAD_SUB_I64:
 842     return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
 843
 844   case AArch64::ATOMIC_LOAD_AND_I8:
 845     return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
 846   case AArch64::ATOMIC_LOAD_AND_I16:
 847     return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
 848   case AArch64::ATOMIC_LOAD_AND_I32:
 849     return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
 850   case AArch64::ATOMIC_LOAD_AND_I64:
 851     return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
 852
 853   case AArch64::ATOMIC_LOAD_OR_I8:
 854     return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
 855   case AArch64::ATOMIC_LOAD_OR_I16:
 856     return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
 857   case AArch64::ATOMIC_LOAD_OR_I32:
 858     return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
 859   case AArch64::ATOMIC_LOAD_OR_I64:
 860     return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
 861
 862   case AArch64::ATOMIC_LOAD_XOR_I8:
 863     return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
 864   case AArch64::ATOMIC_LOAD_XOR_I16:
 865     return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
 866   case AArch64::ATOMIC_LOAD_XOR_I32:
 867     return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
 868   case AArch64::ATOMIC_LOAD_XOR_I64:
 869     return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
 870
 871   case AArch64::ATOMIC_LOAD_NAND_I8:
 872     return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
 873   case AArch64::ATOMIC_LOAD_NAND_I16:
 874     return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
 875   case AArch64::ATOMIC_LOAD_NAND_I32:
 876     return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
 877   case AArch64::ATOMIC_LOAD_NAND_I64:
 878     return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
 879
 880   case AArch64::ATOMIC_LOAD_MIN_I8:
 881     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
 882   case AArch64::ATOMIC_LOAD_MIN_I16:
 883     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
 884   case AArch64::ATOMIC_LOAD_MIN_I32:
 885     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
 886   case AArch64::ATOMIC_LOAD_MIN_I64:
 887     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
 888
 889   case AArch64::ATOMIC_LOAD_MAX_I8:
 890     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
 891   case AArch64::ATOMIC_LOAD_MAX_I16:
 892     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
 893   case AArch64::ATOMIC_LOAD_MAX_I32:
 894     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
 895   case AArch64::ATOMIC_LOAD_MAX_I64:
 896     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
 897
 898   case AArch64::ATOMIC_LOAD_UMIN_I8:
 899     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
 900   case AArch64::ATOMIC_LOAD_UMIN_I16:
 901     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
 902   case AArch64::ATOMIC_LOAD_UMIN_I32:
 903     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
 904   case AArch64::ATOMIC_LOAD_UMIN_I64:
 905     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
 906
 907   case AArch64::ATOMIC_LOAD_UMAX_I8:
 908     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
 909   case AArch64::ATOMIC_LOAD_UMAX_I16:
 910     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
 911   case AArch64::ATOMIC_LOAD_UMAX_I32:
 912     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
 913   case AArch64::ATOMIC_LOAD_UMAX_I64:
 914     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
 915
 916   case AArch64::ATOMIC_SWAP_I8:
 917     return emitAtomicBinary(MI, MBB, 1, 0);
 918   case AArch64::ATOMIC_SWAP_I16:
 919     return emitAtomicBinary(MI, MBB, 2, 0);
 920   case AArch64::ATOMIC_SWAP_I32:
 921     return emitAtomicBinary(MI, MBB, 4, 0);
 922   case AArch64::ATOMIC_SWAP_I64:
 923     return emitAtomicBinary(MI, MBB, 8, 0);
 924
 925   case AArch64::ATOMIC_CMP_SWAP_I8:
 926     return emitAtomicCmpSwap(MI, MBB, 1);
 927   case AArch64::ATOMIC_CMP_SWAP_I16:
 928     return emitAtomicCmpSwap(MI, MBB, 2);
 929   case AArch64::ATOMIC_CMP_SWAP_I32:
 930     return emitAtomicCmpSwap(MI, MBB, 4);
 931   case AArch64::ATOMIC_CMP_SWAP_I64:
 932     return emitAtomicCmpSwap(MI, MBB, 8);
 933   }
 934 }
 935
 936
 937 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
 938   switch (Opcode) {
 939   case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
 940   case AArch64ISD::Call:           return "AArch64ISD::Call";
 941   case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
 942   case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
 943   case AArch64ISD::BFI:            return "AArch64ISD::BFI";
 944   case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
 945   case AArch64ISD::Ret:            return "AArch64ISD::Ret";
 946   case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
 947   case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
 948   case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
 949   case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
 950   case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
 951   case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
 952   case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
 953   case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
 954
 955   case AArch64ISD::NEON_MOVIMM:
 956     return "AArch64ISD::NEON_MOVIMM";
 957   case AArch64ISD::NEON_MVNIMM:
 958     return "AArch64ISD::NEON_MVNIMM";
 959   case AArch64ISD::NEON_FMOVIMM:
 960     return "AArch64ISD::NEON_FMOVIMM";
 961   case AArch64ISD::NEON_CMP:
 962     return "AArch64ISD::NEON_CMP";
 963   case AArch64ISD::NEON_CMPZ:
 964     return "AArch64ISD::NEON_CMPZ";
 965   case AArch64ISD::NEON_TST:
 966     return "AArch64ISD::NEON_TST";
 967   case AArch64ISD::NEON_QSHLs:
 968     return "AArch64ISD::NEON_QSHLs";
 969   case AArch64ISD::NEON_QSHLu:
 970     return "AArch64ISD::NEON_QSHLu";
 971   case AArch64ISD::NEON_VDUP:
 972     return "AArch64ISD::NEON_VDUP";
 973   case AArch64ISD::NEON_VDUPLANE:
 974     return "AArch64ISD::NEON_VDUPLANE";
 975   case AArch64ISD::NEON_REV16:
 976     return "AArch64ISD::NEON_REV16";
 977   case AArch64ISD::NEON_REV32:
 978     return "AArch64ISD::NEON_REV32";
 979   case AArch64ISD::NEON_REV64:
 980     return "AArch64ISD::NEON_REV64";
 981   case AArch64ISD::NEON_UZP1:
 982     return "AArch64ISD::NEON_UZP1";
 983   case AArch64ISD::NEON_UZP2:
 984     return "AArch64ISD::NEON_UZP2";
 985   case AArch64ISD::NEON_ZIP1:
 986     return "AArch64ISD::NEON_ZIP1";
 987   case AArch64ISD::NEON_ZIP2:
 988     return "AArch64ISD::NEON_ZIP2";
 989   case AArch64ISD::NEON_TRN1:
 990     return "AArch64ISD::NEON_TRN1";
 991   case AArch64ISD::NEON_TRN2:
 992     return "AArch64ISD::NEON_TRN2";
 993   case AArch64ISD::NEON_LD1_UPD:
 994     return "AArch64ISD::NEON_LD1_UPD";
 995   case AArch64ISD::NEON_LD2_UPD:
 996     return "AArch64ISD::NEON_LD2_UPD";
 997   case AArch64ISD::NEON_LD3_UPD:
 998     return "AArch64ISD::NEON_LD3_UPD";
 999   case AArch64ISD::NEON_LD4_UPD:
1000     return "AArch64ISD::NEON_LD4_UPD";
1001   case AArch64ISD::NEON_ST1_UPD:
1002     return "AArch64ISD::NEON_ST1_UPD";
1003   case AArch64ISD::NEON_ST2_UPD:
1004     return "AArch64ISD::NEON_ST2_UPD";
1005   case AArch64ISD::NEON_ST3_UPD:
1006     return "AArch64ISD::NEON_ST3_UPD";
1007   case AArch64ISD::NEON_ST4_UPD:
1008     return "AArch64ISD::NEON_ST4_UPD";
1009   case AArch64ISD::NEON_LD1x2_UPD:
1010     return "AArch64ISD::NEON_LD1x2_UPD";
1011   case AArch64ISD::NEON_LD1x3_UPD:
1012     return "AArch64ISD::NEON_LD1x3_UPD";
1013   case AArch64ISD::NEON_LD1x4_UPD:
1014     return "AArch64ISD::NEON_LD1x4_UPD";
1015   case AArch64ISD::NEON_ST1x2_UPD:
1016     return "AArch64ISD::NEON_ST1x2_UPD";
1017   case AArch64ISD::NEON_ST1x3_UPD:
1018     return "AArch64ISD::NEON_ST1x3_UPD";
1019   case AArch64ISD::NEON_ST1x4_UPD:
1020     return "AArch64ISD::NEON_ST1x4_UPD";
1021   case AArch64ISD::NEON_LD2DUP:
1022     return "AArch64ISD::NEON_LD2DUP";
1023   case AArch64ISD::NEON_LD3DUP:
1024     return "AArch64ISD::NEON_LD3DUP";
1025   case AArch64ISD::NEON_LD4DUP:
1026     return "AArch64ISD::NEON_LD4DUP";
1027   case AArch64ISD::NEON_LD2DUP_UPD:
1028     return "AArch64ISD::NEON_LD2DUP_UPD";
1029   case AArch64ISD::NEON_LD3DUP_UPD:
1030     return "AArch64ISD::NEON_LD3DUP_UPD";
1031   case AArch64ISD::NEON_LD4DUP_UPD:
1032     return "AArch64ISD::NEON_LD4DUP_UPD";
1033   case AArch64ISD::NEON_LD2LN_UPD:
1034     return "AArch64ISD::NEON_LD2LN_UPD";
1035   case AArch64ISD::NEON_LD3LN_UPD:
1036     return "AArch64ISD::NEON_LD3LN_UPD";
1037   case AArch64ISD::NEON_LD4LN_UPD:
1038     return "AArch64ISD::NEON_LD4LN_UPD";
1039   case AArch64ISD::NEON_ST2LN_UPD:
1040     return "AArch64ISD::NEON_ST2LN_UPD";
1041   case AArch64ISD::NEON_ST3LN_UPD:
1042     return "AArch64ISD::NEON_ST3LN_UPD";
1043   case AArch64ISD::NEON_ST4LN_UPD:
1044     return "AArch64ISD::NEON_ST4LN_UPD";
1045   case AArch64ISD::NEON_VEXTRACT:
1046     return "AArch64ISD::NEON_VEXTRACT";
1047   default:
1048     return NULL;
1049   }
1050 }
1051
// The eight 128-bit FP/SIMD registers (Q0-Q7) scanned, in this order, when
// working out which FP argument registers are still unallocated for a
// variadic function (see SaveVarArgRegisters).
static const uint16_t AArch64FPRArgRegs[] = {
  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
};
// Number of entries in AArch64FPRArgRegs.
static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);

// The eight 64-bit general-purpose registers (X0-X7). Used by
// CC_AArch64NoMoreRegs to mark every one of them as allocated, and by
// SaveVarArgRegisters to find the first unallocated one.
static const uint16_t AArch64ArgRegs[] = {
  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
};
// Number of entries in AArch64ArgRegs.
static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
1063
1064 static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
1065                                  CCValAssign::LocInfo LocInfo,
1066                                  ISD::ArgFlagsTy ArgFlags, CCState &State) {
1067   // Mark all remaining general purpose registers as allocated. We don't
1068   // backtrack: if (for example) an i128 gets put on the stack, no subsequent
1069   // i64 will go in registers (C.11).
1070   for (unsigned i = 0; i < NumArgRegs; ++i)
1071     State.AllocateReg(AArch64ArgRegs[i]);
1072
1073   return false;
1074 }
1075
1076 #include "AArch64GenCallingConv.inc"
1077
1078 CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1079
1080   switch(CC) {
1081   default: llvm_unreachable("Unsupported calling convention");
1082   case CallingConv::Fast:
1083   case CallingConv::C:
1084     return CC_A64_APCS;
1085   }
1086 }
1087
1088 void
1089 AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
1090                                            SDLoc DL, SDValue &Chain) const {
1091   MachineFunction &MF = DAG.getMachineFunction();
1092   MachineFrameInfo *MFI = MF.getFrameInfo();
1093   AArch64MachineFunctionInfo *FuncInfo
1094     = MF.getInfo<AArch64MachineFunctionInfo>();
1095
1096   SmallVector<SDValue, 8> MemOps;
1097
1098   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
1099                                                          NumArgRegs);
1100   unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
1101                                                          NumFPRArgRegs);
1102
1103   unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
1104   int GPRIdx = 0;
1105   if (GPRSaveSize != 0) {
1106     GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
1107
1108     SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
1109
1110     for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
1111       unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
1112       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
1113       SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1114                                    MachinePointerInfo::getStack(i * 8),
1115                                    false, false, 0);
1116       MemOps.push_back(Store);
1117       FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1118                         DAG.getConstant(8, getPointerTy()));
1119     }
1120   }
1121
1122   if (getSubtarget()->hasFPARMv8()) {
1123   unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
1124   int FPRIdx = 0;
1125     // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
1126     // can omit a register save area if we know we'll never use registers of
1127     // that class.
1128     if (FPRSaveSize != 0) {
1129       FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
1130
1131       SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
1132
1133       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
1134         unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
1135             &AArch64::FPR128RegClass);
1136         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
1137         SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1138             MachinePointerInfo::getStack(i * 16),
1139             false, false, 0);
1140         MemOps.push_back(Store);
1141         FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1142             DAG.getConstant(16, getPointerTy()));
1143       }
1144     }
1145     FuncInfo->setVariadicFPRIdx(FPRIdx);
1146     FuncInfo->setVariadicFPRSize(FPRSaveSize);
1147   }
1148
1149   int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
1150
1151   FuncInfo->setVariadicStackIdx(StackIdx);
1152   FuncInfo->setVariadicGPRIdx(GPRIdx);
1153   FuncInfo->setVariadicGPRSize(GPRSaveSize);
1154
1155   if (!MemOps.empty()) {
1156     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
1157                         MemOps.size());
1158   }
1159 }
1160
1161
1162 SDValue
1163 AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
1164                                       CallingConv::ID CallConv, bool isVarArg,
1165                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1166                                       SDLoc dl, SelectionDAG &DAG,
1167                                       SmallVectorImpl<SDValue> &InVals) const {
1168   MachineFunction &MF = DAG.getMachineFunction();
1169   AArch64MachineFunctionInfo *FuncInfo
1170     = MF.getInfo<AArch64MachineFunctionInfo>();
1171   MachineFrameInfo *MFI = MF.getFrameInfo();
1172   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1173
1174   SmallVector<CCValAssign, 16> ArgLocs;
1175   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1176                  getTargetMachine(), ArgLocs, *DAG.getContext());
1177   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1178
1179   SmallVector<SDValue, 16> ArgValues;
1180
1181   SDValue ArgValue;
1182   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1183     CCValAssign &VA = ArgLocs[i];
1184     ISD::ArgFlagsTy Flags = Ins[i].Flags;
1185
1186     if (Flags.isByVal()) {
1187       // Byval is used for small structs and HFAs in the PCS, but the system
1188       // should work in a non-compliant manner for larger structs.
1189       EVT PtrTy = getPointerTy();
1190       int Size = Flags.getByValSize();
1191       unsigned NumRegs = (Size + 7) / 8;
1192
1193       unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
1194                                                  VA.getLocMemOffset(),
1195                                                  false);
1196       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
1197       InVals.push_back(FrameIdxN);
1198
1199       continue;
1200     } else if (VA.isRegLoc()) {
1201       MVT RegVT = VA.getLocVT();
1202       const TargetRegisterClass *RC = getRegClassFor(RegVT);
1203       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1204
1205       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1206     } else { // VA.isRegLoc()
1207       assert(VA.isMemLoc());
1208
1209       int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
1210                                       VA.getLocMemOffset(), true);
1211
1212       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1213       ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
1214                              MachinePointerInfo::getFixedStack(FI),
1215                              false, false, false, 0);
1216
1217
1218     }
1219
1220     switch (VA.getLocInfo()) {
1221     default: llvm_unreachable("Unknown loc info!");
1222     case CCValAssign::Full: break;
1223     case CCValAssign::BCvt:
1224       ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
1225       break;
1226     case CCValAssign::SExt:
1227     case CCValAssign::ZExt:
1228     case CCValAssign::AExt:
1229     case CCValAssign::FPExt: {
1230       unsigned DestSize = VA.getValVT().getSizeInBits();
1231       unsigned DestSubReg;
1232
1233       switch (DestSize) {
1234       case 8: DestSubReg = AArch64::sub_8; break;
1235       case 16: DestSubReg = AArch64::sub_16; break;
1236       case 32: DestSubReg = AArch64::sub_32; break;
1237       case 64: DestSubReg = AArch64::sub_64; break;
1238       default: llvm_unreachable("Unexpected argument promotion");
1239       }
1240
1241       ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
1242                                    VA.getValVT(), ArgValue,
1243                                    DAG.getTargetConstant(DestSubReg, MVT::i32)),
1244                          0);
1245       break;
1246     }
1247     }
1248
1249     InVals.push_back(ArgValue);
1250   }
1251
1252   if (isVarArg)
1253     SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
1254
1255   unsigned StackArgSize = CCInfo.getNextStackOffset();
1256   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
1257     // This is a non-standard ABI so by fiat I say we're allowed to make full
1258     // use of the stack area to be popped, which must be aligned to 16 bytes in
1259     // any case:
1260     StackArgSize = RoundUpToAlignment(StackArgSize, 16);
1261
1262     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
1263     // a multiple of 16.
1264     FuncInfo->setArgumentStackToRestore(StackArgSize);
1265
1266     // This realignment carries over to the available bytes below. Our own
1267     // callers will guarantee the space is free by giving an aligned value to
1268     // CALLSEQ_START.
1269   }
1270   // Even if we're not expected to free up the space, it's useful to know how
1271   // much is there while considering tail calls (because we can reuse it).
1272   FuncInfo->setBytesInStackArgArea(StackArgSize);
1273
1274   return Chain;
1275 }
1276
1277 SDValue
1278 AArch64TargetLowering::LowerReturn(SDValue Chain,
1279                                    CallingConv::ID CallConv, bool isVarArg,
1280                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1281                                    const SmallVectorImpl<SDValue> &OutVals,
1282                                    SDLoc dl, SelectionDAG &DAG) const {
1283   // CCValAssign - represent the assignment of the return value to a location.
1284   SmallVector<CCValAssign, 16> RVLocs;
1285
1286   // CCState - Info about the registers and stack slots.
1287   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1288                  getTargetMachine(), RVLocs, *DAG.getContext());
1289
1290   // Analyze outgoing return values.
1291   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
1292
1293   SDValue Flag;
1294   SmallVector<SDValue, 4> RetOps(1, Chain);
1295
1296   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1297     // PCS: "If the type, T, of the result of a function is such that
1298     // void func(T arg) would require that arg be passed as a value in a
1299     // register (or set of registers) according to the rules in 5.4, then the
1300     // result is returned in the same registers as would be used for such an
1301     // argument.
1302     //
1303     // Otherwise, the caller shall reserve a block of memory of sufficient
1304     // size and alignment to hold the result. The address of the memory block
1305     // shall be passed as an additional argument to the function in x8."
1306     //
1307     // This is implemented in two places. The register-return values are dealt
1308     // with here, more complex returns are passed as an sret parameter, which
1309     // means we don't have to worry about it during actual return.
1310     CCValAssign &VA = RVLocs[i];
1311     assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
1312
1313
1314     SDValue Arg = OutVals[i];
1315
1316     // There's no convenient note in the ABI about this as there is for normal
1317     // arguments, but it says return values are passed in the same registers as
1318     // an argument would be. I believe that includes the comments about
1319     // unspecified higher bits, putting the burden of widening on the *caller*
1320     // for return values.
1321     switch (VA.getLocInfo()) {
1322     default: llvm_unreachable("Unknown loc info");
1323     case CCValAssign::Full: break;
1324     case CCValAssign::SExt:
1325     case CCValAssign::ZExt:
1326     case CCValAssign::AExt:
1327       // Floating-point values should only be extended when they're going into
1328       // memory, which can't happen here so an integer extend is acceptable.
1329       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1330       break;
1331     case CCValAssign::BCvt:
1332       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1333       break;
1334     }
1335
1336     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
1337     Flag = Chain.getValue(1);
1338     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1339   }
1340
1341   RetOps[0] = Chain;  // Update chain.
1342
1343   // Add the flag if we have it.
1344   if (Flag.getNode())
1345     RetOps.push_back(Flag);
1346
1347   return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
1348                      &RetOps[0], RetOps.size());
1349 }
1350
1351 unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const {
1352   // This is a new backend. For anything more precise than this a FE should
1353   // set an explicit alignment.
1354   return 4;
1355 }
1356
1357 SDValue
1358 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
1359                                  SmallVectorImpl<SDValue> &InVals) const {
1360   SelectionDAG &DAG                     = CLI.DAG;
1361   SDLoc &dl                             = CLI.DL;
1362   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1363   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1364   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1365   SDValue Chain                         = CLI.Chain;
1366   SDValue Callee                        = CLI.Callee;
1367   bool &IsTailCall                      = CLI.IsTailCall;
1368   CallingConv::ID CallConv              = CLI.CallConv;
1369   bool IsVarArg                         = CLI.IsVarArg;
1370
1371   MachineFunction &MF = DAG.getMachineFunction();
1372   AArch64MachineFunctionInfo *FuncInfo
1373     = MF.getInfo<AArch64MachineFunctionInfo>();
1374   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1375   bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
1376   bool IsSibCall = false;
1377
1378   if (IsTailCall) {
1379     IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1380                     IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1381                                                    Outs, OutVals, Ins, DAG);
1382
1383     // A sibling call is one where we're under the usual C ABI and not planning
1384     // to change that but can still do a tail call:
1385     if (!TailCallOpt && IsTailCall)
1386       IsSibCall = true;
1387   }
1388
1389   SmallVector<CCValAssign, 16> ArgLocs;
1390   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1391                  getTargetMachine(), ArgLocs, *DAG.getContext());
1392   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1393
1394   // On AArch64 (and all other architectures I'm aware of) the most this has to
1395   // do is adjust the stack pointer.
1396   unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
1397   if (IsSibCall) {
1398     // Since we're not changing the ABI to make this a tail call, the memory
1399     // operands are already available in the caller's incoming argument space.
1400     NumBytes = 0;
1401   }
1402
1403   // FPDiff is the byte offset of the call's argument area from the callee's.
1404   // Stores to callee stack arguments will be placed in FixedStackSlots offset
1405   // by this amount for a tail call. In a sibling call it must be 0 because the
1406   // caller will deallocate the entire stack and the callee still expects its
1407   // arguments to begin at SP+0. Completely unused for non-tail calls.
1408   int FPDiff = 0;
1409
1410   if (IsTailCall && !IsSibCall) {
1411     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1412
1413     // FPDiff will be negative if this tail call requires more space than we
1414     // would automatically have in our incoming argument space. Positive if we
1415     // can actually shrink the stack.
1416     FPDiff = NumReusableBytes - NumBytes;
1417
1418     // The stack pointer must be 16-byte aligned at all times it's used for a
1419     // memory operation, which in practice means at *all* times and in
1420     // particular across call boundaries. Therefore our own arguments started at
1421     // a 16-byte aligned SP and the delta applied for the tail call should
1422     // satisfy the same constraint.
1423     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
1424   }
1425
1426   if (!IsSibCall)
1427     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
1428                                  dl);
1429
1430   SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
1431                                         getPointerTy());
1432
1433   SmallVector<SDValue, 8> MemOpChains;
1434   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1435
1436   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1437     CCValAssign &VA = ArgLocs[i];
1438     ISD::ArgFlagsTy Flags = Outs[i].Flags;
1439     SDValue Arg = OutVals[i];
1440
1441     // Callee does the actual widening, so all extensions just use an implicit
1442     // definition of the rest of the Loc. Aesthetically, this would be nicer as
1443     // an ANY_EXTEND, but that isn't valid for floating-point types and this
1444     // alternative works on integer types too.
1445     switch (VA.getLocInfo()) {
1446     default: llvm_unreachable("Unknown loc info!");
1447     case CCValAssign::Full: break;
1448     case CCValAssign::SExt:
1449     case CCValAssign::ZExt:
1450     case CCValAssign::AExt:
1451     case CCValAssign::FPExt: {
1452       unsigned SrcSize = VA.getValVT().getSizeInBits();
1453       unsigned SrcSubReg;
1454
1455       switch (SrcSize) {
1456       case 8: SrcSubReg = AArch64::sub_8; break;
1457       case 16: SrcSubReg = AArch64::sub_16; break;
1458       case 32: SrcSubReg = AArch64::sub_32; break;
1459       case 64: SrcSubReg = AArch64::sub_64; break;
1460       default: llvm_unreachable("Unexpected argument promotion");
1461       }
1462
1463       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
1464                                     VA.getLocVT(),
1465                                     DAG.getUNDEF(VA.getLocVT()),
1466                                     Arg,
1467                                     DAG.getTargetConstant(SrcSubReg, MVT::i32)),
1468                     0);
1469
1470       break;
1471     }
1472     case CCValAssign::BCvt:
1473       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1474       break;
1475     }
1476
1477     if (VA.isRegLoc()) {
1478       // A normal register (sub-) argument. For now we just note it down because
1479       // we want to copy things into registers as late as possible to avoid
1480       // register-pressure (and possibly worse).
1481       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1482       continue;
1483     }
1484
1485     assert(VA.isMemLoc() && "unexpected argument location");
1486
1487     SDValue DstAddr;
1488     MachinePointerInfo DstInfo;
1489     if (IsTailCall) {
1490       uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
1491                                           VA.getLocVT().getSizeInBits();
1492       OpSize = (OpSize + 7) / 8;
1493       int32_t Offset = VA.getLocMemOffset() + FPDiff;
1494       int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
1495
1496       DstAddr = DAG.getFrameIndex(FI, getPointerTy());
1497       DstInfo = MachinePointerInfo::getFixedStack(FI);
1498
1499       // Make sure any stack arguments overlapping with where we're storing are
1500       // loaded before this eventual operation. Otherwise they'll be clobbered.
1501       Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
1502     } else {
1503       SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());
1504
1505       DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1506       DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
1507     }
1508
1509     if (Flags.isByVal()) {
1510       SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
1511       SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
1512                                   Flags.getByValAlign(),
1513                                   /*isVolatile = */ false,
1514                                   /*alwaysInline = */ false,
1515                                   DstInfo, MachinePointerInfo(0));
1516       MemOpChains.push_back(Cpy);
1517     } else {
1518       // Normal stack argument, put it where it's needed.
1519       SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
1520                                    false, false, 0);
1521       MemOpChains.push_back(Store);
1522     }
1523   }
1524
1525   // The loads and stores generated above shouldn't clash with each
1526   // other. Combining them with this TokenFactor notes that fact for the rest of
1527   // the backend.
1528   if (!MemOpChains.empty())
1529     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1530                         &MemOpChains[0], MemOpChains.size());
1531
1532   // Most of the rest of the instructions need to be glued together; we don't
1533   // want assignments to actual registers used by a call to be rearranged by a
1534   // well-meaning scheduler.
1535   SDValue InFlag;
1536
1537   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1538     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1539                              RegsToPass[i].second, InFlag);
1540     InFlag = Chain.getValue(1);
1541   }
1542
1543   // The linker is responsible for inserting veneers when necessary to put a
1544   // function call destination in range, so we don't need to bother with a
1545   // wrapper here.
1546   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1547     const GlobalValue *GV = G->getGlobal();
1548     Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
1549   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1550     const char *Sym = S->getSymbol();
1551     Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
1552   }
1553
1554   // We don't usually want to end the call-sequence here because we would tidy
1555   // the frame up *after* the call, however in the ABI-changing tail-call case
1556   // we've carefully laid out the parameters so that when sp is reset they'll be
1557   // in the correct location.
1558   if (IsTailCall && !IsSibCall) {
1559     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1560                                DAG.getIntPtrConstant(0, true), InFlag, dl);
1561     InFlag = Chain.getValue(1);
1562   }
1563
1564   // We produce the following DAG scheme for the actual call instruction:
1565   //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?
1566   //
1567   // Most arguments aren't going to be used and just keep the values live as
1568   // far as LLVM is concerned. It's expected to be selected as simply "bl
1569   // callee" (for a direct, non-tail call).
1570   std::vector<SDValue> Ops;
1571   Ops.push_back(Chain);
1572   Ops.push_back(Callee);
1573
1574   if (IsTailCall) {
1575     // Each tail call may have to adjust the stack by a different amount, so
1576     // this information must travel along with the operation for eventual
1577     // consumption by emitEpilogue.
1578     Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
1579   }
1580
1581   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1582     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1583                                   RegsToPass[i].second.getValueType()));
1584
1585
1586   // Add a register mask operand representing the call-preserved registers. This
1587   // is used later in codegen to constrain register-allocation.
1588   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1589   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
1590   assert(Mask && "Missing call preserved mask for calling convention");
1591   Ops.push_back(DAG.getRegisterMask(Mask));
1592
1593   // If we needed glue, put it in as the last argument.
1594   if (InFlag.getNode())
1595     Ops.push_back(InFlag);
1596
1597   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1598
1599   if (IsTailCall) {
1600     return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1601   }
1602
1603   Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
1604   InFlag = Chain.getValue(1);
1605
1606   // Now we can reclaim the stack, just as well do it before working out where
1607   // our return value is.
1608   if (!IsSibCall) {
1609     uint64_t CalleePopBytes
1610       = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
1611
1612     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1613                                DAG.getIntPtrConstant(CalleePopBytes, true),
1614                                InFlag, dl);
1615     InFlag = Chain.getValue(1);
1616   }
1617
1618   return LowerCallResult(Chain, InFlag, CallConv,
1619                          IsVarArg, Ins, dl, DAG, InVals);
1620 }
1621
1622 SDValue
1623 AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1624                                       CallingConv::ID CallConv, bool IsVarArg,
1625                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1626                                       SDLoc dl, SelectionDAG &DAG,
1627                                       SmallVectorImpl<SDValue> &InVals) const {
1628   // Assign locations to each value returned by this call.
1629   SmallVector<CCValAssign, 16> RVLocs;
1630   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1631                  getTargetMachine(), RVLocs, *DAG.getContext());
1632   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
1633
1634   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1635     CCValAssign VA = RVLocs[i];
1636
1637     // Return values that are too big to fit into registers should use an sret
1638     // pointer, so this can be a lot simpler than the main argument code.
1639     assert(VA.isRegLoc() && "Memory locations not expected for call return");
1640
1641     SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1642                                      InFlag);
1643     Chain = Val.getValue(1);
1644     InFlag = Val.getValue(2);
1645
1646     switch (VA.getLocInfo()) {
1647     default: llvm_unreachable("Unknown loc info!");
1648     case CCValAssign::Full: break;
1649     case CCValAssign::BCvt:
1650       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1651       break;
1652     case CCValAssign::ZExt:
1653     case CCValAssign::SExt:
1654     case CCValAssign::AExt:
1655       // Floating-point arguments only get extended/truncated if they're going
1656       // in memory, so using the integer operation is acceptable here.
1657       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1658       break;
1659     }
1660
1661     InVals.push_back(Val);
1662   }
1663
1664   return Chain;
1665 }
1666
1667 bool
1668 AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1669                                     CallingConv::ID CalleeCC,
1670                                     bool IsVarArg,
1671                                     bool IsCalleeStructRet,
1672                                     bool IsCallerStructRet,
1673                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
1674                                     const SmallVectorImpl<SDValue> &OutVals,
1675                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1676                                     SelectionDAG& DAG) const {
1677
1678   // For CallingConv::C this function knows whether the ABI needs
1679   // changing. That's not true for other conventions so they will have to opt in
1680   // manually.
1681   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1682     return false;
1683
1684   const MachineFunction &MF = DAG.getMachineFunction();
1685   const Function *CallerF = MF.getFunction();
1686   CallingConv::ID CallerCC = CallerF->getCallingConv();
1687   bool CCMatch = CallerCC == CalleeCC;
1688
1689   // Byval parameters hand the function a pointer directly into the stack area
1690   // we want to reuse during a tail call. Working around this *is* possible (see
1691   // X86) but less efficient and uglier in LowerCall.
1692   for (Function::const_arg_iterator i = CallerF->arg_begin(),
1693          e = CallerF->arg_end(); i != e; ++i)
1694     if (i->hasByValAttr())
1695       return false;
1696
1697   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
1698     if (IsTailCallConvention(CalleeCC) && CCMatch)
1699       return true;
1700     return false;
1701   }
1702
1703   // Now we search for cases where we can use a tail call without changing the
1704   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
1705   // concept.
1706
1707   // I want anyone implementing a new calling convention to think long and hard
1708   // about this assert.
1709   assert((!IsVarArg || CalleeCC == CallingConv::C)
1710          && "Unexpected variadic calling convention");
1711
1712   if (IsVarArg && !Outs.empty()) {
1713     // At least two cases here: if caller is fastcc then we can't have any
1714     // memory arguments (we'd be expected to clean up the stack afterwards). If
1715     // caller is C then we could potentially use its argument area.
1716
1717     // FIXME: for now we take the most conservative of these in both cases:
1718     // disallow all variadic memory operands.
1719     SmallVector<CCValAssign, 16> ArgLocs;
1720     CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1721                    getTargetMachine(), ArgLocs, *DAG.getContext());
1722
1723     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1724     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
1725       if (!ArgLocs[i].isRegLoc())
1726         return false;
1727   }
1728
1729   // If the calling conventions do not match, then we'd better make sure the
1730   // results are returned in the same way as what the caller expects.
1731   if (!CCMatch) {
1732     SmallVector<CCValAssign, 16> RVLocs1;
1733     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1734                     getTargetMachine(), RVLocs1, *DAG.getContext());
1735     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
1736
1737     SmallVector<CCValAssign, 16> RVLocs2;
1738     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1739                     getTargetMachine(), RVLocs2, *DAG.getContext());
1740     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
1741
1742     if (RVLocs1.size() != RVLocs2.size())
1743       return false;
1744     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1745       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1746         return false;
1747       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1748         return false;
1749       if (RVLocs1[i].isRegLoc()) {
1750         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1751           return false;
1752       } else {
1753         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1754           return false;
1755       }
1756     }
1757   }
1758
1759   // Nothing more to check if the callee is taking no arguments
1760   if (Outs.empty())
1761     return true;
1762
1763   SmallVector<CCValAssign, 16> ArgLocs;
1764   CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1765                  getTargetMachine(), ArgLocs, *DAG.getContext());
1766
1767   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1768
1769   const AArch64MachineFunctionInfo *FuncInfo
1770     = MF.getInfo<AArch64MachineFunctionInfo>();
1771
1772   // If the stack arguments for this call would fit into our own save area then
1773   // the call can be made tail.
1774   return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
1775 }
1776
1777 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
1778                                                    bool TailCallOpt) const {
1779   return CallCC == CallingConv::Fast && TailCallOpt;
1780 }
1781
1782 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
1783   return CallCC == CallingConv::Fast;
1784 }
1785
1786 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
1787                                                    SelectionDAG &DAG,
1788                                                    MachineFrameInfo *MFI,
1789                                                    int ClobberedFI) const {
1790   SmallVector<SDValue, 8> ArgChains;
1791   int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
1792   int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
1793
1794   // Include the original chain at the beginning of the list. When this is
1795   // used by target LowerCall hooks, this helps legalize find the
1796   // CALLSEQ_BEGIN node.
1797   ArgChains.push_back(Chain);
1798
1799   // Add a chain value for each stack argument corresponding
1800   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1801          UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
1802     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
1803       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
1804         if (FI->getIndex() < 0) {
1805           int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
1806           int64_t InLastByte = InFirstByte;
1807           InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
1808
1809           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1810               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1811             ArgChains.push_back(SDValue(L, 1));
1812         }
1813
1814    // Build a tokenfactor for all the chains.
1815    return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
1816                       &ArgChains[0], ArgChains.size());
1817 }
1818
1819 static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
1820   switch (CC) {
1821   case ISD::SETEQ:  return A64CC::EQ;
1822   case ISD::SETGT:  return A64CC::GT;
1823   case ISD::SETGE:  return A64CC::GE;
1824   case ISD::SETLT:  return A64CC::LT;
1825   case ISD::SETLE:  return A64CC::LE;
1826   case ISD::SETNE:  return A64CC::NE;
1827   case ISD::SETUGT: return A64CC::HI;
1828   case ISD::SETUGE: return A64CC::HS;
1829   case ISD::SETULT: return A64CC::LO;
1830   case ISD::SETULE: return A64CC::LS;
1831   default: llvm_unreachable("Unexpected condition code");
1832   }
1833 }
1834
1835 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
1836   // icmp is implemented using adds/subs immediate, which take an unsigned
1837   // 12-bit immediate, optionally shifted left by 12 bits.
1838
1839   // Symmetric by using adds/subs
1840   if (Val < 0)
1841     Val = -Val;
1842
1843   return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
1844 }
1845
1846 SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
1847                                         ISD::CondCode CC, SDValue &A64cc,
1848                                         SelectionDAG &DAG, SDLoc &dl) const {
1849   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1850     int64_t C = 0;
1851     EVT VT = RHSC->getValueType(0);
1852     bool knownInvalid = false;
1853
1854     // I'm not convinced the rest of LLVM handles these edge cases properly, but
1855     // we can at least get it right.
1856     if (isSignedIntSetCC(CC)) {
1857       C = RHSC->getSExtValue();
1858     } else if (RHSC->getZExtValue() > INT64_MAX) {
1859       // A 64-bit constant not representable by a signed 64-bit integer is far
1860       // too big to fit into a SUBS immediate anyway.
1861       knownInvalid = true;
1862     } else {
1863       C = RHSC->getZExtValue();
1864     }
1865
1866     if (!knownInvalid && !isLegalICmpImmediate(C)) {
1867       // Constant does not fit, try adjusting it by one?
1868       switch (CC) {
1869       default: break;
1870       case ISD::SETLT:
1871       case ISD::SETGE:
1872         if (isLegalICmpImmediate(C-1)) {
1873           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1874           RHS = DAG.getConstant(C-1, VT);
1875         }
1876         break;
1877       case ISD::SETULT:
1878       case ISD::SETUGE:
1879         if (isLegalICmpImmediate(C-1)) {
1880           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1881           RHS = DAG.getConstant(C-1, VT);
1882         }
1883         break;
1884       case ISD::SETLE:
1885       case ISD::SETGT:
1886         if (isLegalICmpImmediate(C+1)) {
1887           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1888           RHS = DAG.getConstant(C+1, VT);
1889         }
1890         break;
1891       case ISD::SETULE:
1892       case ISD::SETUGT:
1893         if (isLegalICmpImmediate(C+1)) {
1894           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1895           RHS = DAG.getConstant(C+1, VT);
1896         }
1897         break;
1898       }
1899     }
1900   }
1901
1902   A64CC::CondCodes CondCode = IntCCToA64CC(CC);
1903   A64cc = DAG.getConstant(CondCode, MVT::i32);
1904   return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
1905                      DAG.getCondCode(CC));
1906 }
1907
1908 static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
1909                                     A64CC::CondCodes &Alternative) {
1910   A64CC::CondCodes CondCode = A64CC::Invalid;
1911   Alternative = A64CC::Invalid;
1912
1913   switch (CC) {
1914   default: llvm_unreachable("Unknown FP condition!");
1915   case ISD::SETEQ:
1916   case ISD::SETOEQ: CondCode = A64CC::EQ; break;
1917   case ISD::SETGT:
1918   case ISD::SETOGT: CondCode = A64CC::GT; break;
1919   case ISD::SETGE:
1920   case ISD::SETOGE: CondCode = A64CC::GE; break;
1921   case ISD::SETOLT: CondCode = A64CC::MI; break;
1922   case ISD::SETOLE: CondCode = A64CC::LS; break;
1923   case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
1924   case ISD::SETO:   CondCode = A64CC::VC; break;
1925   case ISD::SETUO:  CondCode = A64CC::VS; break;
1926   case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
1927   case ISD::SETUGT: CondCode = A64CC::HI; break;
1928   case ISD::SETUGE: CondCode = A64CC::PL; break;
1929   case ISD::SETLT:
1930   case ISD::SETULT: CondCode = A64CC::LT; break;
1931   case ISD::SETLE:
1932   case ISD::SETULE: CondCode = A64CC::LE; break;
1933   case ISD::SETNE:
1934   case ISD::SETUNE: CondCode = A64CC::NE; break;
1935   }
1936   return CondCode;
1937 }
1938
1939 SDValue
1940 AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
1941   SDLoc DL(Op);
1942   EVT PtrVT = getPointerTy();
1943   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
1944
1945   switch(getTargetMachine().getCodeModel()) {
1946   case CodeModel::Small:
1947     // The most efficient code is PC-relative anyway for the small memory model,
1948     // so we don't need to worry about relocation model.
1949     return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
1950                        DAG.getTargetBlockAddress(BA, PtrVT, 0,
1951                                                  AArch64II::MO_NO_FLAG),
1952                        DAG.getTargetBlockAddress(BA, PtrVT, 0,
1953                                                  AArch64II::MO_LO12),
1954                        DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
1955   case CodeModel::Large:
1956     return DAG.getNode(
1957       AArch64ISD::WrapperLarge, DL, PtrVT,
1958       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
1959       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
1960       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
1961       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
1962   default:
1963     llvm_unreachable("Only small and large code models supported now");
1964   }
1965 }
1966
1967
1968 // (BRCOND chain, val, dest)
1969 SDValue
1970 AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1971   SDLoc dl(Op);
1972   SDValue Chain = Op.getOperand(0);
1973   SDValue TheBit = Op.getOperand(1);
1974   SDValue DestBB = Op.getOperand(2);
1975
1976   // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
1977   // that as the consumer we are responsible for ignoring rubbish in higher
1978   // bits.
1979   TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
1980                        DAG.getConstant(1, MVT::i32));
1981
1982   SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
1983                                DAG.getConstant(0, TheBit.getValueType()),
1984                                DAG.getCondCode(ISD::SETNE));
1985
1986   return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
1987                      A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
1988                      DestBB);
1989 }
1990
1991 // (BR_CC chain, condcode, lhs, rhs, dest)
1992 SDValue
1993 AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
1994   SDLoc dl(Op);
1995   SDValue Chain = Op.getOperand(0);
1996   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
1997   SDValue LHS = Op.getOperand(2);
1998   SDValue RHS = Op.getOperand(3);
1999   SDValue DestBB = Op.getOperand(4);
2000
2001   if (LHS.getValueType() == MVT::f128) {
2002     // f128 comparisons are lowered to runtime calls by a routine which sets
2003     // LHS, RHS and CC appropriately for the rest of this function to continue.
2004     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2005
2006     // If softenSetCCOperands returned a scalar, we need to compare the result
2007     // against zero to select between true and false values.
2008     if (RHS.getNode() == 0) {
2009       RHS = DAG.getConstant(0, LHS.getValueType());
2010       CC = ISD::SETNE;
2011     }
2012   }
2013
2014   if (LHS.getValueType().isInteger()) {
2015     SDValue A64cc;
2016
2017     // Integers are handled in a separate function because the combinations of
2018     // immediates and tests can get hairy and we may want to fiddle things.
2019     SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2020
2021     return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
2022                        Chain, CmpOp, A64cc, DestBB);
2023   }
2024
2025   // Note that some LLVM floating-point CondCodes can't be lowered to a single
2026   // conditional branch, hence FPCCToA64CC can set a second test, where either
2027   // passing is sufficient.
2028   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2029   CondCode = FPCCToA64CC(CC, Alternative);
2030   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2031   SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2032                               DAG.getCondCode(CC));
2033   SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
2034                                  Chain, SetCC, A64cc, DestBB);
2035
2036   if (Alternative != A64CC::Invalid) {
2037     A64cc = DAG.getConstant(Alternative, MVT::i32);
2038     A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
2039                            A64BR_CC, SetCC, A64cc, DestBB);
2040
2041   }
2042
2043   return A64BR_CC;
2044 }
2045
2046 SDValue
2047 AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
2048                                        RTLIB::Libcall Call) const {
2049   ArgListTy Args;
2050   ArgListEntry Entry;
2051   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
2052     EVT ArgVT = Op.getOperand(i).getValueType();
2053     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2054     Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
2055     Entry.isSExt = false;
2056     Entry.isZExt = false;
2057     Args.push_back(Entry);
2058   }
2059   SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
2060
2061   Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
2062
2063   // By default, the input chain to this libcall is the entry node of the
2064   // function. If the libcall is going to be emitted as a tail call then
2065   // isUsedByReturnOnly will change it to the right chain if the return
2066   // node which is being folded has a non-entry input chain.
2067   SDValue InChain = DAG.getEntryNode();
2068
2069   // isTailCall may be true since the callee does not reference caller stack
2070   // frame. Check if it's in the right position.
2071   SDValue TCChain = InChain;
2072   bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
2073   if (isTailCall)
2074     InChain = TCChain;
2075
2076   TargetLowering::
2077   CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
2078                     0, getLibcallCallingConv(Call), isTailCall,
2079                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
2080                     Callee, Args, DAG, SDLoc(Op));
2081   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
2082
2083   if (!CallInfo.second.getNode())
2084     // It's a tailcall, return the chain (which is the DAG root).
2085     return DAG.getRoot();
2086
2087   return CallInfo.first;
2088 }
2089
2090 SDValue
2091 AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
2092   if (Op.getOperand(0).getValueType() != MVT::f128) {
2093     // It's legal except when f128 is involved
2094     return Op;
2095   }
2096
2097   RTLIB::Libcall LC;
2098   LC  = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2099
2100   SDValue SrcVal = Op.getOperand(0);
2101   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
2102                      /*isSigned*/ false, SDLoc(Op)).first;
2103 }
2104
2105 SDValue
2106 AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
2107   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2108
2109   RTLIB::Libcall LC;
2110   LC  = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2111
2112   return LowerF128ToCall(Op, DAG, LC);
2113 }
2114
2115 SDValue
2116 AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2117                                       bool IsSigned) const {
2118   if (Op.getOperand(0).getValueType() != MVT::f128) {
2119     // It's legal except when f128 is involved
2120     return Op;
2121   }
2122
2123   RTLIB::Libcall LC;
2124   if (IsSigned)
2125     LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2126   else
2127     LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2128
2129   return LowerF128ToCall(Op, DAG, LC);
2130 }
2131
2132 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
2133   MachineFunction &MF = DAG.getMachineFunction();
2134   MachineFrameInfo *MFI = MF.getFrameInfo();
2135   MFI->setReturnAddressIsTaken(true);
2136
2137   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
2138     return SDValue();
2139
2140   EVT VT = Op.getValueType();
2141   SDLoc dl(Op);
2142   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2143   if (Depth) {
2144     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
2145     SDValue Offset = DAG.getConstant(8, MVT::i64);
2146     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
2147                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
2148                        MachinePointerInfo(), false, false, false, 0);
2149   }
2150
2151   // Return X30, which contains the return address. Mark it an implicit live-in.
2152   unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
2153   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
2154 }
2155
2156
2157 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
2158                                               const {
2159   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
2160   MFI->setFrameAddressIsTaken(true);
2161
2162   EVT VT = Op.getValueType();
2163   SDLoc dl(Op);
2164   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2165   unsigned FrameReg = AArch64::X29;
2166   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
2167   while (Depth--)
2168     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
2169                             MachinePointerInfo(),
2170                             false, false, false, 0);
2171   return FrameAddr;
2172 }
2173
2174 SDValue
2175 AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
2176                                                   SelectionDAG &DAG) const {
2177   assert(getTargetMachine().getCodeModel() == CodeModel::Large);
2178   assert(getTargetMachine().getRelocationModel() == Reloc::Static);
2179
2180   EVT PtrVT = getPointerTy();
2181   SDLoc dl(Op);
2182   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2183   const GlobalValue *GV = GN->getGlobal();
2184
2185   SDValue GlobalAddr = DAG.getNode(
2186       AArch64ISD::WrapperLarge, dl, PtrVT,
2187       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
2188       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
2189       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
2190       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
2191
2192   if (GN->getOffset() != 0)
2193     return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2194                        DAG.getConstant(GN->getOffset(), PtrVT));
2195
2196   return GlobalAddr;
2197 }
2198
2199 SDValue
2200 AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
2201                                                   SelectionDAG &DAG) const {
2202   assert(getTargetMachine().getCodeModel() == CodeModel::Small);
2203
2204   EVT PtrVT = getPointerTy();
2205   SDLoc dl(Op);
2206   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2207   const GlobalValue *GV = GN->getGlobal();
2208   unsigned Alignment = GV->getAlignment();
2209   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2210   if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
2211     // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
2212     // to zero when they remain undefined. In PIC mode the GOT can take care of
2213     // this, but in absolute mode we use a constant pool load.
2214     SDValue PoolAddr;
2215     PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2216                            DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2217                                                      AArch64II::MO_NO_FLAG),
2218                            DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2219                                                      AArch64II::MO_LO12),
2220                            DAG.getConstant(8, MVT::i32));
2221     SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
2222                                      MachinePointerInfo::getConstantPool(),
2223                                      /*isVolatile=*/ false,
2224                                      /*isNonTemporal=*/ true,
2225                                      /*isInvariant=*/ true, 8);
2226     if (GN->getOffset() != 0)
2227       return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2228                          DAG.getConstant(GN->getOffset(), PtrVT));
2229
2230     return GlobalAddr;
2231   }
2232
2233   if (Alignment == 0) {
2234     const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
2235     if (GVPtrTy->getElementType()->isSized()) {
2236       Alignment
2237         = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
2238     } else {
2239       // Be conservative if we can't guess, not that it really matters:
2240       // functions and labels aren't valid for loads, and the methods used to
2241       // actually calculate an address work with any alignment.
2242       Alignment = 1;
2243     }
2244   }
2245
2246   unsigned char HiFixup, LoFixup;
2247   bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
2248
2249   if (UseGOT) {
2250     HiFixup = AArch64II::MO_GOT;
2251     LoFixup = AArch64II::MO_GOT_LO12;
2252     Alignment = 8;
2253   } else {
2254     HiFixup = AArch64II::MO_NO_FLAG;
2255     LoFixup = AArch64II::MO_LO12;
2256   }
2257
2258   // AArch64's small model demands the following sequence:
2259   // ADRP x0, somewhere
2260   // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
2261   SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2262                                   DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2263                                                              HiFixup),
2264                                   DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2265                                                              LoFixup),
2266                                   DAG.getConstant(Alignment, MVT::i32));
2267
2268   if (UseGOT) {
2269     GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
2270                             GlobalRef);
2271   }
2272
2273   if (GN->getOffset() != 0)
2274     return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
2275                        DAG.getConstant(GN->getOffset(), PtrVT));
2276
2277   return GlobalRef;
2278 }
2279
2280 SDValue
2281 AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
2282                                              SelectionDAG &DAG) const {
2283   // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
2284   // we make those distinctions here.
2285
2286   switch (getTargetMachine().getCodeModel()) {
2287   case CodeModel::Small:
2288     return LowerGlobalAddressELFSmall(Op, DAG);
2289   case CodeModel::Large:
2290     return LowerGlobalAddressELFLarge(Op, DAG);
2291   default:
2292     llvm_unreachable("Only small and large code models supported now");
2293   }
2294 }
2295
2296 SDValue
2297 AArch64TargetLowering::LowerConstantPool(SDValue Op,
2298                                          SelectionDAG &DAG) const {
2299   SDLoc DL(Op);
2300   EVT PtrVT = getPointerTy();
2301   ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Op);
2302   const Constant *C = CN->getConstVal();
2303
2304   switch(getTargetMachine().getCodeModel()) {
2305   case CodeModel::Small:
2306     // The most efficient code is PC-relative anyway for the small memory model,
2307     // so we don't need to worry about relocation model.
2308     return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2309                        DAG.getTargetConstantPool(C, PtrVT, 0, 0,
2310                                                  AArch64II::MO_NO_FLAG),
2311                        DAG.getTargetConstantPool(C, PtrVT, 0, 0,
2312                                                  AArch64II::MO_LO12),
2313                        DAG.getConstant(CN->getAlignment(), MVT::i32));
2314   case CodeModel::Large:
2315     return DAG.getNode(
2316       AArch64ISD::WrapperLarge, DL, PtrVT,
2317       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
2318       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
2319       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
2320       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC));
2321   default:
2322     llvm_unreachable("Only small and large code models supported now");
2323   }
2324 }
2325
2326 SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
2327                                                 SDValue DescAddr,
2328                                                 SDLoc DL,
2329                                                 SelectionDAG &DAG) const {
2330   EVT PtrVT = getPointerTy();
2331
2332   // The function we need to call is simply the first entry in the GOT for this
2333   // descriptor, load it in preparation.
2334   SDValue Func, Chain;
2335   Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2336                      DescAddr);
2337
2338   // The function takes only one argument: the address of the descriptor itself
2339   // in X0.
2340   SDValue Glue;
2341   Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
2342   Glue = Chain.getValue(1);
2343
2344   // Finally, there's a special calling-convention which means that the lookup
2345   // must preserve all registers (except X0, obviously).
2346   const TargetRegisterInfo *TRI  = getTargetMachine().getRegisterInfo();
2347   const AArch64RegisterInfo *A64RI
2348     = static_cast<const AArch64RegisterInfo *>(TRI);
2349   const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
2350
2351   // We're now ready to populate the argument list, as with a normal call:
2352   std::vector<SDValue> Ops;
2353   Ops.push_back(Chain);
2354   Ops.push_back(Func);
2355   Ops.push_back(SymAddr);
2356   Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
2357   Ops.push_back(DAG.getRegisterMask(Mask));
2358   Ops.push_back(Glue);
2359
2360   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2361   Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
2362                       Ops.size());
2363   Glue = Chain.getValue(1);
2364
2365   // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
2366   // back to the generic handling code.
2367   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
2368 }
2369
2370 SDValue
2371 AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
2372                                              SelectionDAG &DAG) const {
2373   assert(getSubtarget()->isTargetELF() &&
2374          "TLS not implemented for non-ELF targets");
2375   assert(getTargetMachine().getCodeModel() == CodeModel::Small
2376          && "TLS only supported in small memory model");
2377   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2378
2379   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
2380
2381   SDValue TPOff;
2382   EVT PtrVT = getPointerTy();
2383   SDLoc DL(Op);
2384   const GlobalValue *GV = GA->getGlobal();
2385
2386   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
2387
2388   if (Model == TLSModel::InitialExec) {
2389     TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2390                         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2391                                                    AArch64II::MO_GOTTPREL),
2392                         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2393                                                    AArch64II::MO_GOTTPREL_LO12),
2394                         DAG.getConstant(8, MVT::i32));
2395     TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2396                         TPOff);
2397   } else if (Model == TLSModel::LocalExec) {
2398     SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2399                                                AArch64II::MO_TPREL_G1);
2400     SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2401                                                AArch64II::MO_TPREL_G0_NC);
2402
2403     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2404                                        DAG.getTargetConstant(1, MVT::i32)), 0);
2405     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2406                                        TPOff, LoVar,
2407                                        DAG.getTargetConstant(0, MVT::i32)), 0);
2408   } else if (Model == TLSModel::GeneralDynamic) {
2409     // Accesses used in this sequence go via the TLS descriptor which lives in
2410     // the GOT. Prepare an address we can use to handle this.
2411     SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2412                                                 AArch64II::MO_TLSDESC);
2413     SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2414                                                 AArch64II::MO_TLSDESC_LO12);
2415     SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2416                                    HiDesc, LoDesc,
2417                                    DAG.getConstant(8, MVT::i32));
2418     SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
2419
2420     TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2421   } else if (Model == TLSModel::LocalDynamic) {
2422     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
2423     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
2424     // the beginning of the module's TLS region, followed by a DTPREL offset
2425     // calculation.
2426
2427     // These accesses will need deduplicating if there's more than one.
2428     AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction()
2429       .getInfo<AArch64MachineFunctionInfo>();
2430     MFI->incNumLocalDynamicTLSAccesses();
2431
2432
2433     // Get the location of _TLS_MODULE_BASE_:
2434     SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2435                                                 AArch64II::MO_TLSDESC);
2436     SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2437                                                 AArch64II::MO_TLSDESC_LO12);
2438     SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2439                                    HiDesc, LoDesc,
2440                                    DAG.getConstant(8, MVT::i32));
2441     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
2442
2443     ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2444
2445     // Get the variable's offset from _TLS_MODULE_BASE_
2446     SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2447                                                AArch64II::MO_DTPREL_G1);
2448     SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2449                                                AArch64II::MO_DTPREL_G0_NC);
2450
2451     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2452                                        DAG.getTargetConstant(0, MVT::i32)), 0);
2453     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2454                                        TPOff, LoVar,
2455                                        DAG.getTargetConstant(0, MVT::i32)), 0);
2456   } else
2457       llvm_unreachable("Unsupported TLS access model");
2458
2459
2460   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
2461 }
2462
2463 SDValue
2464 AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2465                                       bool IsSigned) const {
2466   if (Op.getValueType() != MVT::f128) {
2467     // Legal for everything except f128.
2468     return Op;
2469   }
2470
2471   RTLIB::Libcall LC;
2472   if (IsSigned)
2473     LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2474   else
2475     LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2476
2477   return LowerF128ToCall(Op, DAG, LC);
2478 }
2479
2480
2481 SDValue
2482 AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
2483   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
2484   SDLoc dl(JT);
2485   EVT PtrVT = getPointerTy();
2486
2487   // When compiling PIC, jump tables get put in the code section so a static
2488   // relocation-style is acceptable for both cases.
2489   switch (getTargetMachine().getCodeModel()) {
2490   case CodeModel::Small:
2491     return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2492                        DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
2493                        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
2494                                               AArch64II::MO_LO12),
2495                        DAG.getConstant(1, MVT::i32));
2496   case CodeModel::Large:
2497     return DAG.getNode(
2498       AArch64ISD::WrapperLarge, dl, PtrVT,
2499       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
2500       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
2501       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
2502       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
2503   default:
2504     llvm_unreachable("Only small and large code models supported now");
2505   }
2506 }
2507
2508 // (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
2509 SDValue
2510 AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
2511   SDLoc dl(Op);
2512   SDValue LHS = Op.getOperand(0);
2513   SDValue RHS = Op.getOperand(1);
2514   SDValue IfTrue = Op.getOperand(2);
2515   SDValue IfFalse = Op.getOperand(3);
2516   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2517
2518   if (LHS.getValueType() == MVT::f128) {
2519     // f128 comparisons are lowered to libcalls, but slot in nicely here
2520     // afterwards.
2521     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2522
2523     // If softenSetCCOperands returned a scalar, we need to compare the result
2524     // against zero to select between true and false values.
2525     if (RHS.getNode() == 0) {
2526       RHS = DAG.getConstant(0, LHS.getValueType());
2527       CC = ISD::SETNE;
2528     }
2529   }
2530
2531   if (LHS.getValueType().isInteger()) {
2532     SDValue A64cc;
2533
2534     // Integers are handled in a separate function because the combinations of
2535     // immediates and tests can get hairy and we may want to fiddle things.
2536     SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2537
2538     return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2539                        CmpOp, IfTrue, IfFalse, A64cc);
2540   }
2541
2542   // Note that some LLVM floating-point CondCodes can't be lowered to a single
2543   // conditional branch, hence FPCCToA64CC can set a second test, where either
2544   // passing is sufficient.
2545   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2546   CondCode = FPCCToA64CC(CC, Alternative);
2547   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2548   SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2549                               DAG.getCondCode(CC));
2550   SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
2551                                      Op.getValueType(),
2552                                      SetCC, IfTrue, IfFalse, A64cc);
2553
2554   if (Alternative != A64CC::Invalid) {
2555     A64cc = DAG.getConstant(Alternative, MVT::i32);
2556     A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2557                                SetCC, IfTrue, A64SELECT_CC, A64cc);
2558
2559   }
2560
2561   return A64SELECT_CC;
2562 }
2563
2564 // (SELECT testbit, iftrue, iffalse)
2565 SDValue
2566 AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2567   SDLoc dl(Op);
2568   SDValue TheBit = Op.getOperand(0);
2569   SDValue IfTrue = Op.getOperand(1);
2570   SDValue IfFalse = Op.getOperand(2);
2571
2572   // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
2573   // that as the consumer we are responsible for ignoring rubbish in higher
2574   // bits.
2575   TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
2576                        DAG.getConstant(1, MVT::i32));
2577   SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
2578                                DAG.getConstant(0, TheBit.getValueType()),
2579                                DAG.getCondCode(ISD::SETNE));
2580
2581   return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2582                      A64CMP, IfTrue, IfFalse,
2583                      DAG.getConstant(A64CC::NE, MVT::i32));
2584 }
2585
2586 static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
2587   SDLoc DL(Op);
2588   SDValue LHS = Op.getOperand(0);
2589   SDValue RHS = Op.getOperand(1);
2590   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2591   EVT VT = Op.getValueType();
2592   bool Invert = false;
2593   SDValue Op0, Op1;
2594   unsigned Opcode;
2595
2596   if (LHS.getValueType().isInteger()) {
2597
2598     // Attempt to use Vector Integer Compare Mask Test instruction.
2599     // TST = icmp ne (and (op0, op1), zero).
2600     if (CC == ISD::SETNE) {
2601       if (((LHS.getOpcode() == ISD::AND) &&
2602            ISD::isBuildVectorAllZeros(RHS.getNode())) ||
2603           ((RHS.getOpcode() == ISD::AND) &&
2604            ISD::isBuildVectorAllZeros(LHS.getNode()))) {
2605
2606         SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
2607         SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
2608         SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
2609         return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
2610       }
2611     }
2612
2613     // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
2614     // Note: Compare against Zero does not support unsigned predicates.
2615     if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2616          ISD::isBuildVectorAllZeros(LHS.getNode())) &&
2617         !isUnsignedIntSetCC(CC)) {
2618
2619       // If LHS is the zero value, swap operands and CondCode.
2620       if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2621         CC = getSetCCSwappedOperands(CC);
2622         Op0 = RHS;
2623       } else
2624         Op0 = LHS;
2625
2626       // Ensure valid CondCode for Compare Mask against Zero instruction:
2627       // EQ, GE, GT, LE, LT.
2628       if (ISD::SETNE == CC) {
2629         Invert = true;
2630         CC = ISD::SETEQ;
2631       }
2632
2633       // Using constant type to differentiate integer and FP compares with zero.
2634       Op1 = DAG.getConstant(0, MVT::i32);
2635       Opcode = AArch64ISD::NEON_CMPZ;
2636
2637     } else {
2638       // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
2639       // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
2640       bool Swap = false;
2641       switch (CC) {
2642       default:
2643         llvm_unreachable("Illegal integer comparison.");
2644       case ISD::SETEQ:
2645       case ISD::SETGT:
2646       case ISD::SETGE:
2647       case ISD::SETUGT:
2648       case ISD::SETUGE:
2649         break;
2650       case ISD::SETNE:
2651         Invert = true;
2652         CC = ISD::SETEQ;
2653         break;
2654       case ISD::SETULT:
2655       case ISD::SETULE:
2656       case ISD::SETLT:
2657       case ISD::SETLE:
2658         Swap = true;
2659         CC = getSetCCSwappedOperands(CC);
2660       }
2661
2662       if (Swap)
2663         std::swap(LHS, RHS);
2664
2665       Opcode = AArch64ISD::NEON_CMP;
2666       Op0 = LHS;
2667       Op1 = RHS;
2668     }
2669
2670     // Generate Compare Mask instr or Compare Mask against Zero instr.
2671     SDValue NeonCmp =
2672         DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2673
2674     if (Invert)
2675       NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2676
2677     return NeonCmp;
2678   }
2679
2680   // Now handle Floating Point cases.
2681   // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
2682   if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2683       ISD::isBuildVectorAllZeros(LHS.getNode())) {
2684
2685     // If LHS is the zero value, swap operands and CondCode.
2686     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2687       CC = getSetCCSwappedOperands(CC);
2688       Op0 = RHS;
2689     } else
2690       Op0 = LHS;
2691
2692     // Using constant type to differentiate integer and FP compares with zero.
2693     Op1 = DAG.getConstantFP(0, MVT::f32);
2694     Opcode = AArch64ISD::NEON_CMPZ;
2695   } else {
2696     // Attempt to use Vector Floating Point Compare Mask instruction.
2697     Op0 = LHS;
2698     Op1 = RHS;
2699     Opcode = AArch64ISD::NEON_CMP;
2700   }
2701
2702   SDValue NeonCmpAlt;
2703   // Some register compares have to be implemented with swapped CC and operands,
2704   // e.g.: OLT implemented as OGT with swapped operands.
2705   bool SwapIfRegArgs = false;
2706
2707   // Ensure valid CondCode for FP Compare Mask against Zero instruction:
2708   // EQ, GE, GT, LE, LT.
2709   // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
2710   switch (CC) {
2711   default:
2712     llvm_unreachable("Illegal FP comparison");
2713   case ISD::SETUNE:
2714   case ISD::SETNE:
2715     Invert = true; // Fallthrough
2716   case ISD::SETOEQ:
2717   case ISD::SETEQ:
2718     CC = ISD::SETEQ;
2719     break;
2720   case ISD::SETOLT:
2721   case ISD::SETLT:
2722     CC = ISD::SETLT;
2723     SwapIfRegArgs = true;
2724     break;
2725   case ISD::SETOGT:
2726   case ISD::SETGT:
2727     CC = ISD::SETGT;
2728     break;
2729   case ISD::SETOLE:
2730   case ISD::SETLE:
2731     CC = ISD::SETLE;
2732     SwapIfRegArgs = true;
2733     break;
2734   case ISD::SETOGE:
2735   case ISD::SETGE:
2736     CC = ISD::SETGE;
2737     break;
2738   case ISD::SETUGE:
2739     Invert = true;
2740     CC = ISD::SETLT;
2741     SwapIfRegArgs = true;
2742     break;
2743   case ISD::SETULE:
2744     Invert = true;
2745     CC = ISD::SETGT;
2746     break;
2747   case ISD::SETUGT:
2748     Invert = true;
2749     CC = ISD::SETLE;
2750     SwapIfRegArgs = true;
2751     break;
2752   case ISD::SETULT:
2753     Invert = true;
2754     CC = ISD::SETGE;
2755     break;
2756   case ISD::SETUEQ:
2757     Invert = true; // Fallthrough
2758   case ISD::SETONE:
2759     // Expand this to (OGT |OLT).
2760     NeonCmpAlt =
2761         DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
2762     CC = ISD::SETLT;
2763     SwapIfRegArgs = true;
2764     break;
2765   case ISD::SETUO:
2766     Invert = true; // Fallthrough
2767   case ISD::SETO:
2768     // Expand this to (OGE | OLT).
2769     NeonCmpAlt =
2770         DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
2771     CC = ISD::SETLT;
2772     SwapIfRegArgs = true;
2773     break;
2774   }
2775
2776   if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
2777     CC = getSetCCSwappedOperands(CC);
2778     std::swap(Op0, Op1);
2779   }
2780
2781   // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
2782   SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2783
2784   if (NeonCmpAlt.getNode())
2785     NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
2786
2787   if (Invert)
2788     NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2789
2790   return NeonCmp;
2791 }
2792
2793 // (SETCC lhs, rhs, condcode)
2794 SDValue
2795 AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
2796   SDLoc dl(Op);
2797   SDValue LHS = Op.getOperand(0);
2798   SDValue RHS = Op.getOperand(1);
2799   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2800   EVT VT = Op.getValueType();
2801
2802   if (VT.isVector())
2803     return LowerVectorSETCC(Op, DAG);
2804
2805   if (LHS.getValueType() == MVT::f128) {
2806     // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
2807     // for the rest of the function (some i32 or i64 values).
2808     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2809
2810     // If softenSetCCOperands returned a scalar, use it.
2811     if (RHS.getNode() == 0) {
2812       assert(LHS.getValueType() == Op.getValueType() &&
2813              "Unexpected setcc expansion!");
2814       return LHS;
2815     }
2816   }
2817
2818   if (LHS.getValueType().isInteger()) {
2819     SDValue A64cc;
2820
2821     // Integers are handled in a separate function because the combinations of
2822     // immediates and tests can get hairy and we may want to fiddle things.
2823     SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2824
2825     return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
2826                        CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
2827                        A64cc);
2828   }
2829
2830   // Note that some LLVM floating-point CondCodes can't be lowered to a single
2831   // conditional branch, hence FPCCToA64CC can set a second test, where either
2832   // passing is sufficient.
2833   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2834   CondCode = FPCCToA64CC(CC, Alternative);
2835   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2836   SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2837                               DAG.getCondCode(CC));
2838   SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
2839                                      CmpOp, DAG.getConstant(1, VT),
2840                                      DAG.getConstant(0, VT), A64cc);
2841
2842   if (Alternative != A64CC::Invalid) {
2843     A64cc = DAG.getConstant(Alternative, MVT::i32);
2844     A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
2845                                DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
2846   }
2847
2848   return A64SELECT_CC;
2849 }
2850
2851 SDValue
2852 AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
2853   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
2854   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
2855
2856   // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
2857   // rather than just 8.
2858   return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
2859                        Op.getOperand(1), Op.getOperand(2),
2860                        DAG.getConstant(32, MVT::i32), 8, false, false,
2861                        MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
2862 }
2863
2864 SDValue
2865 AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2866   // The layout of the va_list struct is specified in the AArch64 Procedure Call
2867   // Standard, section B.3.
2868   MachineFunction &MF = DAG.getMachineFunction();
2869   AArch64MachineFunctionInfo *FuncInfo
2870     = MF.getInfo<AArch64MachineFunctionInfo>();
2871   SDLoc DL(Op);
2872
2873   SDValue Chain = Op.getOperand(0);
2874   SDValue VAList = Op.getOperand(1);
2875   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2876   SmallVector<SDValue, 4> MemOps;
2877
2878   // void *__stack at offset 0
2879   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
2880                                     getPointerTy());
2881   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
2882                                 MachinePointerInfo(SV), false, false, 0));
2883
2884   // void *__gr_top at offset 8
2885   int GPRSize = FuncInfo->getVariadicGPRSize();
2886   if (GPRSize > 0) {
2887     SDValue GRTop, GRTopAddr;
2888
2889     GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2890                             DAG.getConstant(8, getPointerTy()));
2891
2892     GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
2893     GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
2894                         DAG.getConstant(GPRSize, getPointerTy()));
2895
2896     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
2897                                   MachinePointerInfo(SV, 8),
2898                                   false, false, 0));
2899   }
2900
2901   // void *__vr_top at offset 16
2902   int FPRSize = FuncInfo->getVariadicFPRSize();
2903   if (FPRSize > 0) {
2904     SDValue VRTop, VRTopAddr;
2905     VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2906                             DAG.getConstant(16, getPointerTy()));
2907
2908     VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
2909     VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
2910                         DAG.getConstant(FPRSize, getPointerTy()));
2911
2912     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
2913                                   MachinePointerInfo(SV, 16),
2914                                   false, false, 0));
2915   }
2916
2917   // int __gr_offs at offset 24
2918   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2919                                    DAG.getConstant(24, getPointerTy()));
2920   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
2921                                 GROffsAddr, MachinePointerInfo(SV, 24),
2922                                 false, false, 0));
2923
2924   // int __vr_offs at offset 28
2925   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2926                                    DAG.getConstant(28, getPointerTy()));
2927   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
2928                                 VROffsAddr, MachinePointerInfo(SV, 28),
2929                                 false, false, 0));
2930
2931   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
2932                      MemOps.size());
2933 }
2934
2935 SDValue
2936 AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2937   switch (Op.getOpcode()) {
2938   default: llvm_unreachable("Don't know how to custom lower this!");
2939   case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
2940   case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
2941   case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
2942   case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
2943   case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
2944   case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
2945   case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
2946   case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
2947   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
2948   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
2949   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
2950   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
2951
2952   case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
2953   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
2954   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
2955   case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
2956   case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
2957   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
2958   case ISD::JumpTable: return LowerJumpTable(Op, DAG);
2959   case ISD::SELECT: return LowerSELECT(Op, DAG);
2960   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
2961   case ISD::SETCC: return LowerSETCC(Op, DAG);
2962   case ISD::VACOPY: return LowerVACOPY(Op, DAG);
2963   case ISD::VASTART: return LowerVASTART(Op, DAG);
2964   case ISD::BUILD_VECTOR:
2965     return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
2966   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
2967   }
2968
2969   return SDValue();
2970 }
2971
2972 /// Check if the specified splat value corresponds to a valid vector constant
2973 /// for a Neon instruction with a "modified immediate" operand (e.g., MOVI).  If
2974 /// so, return the encoded 8-bit immediate and the OpCmode instruction fields
2975 /// values.
2976 static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
2977                               unsigned SplatBitSize, SelectionDAG &DAG,
2978                               bool is128Bits, NeonModImmType type, EVT &VT,
2979                               unsigned &Imm, unsigned &OpCmode) {
2980   switch (SplatBitSize) {
2981   default:
2982     llvm_unreachable("unexpected size for isNeonModifiedImm");
2983   case 8: {
2984     if (type != Neon_Mov_Imm)
2985       return false;
2986     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
2987     // Neon movi per byte: Op=0, Cmode=1110.
2988     OpCmode = 0xe;
2989     Imm = SplatBits;
2990     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
2991     break;
2992   }
2993   case 16: {
2994     // Neon move inst per halfword
2995     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
2996     if ((SplatBits & ~0xff) == 0) {
2997       // Value = 0x00nn is 0x00nn LSL 0
2998       // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
2999       // bic:  Op=1, Cmode=1001;  orr:  Op=0, Cmode=1001
3000       // Op=x, Cmode=100y
3001       Imm = SplatBits;
3002       OpCmode = 0x8;
3003       break;
3004     }
3005     if ((SplatBits & ~0xff00) == 0) {
3006       // Value = 0xnn00 is 0x00nn LSL 8
3007       // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
3008       // bic:  Op=1, Cmode=1011;  orr:  Op=0, Cmode=1011
3009       // Op=x, Cmode=101x
3010       Imm = SplatBits >> 8;
3011       OpCmode = 0xa;
3012       break;
3013     }
3014     // can't handle any other
3015     return false;
3016   }
3017
3018   case 32: {
3019     // First the LSL variants (MSL is unusable by some interested instructions).
3020
3021     // Neon move instr per word, shift zeros
3022     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
3023     if ((SplatBits & ~0xff) == 0) {
3024       // Value = 0x000000nn is 0x000000nn LSL 0
3025       // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
3026       // bic:  Op=1, Cmode= 0001; orr:  Op=0, Cmode= 0001
3027       // Op=x, Cmode=000x
3028       Imm = SplatBits;
3029       OpCmode = 0;
3030       break;
3031     }
3032     if ((SplatBits & ~0xff00) == 0) {
3033       // Value = 0x0000nn00 is 0x000000nn LSL 8
3034       // movi: Op=0, Cmode= 0010;  mvni: Op=1, Cmode= 0010
3035       // bic:  Op=1, Cmode= 0011;  orr : Op=0, Cmode= 0011
3036       // Op=x, Cmode=001x
3037       Imm = SplatBits >> 8;
3038       OpCmode = 0x2;
3039       break;
3040     }
3041     if ((SplatBits & ~0xff0000) == 0) {
3042       // Value = 0x00nn0000 is 0x000000nn LSL 16
3043       // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
3044       // bic:  Op=1, Cmode= 0101; orr:  Op=0, Cmode= 0101
3045       // Op=x, Cmode=010x
3046       Imm = SplatBits >> 16;
3047       OpCmode = 0x4;
3048       break;
3049     }
3050     if ((SplatBits & ~0xff000000) == 0) {
3051       // Value = 0xnn000000 is 0x000000nn LSL 24
3052       // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
3053       // bic:  Op=1, Cmode= 0111; orr:  Op=0, Cmode= 0111
3054       // Op=x, Cmode=011x
3055       Imm = SplatBits >> 24;
3056       OpCmode = 0x6;
3057       break;
3058     }
3059
3060     // Now the MSL immediates.
3061
3062     // Neon move instr per word, shift ones
3063     if ((SplatBits & ~0xffff) == 0 &&
3064         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
3065       // Value = 0x0000nnff is 0x000000nn MSL 8
3066       // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
3067       // Op=x, Cmode=1100
3068       Imm = SplatBits >> 8;
3069       OpCmode = 0xc;
3070       break;
3071     }
3072     if ((SplatBits & ~0xffffff) == 0 &&
3073         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
3074       // Value = 0x00nnffff is 0x000000nn MSL 16
3075       // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101
3076       // Op=x, Cmode=1101
3077       Imm = SplatBits >> 16;
3078       OpCmode = 0xd;
3079       break;
3080     }
3081     // can't handle any other
3082     return false;
3083   }
3084
3085   case 64: {
3086     if (type != Neon_Mov_Imm)
3087       return false;
3088     // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
3089     // movi Op=1, Cmode=1110.
3090     OpCmode = 0x1e;
3091     uint64_t BitMask = 0xff;
3092     uint64_t Val = 0;
3093     unsigned ImmMask = 1;
3094     Imm = 0;
3095     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
3096       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
3097         Val |= BitMask;
3098         Imm |= ImmMask;
3099       } else if ((SplatBits & BitMask) != 0) {
3100         return false;
3101       }
3102       BitMask <<= 8;
3103       ImmMask <<= 1;
3104     }
3105     SplatBits = Val;
3106     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
3107     break;
3108   }
3109   }
3110
3111   return true;
3112 }
3113
3114 static SDValue PerformANDCombine(SDNode *N,
3115                                  TargetLowering::DAGCombinerInfo &DCI) {
3116
3117   SelectionDAG &DAG = DCI.DAG;
3118   SDLoc DL(N);
3119   EVT VT = N->getValueType(0);
3120
3121   // We're looking for an SRA/SHL pair which form an SBFX.
3122
3123   if (VT != MVT::i32 && VT != MVT::i64)
3124     return SDValue();
3125
3126   if (!isa<ConstantSDNode>(N->getOperand(1)))
3127     return SDValue();
3128
3129   uint64_t TruncMask = N->getConstantOperandVal(1);
3130   if (!isMask_64(TruncMask))
3131     return SDValue();
3132
3133   uint64_t Width = CountPopulation_64(TruncMask);
3134   SDValue Shift = N->getOperand(0);
3135
3136   if (Shift.getOpcode() != ISD::SRL)
3137     return SDValue();
3138
3139   if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3140     return SDValue();
3141   uint64_t LSB = Shift->getConstantOperandVal(1);
3142
3143   if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3144     return SDValue();
3145
3146   return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
3147                      DAG.getConstant(LSB, MVT::i64),
3148                      DAG.getConstant(LSB + Width - 1, MVT::i64));
3149 }
3150
3151 /// For a true bitfield insert, the bits getting into that contiguous mask
3152 /// should come from the low part of an existing value: they must be formed from
3153 /// a compatible SHL operation (unless they're already low). This function
3154 /// checks that condition and returns the least-significant bit that's
3155 /// intended. If the operation not a field preparation, -1 is returned.
3156 static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
3157                             SDValue &MaskedVal, uint64_t Mask) {
3158   if (!isShiftedMask_64(Mask))
3159     return -1;
3160
3161   // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
3162   // instruction. BFI will do a left-shift by LSB before applying the mask we've
3163   // spotted, so in general we should pre-emptively "undo" that by making sure
3164   // the incoming bits have had a right-shift applied to them.
3165   //
3166   // This right shift, however, will combine with existing left/right shifts. In
3167   // the simplest case of a completely straight bitfield operation, it will be
3168   // expected to completely cancel out with an existing SHL. More complicated
3169   // cases (e.g. bitfield to bitfield copy) may still need a real shift before
3170   // the BFI.
3171
3172   uint64_t LSB = countTrailingZeros(Mask);
3173   int64_t ShiftRightRequired = LSB;
3174   if (MaskedVal.getOpcode() == ISD::SHL &&
3175       isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3176     ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
3177     MaskedVal = MaskedVal.getOperand(0);
3178   } else if (MaskedVal.getOpcode() == ISD::SRL &&
3179              isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3180     ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
3181     MaskedVal = MaskedVal.getOperand(0);
3182   }
3183
3184   if (ShiftRightRequired > 0)
3185     MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
3186                             DAG.getConstant(ShiftRightRequired, MVT::i64));
3187   else if (ShiftRightRequired < 0) {
3188     // We could actually end up with a residual left shift, for example with
3189     // "struc.bitfield = val << 1".
3190     MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
3191                             DAG.getConstant(-ShiftRightRequired, MVT::i64));
3192   }
3193
3194   return LSB;
3195 }
3196
3197 /// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
3198 /// a mask and an extension. Returns true if a BFI was found and provides
3199 /// information on its surroundings.
3200 static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
3201                           bool &Extended) {
3202   Extended = false;
3203   if (N.getOpcode() == ISD::ZERO_EXTEND) {
3204     Extended = true;
3205     N = N.getOperand(0);
3206   }
3207
3208   if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
3209     Mask = N->getConstantOperandVal(1);
3210     N = N.getOperand(0);
3211   } else {
3212     // Mask is the whole width.
3213     Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
3214   }
3215
3216   if (N.getOpcode() == AArch64ISD::BFI) {
3217     BFI = N;
3218     return true;
3219   }
3220
3221   return false;
3222 }
3223
3224 /// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
3225 /// is roughly equivalent to (and (BFI ...), mask). This form is used because it
3226 /// can often be further combined with a larger mask. Ultimately, we want mask
3227 /// to be 2^32-1 or 2^64-1 so the AND can be skipped.
3228 static SDValue tryCombineToBFI(SDNode *N,
3229                                TargetLowering::DAGCombinerInfo &DCI,
3230                                const AArch64Subtarget *Subtarget) {
3231   SelectionDAG &DAG = DCI.DAG;
3232   SDLoc DL(N);
3233   EVT VT = N->getValueType(0);
3234
3235   assert(N->getOpcode() == ISD::OR && "Unexpected root");
3236
3237   // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
3238   // abandon the effort.
3239   SDValue LHS = N->getOperand(0);
3240   if (LHS.getOpcode() != ISD::AND)
3241     return SDValue();
3242
3243   uint64_t LHSMask;
3244   if (isa<ConstantSDNode>(LHS.getOperand(1)))
3245     LHSMask = LHS->getConstantOperandVal(1);
3246   else
3247     return SDValue();
3248
3249   // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
3250   // is or abandon the effort.
3251   SDValue RHS = N->getOperand(1);
3252   if (RHS.getOpcode() != ISD::AND)
3253     return SDValue();
3254
3255   uint64_t RHSMask;
3256   if (isa<ConstantSDNode>(RHS.getOperand(1)))
3257     RHSMask = RHS->getConstantOperandVal(1);
3258   else
3259     return SDValue();
3260
3261   // Can't do anything if the masks are incompatible.
3262   if (LHSMask & RHSMask)
3263     return SDValue();
3264
3265   // Now we need one of the masks to be a contiguous field. Without loss of
3266   // generality that should be the RHS one.
3267   SDValue Bitfield = LHS.getOperand(0);
3268   if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
3269     // We know that LHS is a candidate new value, and RHS isn't already a better
3270     // one.
3271     std::swap(LHS, RHS);
3272     std::swap(LHSMask, RHSMask);
3273   }
3274
3275   // We've done our best to put the right operands in the right places, all we
3276   // can do now is check whether a BFI exists.
3277   Bitfield = RHS.getOperand(0);
3278   int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
3279   if (LSB == -1)
3280     return SDValue();
3281
3282   uint32_t Width = CountPopulation_64(RHSMask);
3283   assert(Width && "Expected non-zero bitfield width");
3284
3285   SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3286                             LHS.getOperand(0), Bitfield,
3287                             DAG.getConstant(LSB, MVT::i64),
3288                             DAG.getConstant(Width, MVT::i64));
3289
3290   // Mask is trivial
3291   if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3292     return BFI;
3293
3294   return DAG.getNode(ISD::AND, DL, VT, BFI,
3295                      DAG.getConstant(LHSMask | RHSMask, VT));
3296 }
3297
3298 /// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
3299 /// original input. This is surprisingly common because SROA splits things up
3300 /// into i8 chunks, so the originally detected MaskedBFI may actually only act
3301 /// on the low (say) byte of a word. This is then orred into the rest of the
3302 /// word afterwards.
3303 ///
3304 /// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
3305 ///
3306 /// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
3307 /// MaskedBFI. We can also deal with a certain amount of extend/truncate being
3308 /// involved.
3309 static SDValue tryCombineToLargerBFI(SDNode *N,
3310                                      TargetLowering::DAGCombinerInfo &DCI,
3311                                      const AArch64Subtarget *Subtarget) {
3312   SelectionDAG &DAG = DCI.DAG;
3313   SDLoc DL(N);
3314   EVT VT = N->getValueType(0);
3315
3316   // First job is to hunt for a MaskedBFI on either the left or right. Swap
3317   // operands if it's actually on the right.
3318   SDValue BFI;
3319   SDValue PossExtraMask;
3320   uint64_t ExistingMask = 0;
3321   bool Extended = false;
3322   if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
3323     PossExtraMask = N->getOperand(1);
3324   else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
3325     PossExtraMask = N->getOperand(0);
3326   else
3327     return SDValue();
3328
3329   // We can only combine a BFI with another compatible mask.
3330   if (PossExtraMask.getOpcode() != ISD::AND ||
3331       !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
3332     return SDValue();
3333
3334   uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
3335
3336   // Masks must be compatible.
3337   if (ExtraMask & ExistingMask)
3338     return SDValue();
3339
3340   SDValue OldBFIVal = BFI.getOperand(0);
3341   SDValue NewBFIVal = BFI.getOperand(1);
3342   if (Extended) {
3343     // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
3344     // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
3345     // need to be made compatible.
3346     assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
3347            && "Invalid types for BFI");
3348     OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
3349     NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
3350   }
3351
3352   // We need the MaskedBFI to be combined with a mask of the *same* value.
3353   if (PossExtraMask.getOperand(0) != OldBFIVal)
3354     return SDValue();
3355
3356   BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3357                     OldBFIVal, NewBFIVal,
3358                     BFI.getOperand(2), BFI.getOperand(3));
3359
3360   // If the masking is trivial, we don't need to create it.
3361   if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3362     return BFI;
3363
3364   return DAG.getNode(ISD::AND, DL, VT, BFI,
3365                      DAG.getConstant(ExtraMask | ExistingMask, VT));
3366 }
3367
3368 /// An EXTR instruction is made up of two shifts, ORed together. This helper
3369 /// searches for and classifies those shifts.
3370 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
3371                          bool &FromHi) {
3372   if (N.getOpcode() == ISD::SHL)
3373     FromHi = false;
3374   else if (N.getOpcode() == ISD::SRL)
3375     FromHi = true;
3376   else
3377     return false;
3378
3379   if (!isa<ConstantSDNode>(N.getOperand(1)))
3380     return false;
3381
3382   ShiftAmount = N->getConstantOperandVal(1);
3383   Src = N->getOperand(0);
3384   return true;
3385 }
3386
3387 /// EXTR instruction extracts a contiguous chunk of bits from two existing
3388 /// registers viewed as a high/low pair. This function looks for the pattern:
3389 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
3390 /// EXTR. Can't quite be done in TableGen because the two immediates aren't
3391 /// independent.
3392 static SDValue tryCombineToEXTR(SDNode *N,
3393                                 TargetLowering::DAGCombinerInfo &DCI) {
3394   SelectionDAG &DAG = DCI.DAG;
3395   SDLoc DL(N);
3396   EVT VT = N->getValueType(0);
3397
3398   assert(N->getOpcode() == ISD::OR && "Unexpected root");
3399
3400   if (VT != MVT::i32 && VT != MVT::i64)
3401     return SDValue();
3402
3403   SDValue LHS;
3404   uint32_t ShiftLHS = 0;
3405   bool LHSFromHi = 0;
3406   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
3407     return SDValue();
3408
3409   SDValue RHS;
3410   uint32_t ShiftRHS = 0;
3411   bool RHSFromHi = 0;
3412   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
3413     return SDValue();
3414
3415   // If they're both trying to come from the high part of the register, they're
3416   // not really an EXTR.
3417   if (LHSFromHi == RHSFromHi)
3418     return SDValue();
3419
3420   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
3421     return SDValue();
3422
3423   if (LHSFromHi) {
3424     std::swap(LHS, RHS);
3425     std::swap(ShiftLHS, ShiftRHS);
3426   }
3427
3428   return DAG.getNode(AArch64ISD::EXTR, DL, VT,
3429                      LHS, RHS,
3430                      DAG.getConstant(ShiftRHS, MVT::i64));
3431 }
3432
3433 /// Target-specific dag combine xforms for ISD::OR
3434 static SDValue PerformORCombine(SDNode *N,
3435                                 TargetLowering::DAGCombinerInfo &DCI,
3436                                 const AArch64Subtarget *Subtarget) {
3437
3438   SelectionDAG &DAG = DCI.DAG;
3439   SDLoc DL(N);
3440   EVT VT = N->getValueType(0);
3441
3442   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3443     return SDValue();
3444
3445   // Attempt to recognise bitfield-insert operations.
3446   SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
3447   if (Res.getNode())
3448     return Res;
3449
3450   // Attempt to combine an existing MaskedBFI operation into one with a larger
3451   // mask.
3452   Res = tryCombineToLargerBFI(N, DCI, Subtarget);
3453   if (Res.getNode())
3454     return Res;
3455
3456   Res = tryCombineToEXTR(N, DCI);
3457   if (Res.getNode())
3458     return Res;
3459
3460   if (!Subtarget->hasNEON())
3461     return SDValue();
3462
3463   // Attempt to use vector immediate-form BSL
3464   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
3465
3466   SDValue N0 = N->getOperand(0);
3467   if (N0.getOpcode() != ISD::AND)
3468     return SDValue();
3469
3470   SDValue N1 = N->getOperand(1);
3471   if (N1.getOpcode() != ISD::AND)
3472     return SDValue();
3473
3474   if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
3475     APInt SplatUndef;
3476     unsigned SplatBitSize;
3477     bool HasAnyUndefs;
3478     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
3479     APInt SplatBits0;
3480     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
3481                                       HasAnyUndefs) &&
3482         !HasAnyUndefs) {
3483       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
3484       APInt SplatBits1;
3485       if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
3486                                         HasAnyUndefs) && !HasAnyUndefs &&
3487           SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
3488           SplatBits0 == ~SplatBits1) {
3489
3490         return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1),
3491                            N0->getOperand(0), N1->getOperand(0));
3492       }
3493     }
3494   }
3495
3496   return SDValue();
3497 }
3498
3499 /// Target-specific dag combine xforms for ISD::SRA
3500 static SDValue PerformSRACombine(SDNode *N,
3501                                  TargetLowering::DAGCombinerInfo &DCI) {
3502
3503   SelectionDAG &DAG = DCI.DAG;
3504   SDLoc DL(N);
3505   EVT VT = N->getValueType(0);
3506
3507   // We're looking for an SRA/SHL pair which form an SBFX.
3508
3509   if (VT != MVT::i32 && VT != MVT::i64)
3510     return SDValue();
3511
3512   if (!isa<ConstantSDNode>(N->getOperand(1)))
3513     return SDValue();
3514
3515   uint64_t ExtraSignBits = N->getConstantOperandVal(1);
3516   SDValue Shift = N->getOperand(0);
3517
3518   if (Shift.getOpcode() != ISD::SHL)
3519     return SDValue();
3520
3521   if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3522     return SDValue();
3523
3524   uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
3525   uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
3526   uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
3527
3528   if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3529     return SDValue();
3530
3531   return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
3532                      DAG.getConstant(LSB, MVT::i64),
3533                      DAG.getConstant(LSB + Width - 1, MVT::i64));
3534 }
3535
3536 /// Check if this is a valid build_vector for the immediate operand of
3537 /// a vector shift operation, where all the elements of the build_vector
3538 /// must have the same constant integer value.
3539 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
3540   // Ignore bit_converts.
3541   while (Op.getOpcode() == ISD::BITCAST)
3542     Op = Op.getOperand(0);
3543   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
3544   APInt SplatBits, SplatUndef;
3545   unsigned SplatBitSize;
3546   bool HasAnyUndefs;
3547   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
3548                                       HasAnyUndefs, ElementBits) ||
3549       SplatBitSize > ElementBits)
3550     return false;
3551   Cnt = SplatBits.getSExtValue();
3552   return true;
3553 }
3554
3555 /// Check if this is a valid build_vector for the immediate operand of
3556 /// a vector shift left operation.  That value must be in the range:
3557 /// 0 <= Value < ElementBits
3558 static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
3559   assert(VT.isVector() && "vector shift count is not a vector type");
3560   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3561   if (!getVShiftImm(Op, ElementBits, Cnt))
3562     return false;
3563   return (Cnt >= 0 && Cnt < ElementBits);
3564 }
3565
3566 /// Check if this is a valid build_vector for the immediate operand of a
3567 /// vector shift right operation. The value must be in the range:
3568 ///   1 <= Value <= ElementBits
3569 static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
3570   assert(VT.isVector() && "vector shift count is not a vector type");
3571   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3572   if (!getVShiftImm(Op, ElementBits, Cnt))
3573     return false;
3574   return (Cnt >= 1 && Cnt <= ElementBits);
3575 }
3576
3577 /// Checks for immediate versions of vector shifts and lowers them.
3578 static SDValue PerformShiftCombine(SDNode *N,
3579                                    TargetLowering::DAGCombinerInfo &DCI,
3580                                    const AArch64Subtarget *ST) {
3581   SelectionDAG &DAG = DCI.DAG;
3582   EVT VT = N->getValueType(0);
3583   if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
3584     return PerformSRACombine(N, DCI);
3585
3586   // Nothing to be done for scalar shifts.
3587   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3588   if (!VT.isVector() || !TLI.isTypeLegal(VT))
3589     return SDValue();
3590
3591   assert(ST->hasNEON() && "unexpected vector shift");
3592   int64_t Cnt;
3593
3594   switch (N->getOpcode()) {
3595   default:
3596     llvm_unreachable("unexpected shift opcode");
3597
3598   case ISD::SHL:
3599     if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
3600       SDValue RHS =
3601           DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
3602                       DAG.getConstant(Cnt, MVT::i32));
3603       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
3604     }
3605     break;
3606
3607   case ISD::SRA:
3608   case ISD::SRL:
3609     if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
3610       SDValue RHS =
3611           DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
3612                       DAG.getConstant(Cnt, MVT::i32));
3613       return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
3614     }
3615     break;
3616   }
3617
3618   return SDValue();
3619 }
3620
3621 /// ARM-specific DAG combining for intrinsics.
3622 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
3623   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3624
3625   switch (IntNo) {
3626   default:
3627     // Don't do anything for most intrinsics.
3628     break;
3629
3630   case Intrinsic::arm_neon_vqshifts:
3631   case Intrinsic::arm_neon_vqshiftu:
3632     EVT VT = N->getOperand(1).getValueType();
3633     int64_t Cnt;
3634     if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
3635       break;
3636     unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
3637                              ? AArch64ISD::NEON_QSHLs
3638                              : AArch64ISD::NEON_QSHLu;
3639     return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
3640                        N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
3641   }
3642
3643   return SDValue();
3644 }
3645
3646 /// Target-specific DAG combine function for NEON load/store intrinsics
3647 /// to merge base address updates.
3648 static SDValue CombineBaseUpdate(SDNode *N,
3649                                  TargetLowering::DAGCombinerInfo &DCI) {
3650   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
3651     return SDValue();
3652
3653   SelectionDAG &DAG = DCI.DAG;
3654   bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
3655                       N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
3656   unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
3657   SDValue Addr = N->getOperand(AddrOpIdx);
3658
3659   // Search for a use of the address operand that is an increment.
3660   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
3661        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
3662     SDNode *User = *UI;
3663     if (User->getOpcode() != ISD::ADD ||
3664         UI.getUse().getResNo() != Addr.getResNo())
3665       continue;
3666
3667     // Check that the add is independent of the load/store.  Otherwise, folding
3668     // it would create a cycle.
3669     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
3670       continue;
3671
3672     // Find the new opcode for the updating load/store.
3673     bool isLoad = true;
3674     bool isLaneOp = false;
3675     unsigned NewOpc = 0;
3676     unsigned NumVecs = 0;
3677     if (isIntrinsic) {
3678       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
3679       switch (IntNo) {
3680       default: llvm_unreachable("unexpected intrinsic for Neon base update");
3681       case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
3682         NumVecs = 1; break;
3683       case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
3684         NumVecs = 2; break;
3685       case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
3686         NumVecs = 3; break;
3687       case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
3688         NumVecs = 4; break;
3689       case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
3690         NumVecs = 1; isLoad = false; break;
3691       case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
3692         NumVecs = 2; isLoad = false; break;
3693       case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
3694         NumVecs = 3; isLoad = false; break;
3695       case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
3696         NumVecs = 4; isLoad = false; break;
3697       case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
3698         NumVecs = 2; break;
3699       case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
3700         NumVecs = 3; break;
3701       case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
3702         NumVecs = 4; break;
3703       case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
3704         NumVecs = 2; isLoad = false; break;
3705       case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
3706         NumVecs = 3; isLoad = false; break;
3707       case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
3708         NumVecs = 4; isLoad = false; break;
3709       case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
3710         NumVecs = 2; isLaneOp = true; break;
3711       case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
3712         NumVecs = 3; isLaneOp = true; break;
3713       case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
3714         NumVecs = 4; isLaneOp = true; break;
3715       case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
3716         NumVecs = 2; isLoad = false; isLaneOp = true; break;
3717       case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
3718         NumVecs = 3; isLoad = false; isLaneOp = true; break;
3719       case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
3720         NumVecs = 4; isLoad = false; isLaneOp = true; break;
3721       }
3722     } else {
3723       isLaneOp = true;
3724       switch (N->getOpcode()) {
3725       default: llvm_unreachable("unexpected opcode for Neon base update");
3726       case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
3727         NumVecs = 2; break;
3728       case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
3729         NumVecs = 3; break;
3730       case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
3731         NumVecs = 4; break;
3732       }
3733     }
3734
3735     // Find the size of memory referenced by the load/store.
3736     EVT VecTy;
3737     if (isLoad)
3738       VecTy = N->getValueType(0);
3739     else
3740       VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
3741     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
3742     if (isLaneOp)
3743       NumBytes /= VecTy.getVectorNumElements();
3744
3745     // If the increment is a constant, it must match the memory ref size.
3746     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
3747     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
3748       uint32_t IncVal = CInc->getZExtValue();
3749       if (IncVal != NumBytes)
3750         continue;
3751       Inc = DAG.getTargetConstant(IncVal, MVT::i32);
3752     }
3753
3754     // Create the new updating load/store node.
3755     EVT Tys[6];
3756     unsigned NumResultVecs = (isLoad ? NumVecs : 0);
3757     unsigned n;
3758     for (n = 0; n < NumResultVecs; ++n)
3759       Tys[n] = VecTy;
3760     Tys[n++] = MVT::i64;
3761     Tys[n] = MVT::Other;
3762     SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
3763     SmallVector<SDValue, 8> Ops;
3764     Ops.push_back(N->getOperand(0)); // incoming chain
3765     Ops.push_back(N->getOperand(AddrOpIdx));
3766     Ops.push_back(Inc);
3767     for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
3768       Ops.push_back(N->getOperand(i));
3769     }
3770     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
3771     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
3772                                            Ops.data(), Ops.size(),
3773                                            MemInt->getMemoryVT(),
3774                                            MemInt->getMemOperand());
3775
3776     // Update the uses.
3777     std::vector<SDValue> NewResults;
3778     for (unsigned i = 0; i < NumResultVecs; ++i) {
3779       NewResults.push_back(SDValue(UpdN.getNode(), i));
3780     }
3781     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
3782     DCI.CombineTo(N, NewResults);
3783     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
3784
3785     break;
3786   }
3787   return SDValue();
3788 }
3789
3790 /// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
3791 /// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
3792 /// If so, combine them to a vldN-dup operation and return true.
3793 static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
3794   SelectionDAG &DAG = DCI.DAG;
3795   EVT VT = N->getValueType(0);
3796
3797   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
3798   SDNode *VLD = N->getOperand(0).getNode();
3799   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
3800     return SDValue();
3801   unsigned NumVecs = 0;
3802   unsigned NewOpc = 0;
3803   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
3804   if (IntNo == Intrinsic::arm_neon_vld2lane) {
3805     NumVecs = 2;
3806     NewOpc = AArch64ISD::NEON_LD2DUP;
3807   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
3808     NumVecs = 3;
3809     NewOpc = AArch64ISD::NEON_LD3DUP;
3810   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
3811     NumVecs = 4;
3812     NewOpc = AArch64ISD::NEON_LD4DUP;
3813   } else {
3814     return SDValue();
3815   }
3816
3817   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
3818   // numbers match the load.
3819   unsigned VLDLaneNo =
3820       cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
3821   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
3822        UI != UE; ++UI) {
3823     // Ignore uses of the chain result.
3824     if (UI.getUse().getResNo() == NumVecs)
3825       continue;
3826     SDNode *User = *UI;
3827     if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
3828         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
3829       return SDValue();
3830   }
3831
3832   // Create the vldN-dup node.
3833   EVT Tys[5];
3834   unsigned n;
3835   for (n = 0; n < NumVecs; ++n)
3836     Tys[n] = VT;
3837   Tys[n] = MVT::Other;
3838   SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
3839   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
3840   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
3841   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
3842                                            VLDMemInt->getMemoryVT(),
3843                                            VLDMemInt->getMemOperand());
3844
3845   // Update the uses.
3846   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
3847        UI != UE; ++UI) {
3848     unsigned ResNo = UI.getUse().getResNo();
3849     // Ignore uses of the chain result.
3850     if (ResNo == NumVecs)
3851       continue;
3852     SDNode *User = *UI;
3853     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
3854   }
3855
3856   // Now the vldN-lane intrinsic is dead except for its chain result.
3857   // Update uses of the chain.
3858   std::vector<SDValue> VLDDupResults;
3859   for (unsigned n = 0; n < NumVecs; ++n)
3860     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
3861   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
3862   DCI.CombineTo(VLD, VLDDupResults);
3863
3864   return SDValue(N, 0);
3865 }
3866
3867 SDValue
3868 AArch64TargetLowering::PerformDAGCombine(SDNode *N,
3869                                          DAGCombinerInfo &DCI) const {
3870   switch (N->getOpcode()) {
3871   default: break;
3872   case ISD::AND: return PerformANDCombine(N, DCI);
3873   case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
3874   case ISD::SHL:
3875   case ISD::SRA:
3876   case ISD::SRL:
3877     return PerformShiftCombine(N, DCI, getSubtarget());
3878   case ISD::INTRINSIC_WO_CHAIN:
3879     return PerformIntrinsicCombine(N, DCI.DAG);
3880   case AArch64ISD::NEON_VDUPLANE:
3881     return CombineVLDDUP(N, DCI);
3882   case AArch64ISD::NEON_LD2DUP:
3883   case AArch64ISD::NEON_LD3DUP:
3884   case AArch64ISD::NEON_LD4DUP:
3885     return CombineBaseUpdate(N, DCI);
3886   case ISD::INTRINSIC_VOID:
3887   case ISD::INTRINSIC_W_CHAIN:
3888     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
3889     case Intrinsic::arm_neon_vld1:
3890     case Intrinsic::arm_neon_vld2:
3891     case Intrinsic::arm_neon_vld3:
3892     case Intrinsic::arm_neon_vld4:
3893     case Intrinsic::arm_neon_vst1:
3894     case Intrinsic::arm_neon_vst2:
3895     case Intrinsic::arm_neon_vst3:
3896     case Intrinsic::arm_neon_vst4:
3897     case Intrinsic::arm_neon_vld2lane:
3898     case Intrinsic::arm_neon_vld3lane:
3899     case Intrinsic::arm_neon_vld4lane:
3900     case Intrinsic::aarch64_neon_vld1x2:
3901     case Intrinsic::aarch64_neon_vld1x3:
3902     case Intrinsic::aarch64_neon_vld1x4:
3903     case Intrinsic::aarch64_neon_vst1x2:
3904     case Intrinsic::aarch64_neon_vst1x3:
3905     case Intrinsic::aarch64_neon_vst1x4:
3906     case Intrinsic::arm_neon_vst2lane:
3907     case Intrinsic::arm_neon_vst3lane:
3908     case Intrinsic::arm_neon_vst4lane:
3909       return CombineBaseUpdate(N, DCI);
3910     default:
3911       break;
3912     }
3913   }
3914   return SDValue();
3915 }
3916
3917 bool
3918 AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3919   VT = VT.getScalarType();
3920
3921   if (!VT.isSimple())
3922     return false;
3923
3924   switch (VT.getSimpleVT().SimpleTy) {
3925   case MVT::f16:
3926   case MVT::f32:
3927   case MVT::f64:
3928     return true;
3929   case MVT::f128:
3930     return false;
3931   default:
3932     break;
3933   }
3934
3935   return false;
3936 }
3937
3938 // Check whether a Build Vector could be presented as Shuffle Vector. If yes,
3939 // try to call LowerVECTOR_SHUFFLE to lower it.
3940 bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
3941                                                  SDValue &Res) const {
3942   SDLoc DL(Op);
3943   EVT VT = Op.getValueType();
3944   unsigned NumElts = VT.getVectorNumElements();
3945   unsigned V0NumElts = 0;
3946   int Mask[16];
3947   SDValue V0, V1;
3948
3949   // Check if all elements are extracted from less than 3 vectors.
3950   for (unsigned i = 0; i < NumElts; ++i) {
3951     SDValue Elt = Op.getOperand(i);
3952     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3953       return false;
3954
3955     if (V0.getNode() == 0) {
3956       V0 = Elt.getOperand(0);
3957       V0NumElts = V0.getValueType().getVectorNumElements();
3958     }
3959     if (Elt.getOperand(0) == V0) {
3960       Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
3961       continue;
3962     } else if (V1.getNode() == 0) {
3963       V1 = Elt.getOperand(0);
3964     }
3965     if (Elt.getOperand(0) == V1) {
3966       unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
3967       Mask[i] = (Lane + V0NumElts);
3968       continue;
3969     } else {
3970       return false;
3971     }
3972   }
3973
3974   if (!V1.getNode() && V0NumElts == NumElts * 2) {
3975     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
3976                      DAG.getConstant(NumElts, MVT::i64));
3977     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
3978                      DAG.getConstant(0, MVT::i64));
3979     V0NumElts = V0.getValueType().getVectorNumElements();
3980   }
3981
3982   if (V1.getNode() && NumElts == V0NumElts &&
3983       V0NumElts == V1.getValueType().getVectorNumElements()) {
3984     SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
3985     if(Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
3986       Res = Shuffle;
3987     else
3988       Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
3989     return true;
3990   } else
3991     return false;
3992 }
3993
3994 // If this is a case we can't handle, return null and let the default
3995 // expansion code take care of it.
3996 SDValue
3997 AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
3998                                          const AArch64Subtarget *ST) const {
3999
4000   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
4001   SDLoc DL(Op);
4002   EVT VT = Op.getValueType();
4003
4004   APInt SplatBits, SplatUndef;
4005   unsigned SplatBitSize;
4006   bool HasAnyUndefs;
4007
4008   unsigned UseNeonMov = VT.getSizeInBits() >= 64;
4009
4010   // Note we favor lowering MOVI over MVNI.
4011   // This has implications on the definition of patterns in TableGen to select
4012   // BIC immediate instructions but not ORR immediate instructions.
4013   // If this lowering order is changed, TableGen patterns for BIC immediate and
4014   // ORR immediate instructions have to be updated.
4015   if (UseNeonMov &&
4016       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
4017     if (SplatBitSize <= 64) {
4018       // First attempt to use vector immediate-form MOVI
4019       EVT NeonMovVT;
4020       unsigned Imm = 0;
4021       unsigned OpCmode = 0;
4022
4023       if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
4024                             SplatBitSize, DAG, VT.is128BitVector(),
4025                             Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
4026         SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
4027         SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
4028
4029         if (ImmVal.getNode() && OpCmodeVal.getNode()) {
4030           SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
4031                                         ImmVal, OpCmodeVal);
4032           return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
4033         }
4034       }
4035
4036       // Then attempt to use vector immediate-form MVNI
4037       uint64_t NegatedImm = (~SplatBits).getZExtValue();
4038       if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
4039                             DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
4040                             Imm, OpCmode)) {
4041         SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
4042         SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
4043         if (ImmVal.getNode() && OpCmodeVal.getNode()) {
4044           SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
4045                                         ImmVal, OpCmodeVal);
4046           return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
4047         }
4048       }
4049
4050       // Attempt to use vector immediate-form FMOV
4051       if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
4052           (VT == MVT::v2f64 && SplatBitSize == 64)) {
4053         APFloat RealVal(
4054             SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
4055             SplatBits);
4056         uint32_t ImmVal;
4057         if (A64Imms::isFPImm(RealVal, ImmVal)) {
4058           SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
4059           return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
4060         }
4061       }
4062     }
4063   }
4064
4065   unsigned NumElts = VT.getVectorNumElements();
4066   bool isOnlyLowElement = true;
4067   bool usesOnlyOneValue = true;
4068   bool hasDominantValue = false;
4069   bool isConstant = true;
4070
4071   // Map of the number of times a particular SDValue appears in the
4072   // element list.
4073   DenseMap<SDValue, unsigned> ValueCounts;
4074   SDValue Value;
4075   for (unsigned i = 0; i < NumElts; ++i) {
4076     SDValue V = Op.getOperand(i);
4077     if (V.getOpcode() == ISD::UNDEF)
4078       continue;
4079     if (i > 0)
4080       isOnlyLowElement = false;
4081     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
4082       isConstant = false;
4083
4084     ValueCounts.insert(std::make_pair(V, 0));
4085     unsigned &Count = ValueCounts[V];
4086
4087     // Is this value dominant? (takes up more than half of the lanes)
4088     if (++Count > (NumElts / 2)) {
4089       hasDominantValue = true;
4090       Value = V;
4091     }
4092   }
4093   if (ValueCounts.size() != 1)
4094     usesOnlyOneValue = false;
4095   if (!Value.getNode() && ValueCounts.size() > 0)
4096     Value = ValueCounts.begin()->first;
4097
4098   if (ValueCounts.size() == 0)
4099     return DAG.getUNDEF(VT);
4100
4101   if (isOnlyLowElement)
4102     return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
4103
4104   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4105   if (hasDominantValue && EltSize <= 64) {
4106     // Use VDUP for non-constant splats.
4107     if (!isConstant) {
4108       SDValue N;
4109
4110       // If we are DUPing a value that comes directly from a vector, we could
4111       // just use DUPLANE. We can only do this if the lane being extracted
4112       // is at a constant index, as the DUP from lane instructions only have
4113       // constant-index forms.
4114       //
4115       // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can
4116       // remove TRUNCATE for DUPLANE by apdating the source vector to
4117       // appropriate vector type and lane index.
4118       //
4119       // FIXME: for now we have v1i8, v1i16, v1i32 legal vector types, if they
4120       // are not legal any more, no need to check the type size in bits should
4121       // be large than 64.
4122       SDValue V = Value;
4123       if (Value->getOpcode() == ISD::TRUNCATE)
4124         V = Value->getOperand(0);
4125       if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4126           isa<ConstantSDNode>(V->getOperand(1)) &&
4127           V->getOperand(0).getValueType().getSizeInBits() >= 64) {
4128
4129         // If the element size of source vector is larger than DUPLANE
4130         // element size, we can do transformation by,
4131         // 1) bitcasting source register to smaller element vector
4132         // 2) mutiplying the lane index by SrcEltSize/ResEltSize
4133         // For example, we can lower
4134         //     "v8i16 vdup_lane(v4i32, 1)"
4135         // to be
4136         //     "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)".
4137         SDValue SrcVec = V->getOperand(0);
4138         unsigned SrcEltSize =
4139             SrcVec.getValueType().getVectorElementType().getSizeInBits();
4140         unsigned ResEltSize = VT.getVectorElementType().getSizeInBits();
4141         if (SrcEltSize > ResEltSize) {
4142           assert((SrcEltSize % ResEltSize == 0) && "Invalid element size");
4143           SDValue BitCast;
4144           unsigned SrcSize = SrcVec.getValueType().getSizeInBits();
4145           unsigned ResSize = VT.getSizeInBits();
4146
4147           if (SrcSize > ResSize) {
4148             assert((SrcSize % ResSize == 0) && "Invalid vector size");
4149             EVT CastVT =
4150                 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
4151                                  SrcSize / ResEltSize);
4152             BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec);
4153           } else {
4154             assert((SrcSize == ResSize) && "Invalid vector size of source vec");
4155             BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec);
4156           }
4157
4158           unsigned LaneIdx = V->getConstantOperandVal(1);
4159           SDValue Lane =
4160               DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64);
4161           N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane);
4162         } else {
4163           assert((SrcEltSize == ResEltSize) &&
4164                  "Invalid element size of source vec");
4165           N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0),
4166                           V->getOperand(1));
4167         }
4168       } else
4169         N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
4170
4171       if (!usesOnlyOneValue) {
4172         // The dominant value was splatted as 'N', but we now have to insert
4173         // all differing elements.
4174         for (unsigned I = 0; I < NumElts; ++I) {
4175           if (Op.getOperand(I) == Value)
4176             continue;
4177           SmallVector<SDValue, 3> Ops;
4178           Ops.push_back(N);
4179           Ops.push_back(Op.getOperand(I));
4180           Ops.push_back(DAG.getConstant(I, MVT::i64));
4181           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
4182         }
4183       }
4184       return N;
4185     }
4186     if (usesOnlyOneValue && isConstant) {
4187       return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
4188     }
4189   }
4190   // If all elements are constants and the case above didn't get hit, fall back
4191   // to the default expansion, which will generate a load from the constant
4192   // pool.
4193   if (isConstant)
4194     return SDValue();
4195
4196   // Try to lower this in lowering ShuffleVector way.
4197   SDValue Shuf;
4198   if (isKnownShuffleVector(Op, DAG, Shuf))
4199     return Shuf;
4200
4201   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
4202   // know the default expansion would otherwise fall back on something even
4203   // worse. For a vector with one or two non-undef values, that's
4204   // scalar_to_vector for the elements followed by a shuffle (provided the
4205   // shuffle is valid for the target) and materialization element by element
4206   // on the stack followed by a load for everything else.
4207   if (!isConstant && !usesOnlyOneValue) {
4208     SDValue Vec = DAG.getUNDEF(VT);
4209     for (unsigned i = 0 ; i < NumElts; ++i) {
4210       SDValue V = Op.getOperand(i);
4211       if (V.getOpcode() == ISD::UNDEF)
4212         continue;
4213       SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
4214       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
4215     }
4216     return Vec;
4217   }
4218   return SDValue();
4219 }
4220
4221 /// isREVMask - Check if a vector shuffle corresponds to a REV
4222 /// instruction with the specified blocksize.  (The order of the elements
4223 /// within each block of the vector is reversed.)
4224 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4225   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
4226          "Only possible block sizes for REV are: 16, 32, 64");
4227
4228   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4229   if (EltSz == 64)
4230     return false;
4231
4232   unsigned NumElts = VT.getVectorNumElements();
4233   unsigned BlockElts = M[0] + 1;
4234   // If the first shuffle index is UNDEF, be optimistic.
4235   if (M[0] < 0)
4236     BlockElts = BlockSize / EltSz;
4237
4238   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4239     return false;
4240
4241   for (unsigned i = 0; i < NumElts; ++i) {
4242     if (M[i] < 0)
4243       continue; // ignore UNDEF indices
4244     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
4245       return false;
4246   }
4247
4248   return true;
4249 }
4250
4251 // isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
4252 // TRN instruction.
4253 static unsigned isPermuteMask(ArrayRef<int> M, EVT VT, bool isV2undef) {
4254   unsigned NumElts = VT.getVectorNumElements();
4255   if (NumElts < 4)
4256     return 0;
4257
4258   bool ismatch = true;
4259
4260   // Check UZP1
4261   for (unsigned i = 0; i < NumElts; ++i) {
4262     unsigned answer = i * 2;
4263     if (isV2undef && answer >= NumElts)
4264       answer -= NumElts;
4265     if (M[i] != -1 && (unsigned)M[i] != answer) {
4266       ismatch = false;
4267       break;
4268     }
4269   }
4270   if (ismatch)
4271     return AArch64ISD::NEON_UZP1;
4272
4273   // Check UZP2
4274   ismatch = true;
4275   for (unsigned i = 0; i < NumElts; ++i) {
4276     unsigned answer = i * 2 + 1;
4277     if (isV2undef && answer >= NumElts)
4278       answer -= NumElts;
4279     if (M[i] != -1 && (unsigned)M[i] != answer) {
4280       ismatch = false;
4281       break;
4282     }
4283   }
4284   if (ismatch)
4285     return AArch64ISD::NEON_UZP2;
4286
4287   // Check ZIP1
4288   ismatch = true;
4289   for (unsigned i = 0; i < NumElts; ++i) {
4290     unsigned answer = i / 2 + NumElts * (i % 2);
4291     if (isV2undef && answer >= NumElts)
4292       answer -= NumElts;
4293     if (M[i] != -1 && (unsigned)M[i] != answer) {
4294       ismatch = false;
4295       break;
4296     }
4297   }
4298   if (ismatch)
4299     return AArch64ISD::NEON_ZIP1;
4300
4301   // Check ZIP2
4302   ismatch = true;
4303   for (unsigned i = 0; i < NumElts; ++i) {
4304     unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2);
4305     if (isV2undef && answer >= NumElts)
4306       answer -= NumElts;
4307     if (M[i] != -1 && (unsigned)M[i] != answer) {
4308       ismatch = false;
4309       break;
4310     }
4311   }
4312   if (ismatch)
4313     return AArch64ISD::NEON_ZIP2;
4314
4315   // Check TRN1
4316   ismatch = true;
4317   for (unsigned i = 0; i < NumElts; ++i) {
4318     unsigned answer = i + (NumElts - 1) * (i % 2);
4319     if (isV2undef && answer >= NumElts)
4320       answer -= NumElts;
4321     if (M[i] != -1 && (unsigned)M[i] != answer) {
4322       ismatch = false;
4323       break;
4324     }
4325   }
4326   if (ismatch)
4327     return AArch64ISD::NEON_TRN1;
4328
4329   // Check TRN2
4330   ismatch = true;
4331   for (unsigned i = 0; i < NumElts; ++i) {
4332     unsigned answer = 1 + i + (NumElts - 1) * (i % 2);
4333     if (isV2undef && answer >= NumElts)
4334       answer -= NumElts;
4335     if (M[i] != -1 && (unsigned)M[i] != answer) {
4336       ismatch = false;
4337       break;
4338     }
4339   }
4340   if (ismatch)
4341     return AArch64ISD::NEON_TRN2;
4342
4343   return 0;
4344 }
4345
4346 SDValue
4347 AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
4348                                            SelectionDAG &DAG) const {
4349   SDValue V1 = Op.getOperand(0);
4350   SDValue V2 = Op.getOperand(1);
4351   SDLoc dl(Op);
4352   EVT VT = Op.getValueType();
4353   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4354
4355   // Convert shuffles that are directly supported on NEON to target-specific
4356   // DAG nodes, instead of keeping them as shuffles and matching them again
4357   // during code selection.  This is more efficient and avoids the possibility
4358   // of inconsistencies between legalization and selection.
4359   ArrayRef<int> ShuffleMask = SVN->getMask();
4360
4361   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4362   if (EltSize > 64)
4363     return SDValue();
4364
4365   if (isREVMask(ShuffleMask, VT, 64))
4366     return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
4367   if (isREVMask(ShuffleMask, VT, 32))
4368     return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
4369   if (isREVMask(ShuffleMask, VT, 16))
4370     return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
4371
4372   unsigned ISDNo;
4373   if (V2.getOpcode() == ISD::UNDEF)
4374     ISDNo = isPermuteMask(ShuffleMask, VT, true);
4375   else
4376     ISDNo = isPermuteMask(ShuffleMask, VT, false);
4377
4378   if (ISDNo) {
4379     if (V2.getOpcode() == ISD::UNDEF)
4380       return DAG.getNode(ISDNo, dl, VT, V1, V1);
4381     else
4382       return DAG.getNode(ISDNo, dl, VT, V1, V2);
4383   }
4384
4385   // If the element of shuffle mask are all the same constant, we can
4386   // transform it into either NEON_VDUP or NEON_VDUPLANE
4387   if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
4388     int Lane = SVN->getSplatIndex();
4389     // If this is undef splat, generate it via "just" vdup, if possible.
4390     if (Lane == -1) Lane = 0;
4391
4392     // Test if V1 is a SCALAR_TO_VECTOR.
4393     if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4394       return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
4395     }
4396     // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
4397     if (V1.getOpcode() == ISD::BUILD_VECTOR) {
4398       bool IsScalarToVector = true;
4399       for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
4400         if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
4401             i != (unsigned)Lane) {
4402           IsScalarToVector = false;
4403           break;
4404         }
4405       if (IsScalarToVector)
4406         return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
4407                            V1.getOperand(Lane));
4408     }
4409
4410     // Test if V1 is a EXTRACT_SUBVECTOR.
4411     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4412       int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
4413       return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
4414                          DAG.getConstant(Lane + ExtLane, MVT::i64));
4415     }
4416     // Test if V1 is a CONCAT_VECTORS.
4417     if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
4418         V1.getOperand(1).getOpcode() == ISD::UNDEF) {
4419       SDValue Op0 = V1.getOperand(0);
4420       assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
4421              "Invalid vector lane access");
4422       return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
4423                          DAG.getConstant(Lane, MVT::i64));
4424     }
4425
4426     return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
4427                        DAG.getConstant(Lane, MVT::i64));
4428   }
4429
4430   int Length = ShuffleMask.size();
4431   int V1EltNum = V1.getValueType().getVectorNumElements();
4432
4433   // If the number of v1 elements is the same as the number of shuffle mask
4434   // element and the shuffle masks are sequential values, we can transform
4435   // it into NEON_VEXTRACT.
4436   if (V1EltNum == Length) {
4437     // Check if the shuffle mask is sequential.
4438     bool IsSequential = true;
4439     int CurMask = ShuffleMask[0];
4440     for (int I = 0; I < Length; ++I) {
4441       if (ShuffleMask[I] != CurMask) {
4442         IsSequential = false;
4443         break;
4444       }
4445       CurMask++;
4446     }
4447     if (IsSequential) {
4448       assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
4449       unsigned VecSize = EltSize * V1EltNum;
4450       unsigned Index = (EltSize/8) * ShuffleMask[0];
4451       if (VecSize == 64 || VecSize == 128)
4452         return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
4453                            DAG.getConstant(Index, MVT::i64));
4454     }
4455   }
4456
4457   // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
4458   // by element from V2 to V1 .
4459   // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
4460   // better choice to be inserted than V1 as less insert needed, so we count
4461   // element to be inserted for both V1 and V2, and select less one as insert
4462   // target.
4463
4464   // Collect elements need to be inserted and their index.
4465   SmallVector<int, 8> NV1Elt;
4466   SmallVector<int, 8> N1Index;
4467   SmallVector<int, 8> NV2Elt;
4468   SmallVector<int, 8> N2Index;
4469   for (int I = 0; I != Length; ++I) {
4470     if (ShuffleMask[I] != I) {
4471       NV1Elt.push_back(ShuffleMask[I]);
4472       N1Index.push_back(I);
4473     }
4474   }
4475   for (int I = 0; I != Length; ++I) {
4476     if (ShuffleMask[I] != (I + V1EltNum)) {
4477       NV2Elt.push_back(ShuffleMask[I]);
4478       N2Index.push_back(I);
4479     }
4480   }
4481
4482   // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
4483   // will be inserted.
4484   SDValue InsV = V1;
4485   SmallVector<int, 8> InsMasks = NV1Elt;
4486   SmallVector<int, 8> InsIndex = N1Index;
4487   if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
4488     if (NV1Elt.size() > NV2Elt.size()) {
4489       InsV = V2;
4490       InsMasks = NV2Elt;
4491       InsIndex = N2Index;
4492     }
4493   } else {
4494     InsV = DAG.getNode(ISD::UNDEF, dl, VT);
4495   }
4496
4497   for (int I = 0, E = InsMasks.size(); I != E; ++I) {
4498     SDValue ExtV = V1;
4499     int Mask = InsMasks[I];
4500     if (Mask >= V1EltNum) {
4501       ExtV = V2;
4502       Mask -= V1EltNum;
4503     }
4504     // Any value type smaller than i32 is illegal in AArch64, and this lower
4505     // function is called after legalize pass, so we need to legalize
4506     // the result here.
4507     EVT EltVT;
4508     if (VT.getVectorElementType().isFloatingPoint())
4509       EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
4510     else
4511       EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
4512
4513     if (Mask >= 0) {
4514       ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
4515                          DAG.getConstant(Mask, MVT::i64));
4516       InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
4517                          DAG.getConstant(InsIndex[I], MVT::i64));
4518     }
4519   }
4520   return InsV;
4521 }
4522
4523 AArch64TargetLowering::ConstraintType
4524 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
4525   if (Constraint.size() == 1) {
4526     switch (Constraint[0]) {
4527     default: break;
4528     case 'w': // An FP/SIMD vector register
4529       return C_RegisterClass;
4530     case 'I': // Constant that can be used with an ADD instruction
4531     case 'J': // Constant that can be used with a SUB instruction
4532     case 'K': // Constant that can be used with a 32-bit logical instruction
4533     case 'L': // Constant that can be used with a 64-bit logical instruction
4534     case 'M': // Constant that can be used as a 32-bit MOV immediate
4535     case 'N': // Constant that can be used as a 64-bit MOV immediate
4536     case 'Y': // Floating point constant zero
4537     case 'Z': // Integer constant zero
4538       return C_Other;
4539     case 'Q': // A memory reference with base register and no offset
4540       return C_Memory;
4541     case 'S': // A symbolic address
4542       return C_Other;
4543     }
4544   }
4545
4546   // FIXME: Ump, Utf, Usa, Ush
4547   // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
4548   //      whatever they may be
4549   // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
4550   // Usa: An absolute symbolic address
4551   // Ush: The high part (bits 32:12) of a pc-relative symbolic address
4552   assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
4553          && Constraint != "Ush" && "Unimplemented constraints");
4554
4555   return TargetLowering::getConstraintType(Constraint);
4556 }
4557
4558 TargetLowering::ConstraintWeight
4559 AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
4560                                                 const char *Constraint) const {
4561
4562   llvm_unreachable("Constraint weight unimplemented");
4563 }
4564
4565 void
4566 AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
4567                                                     std::string &Constraint,
4568                                                     std::vector<SDValue> &Ops,
4569                                                     SelectionDAG &DAG) const {
4570   SDValue Result(0, 0);
4571
4572   // Only length 1 constraints are C_Other.
4573   if (Constraint.size() != 1) return;
4574
4575   // Only C_Other constraints get lowered like this. That means constants for us
4576   // so return early if there's no hope the constraint can be lowered.
4577
4578   switch(Constraint[0]) {
4579   default: break;
4580   case 'I': case 'J': case 'K': case 'L':
4581   case 'M': case 'N': case 'Z': {
4582     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
4583     if (!C)
4584       return;
4585
4586     uint64_t CVal = C->getZExtValue();
4587     uint32_t Bits;
4588
4589     switch (Constraint[0]) {
4590     default:
4591       // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
4592       // is a peculiarly useless SUB constraint.
4593       llvm_unreachable("Unimplemented C_Other constraint");
4594     case 'I':
4595       if (CVal <= 0xfff)
4596         break;
4597       return;
4598     case 'K':
4599       if (A64Imms::isLogicalImm(32, CVal, Bits))
4600         break;
4601       return;
4602     case 'L':
4603       if (A64Imms::isLogicalImm(64, CVal, Bits))
4604         break;
4605       return;
4606     case 'Z':
4607       if (CVal == 0)
4608         break;
4609       return;
4610     }
4611
4612     Result = DAG.getTargetConstant(CVal, Op.getValueType());
4613     break;
4614   }
4615   case 'S': {
4616     // An absolute symbolic address or label reference.
4617     if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
4618       Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
4619                                           GA->getValueType(0));
4620     } else if (const BlockAddressSDNode *BA
4621                  = dyn_cast<BlockAddressSDNode>(Op)) {
4622       Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
4623                                          BA->getValueType(0));
4624     } else if (const ExternalSymbolSDNode *ES
4625                  = dyn_cast<ExternalSymbolSDNode>(Op)) {
4626       Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
4627                                            ES->getValueType(0));
4628     } else
4629       return;
4630     break;
4631   }
4632   case 'Y':
4633     if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
4634       if (CFP->isExactlyValue(0.0)) {
4635         Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
4636         break;
4637       }
4638     }
4639     return;
4640   }
4641
4642   if (Result.getNode()) {
4643     Ops.push_back(Result);
4644     return;
4645   }
4646
4647   // It's an unknown constraint for us. Let generic code have a go.
4648   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
4649 }
4650
4651 std::pair<unsigned, const TargetRegisterClass*>
4652 AArch64TargetLowering::getRegForInlineAsmConstraint(
4653                                                   const std::string &Constraint,
4654                                                   MVT VT) const {
4655   if (Constraint.size() == 1) {
4656     switch (Constraint[0]) {
4657     case 'r':
4658       if (VT.getSizeInBits() <= 32)
4659         return std::make_pair(0U, &AArch64::GPR32RegClass);
4660       else if (VT == MVT::i64)
4661         return std::make_pair(0U, &AArch64::GPR64RegClass);
4662       break;
4663     case 'w':
4664       if (VT == MVT::f16)
4665         return std::make_pair(0U, &AArch64::FPR16RegClass);
4666       else if (VT == MVT::f32)
4667         return std::make_pair(0U, &AArch64::FPR32RegClass);
4668       else if (VT.getSizeInBits() == 64)
4669         return std::make_pair(0U, &AArch64::FPR64RegClass);
4670       else if (VT.getSizeInBits() == 128)
4671         return std::make_pair(0U, &AArch64::FPR128RegClass);
4672       break;
4673     }
4674   }
4675
4676   // Use the default implementation in TargetLowering to convert the register
4677   // constraint into a member of a register class.
4678   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
4679 }
4680
4681 /// Represent NEON load and store intrinsics as MemIntrinsicNodes.
4682 /// The associated MachineMemOperands record the alignment specified
4683 /// in the intrinsic calls.
4684 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4685                                                const CallInst &I,
4686                                                unsigned Intrinsic) const {
4687   switch (Intrinsic) {
4688   case Intrinsic::arm_neon_vld1:
4689   case Intrinsic::arm_neon_vld2:
4690   case Intrinsic::arm_neon_vld3:
4691   case Intrinsic::arm_neon_vld4:
4692   case Intrinsic::aarch64_neon_vld1x2:
4693   case Intrinsic::aarch64_neon_vld1x3:
4694   case Intrinsic::aarch64_neon_vld1x4:
4695   case Intrinsic::arm_neon_vld2lane:
4696   case Intrinsic::arm_neon_vld3lane:
4697   case Intrinsic::arm_neon_vld4lane: {
4698     Info.opc = ISD::INTRINSIC_W_CHAIN;
4699     // Conservatively set memVT to the entire set of vectors loaded.
4700     uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
4701     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
4702     Info.ptrVal = I.getArgOperand(0);
4703     Info.offset = 0;
4704     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
4705     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
4706     Info.vol = false; // volatile loads with NEON intrinsics not supported
4707     Info.readMem = true;
4708     Info.writeMem = false;
4709     return true;
4710   }
4711   case Intrinsic::arm_neon_vst1:
4712   case Intrinsic::arm_neon_vst2:
4713   case Intrinsic::arm_neon_vst3:
4714   case Intrinsic::arm_neon_vst4:
4715   case Intrinsic::aarch64_neon_vst1x2:
4716   case Intrinsic::aarch64_neon_vst1x3:
4717   case Intrinsic::aarch64_neon_vst1x4:
4718   case Intrinsic::arm_neon_vst2lane:
4719   case Intrinsic::arm_neon_vst3lane:
4720   case Intrinsic::arm_neon_vst4lane: {
4721     Info.opc = ISD::INTRINSIC_VOID;
4722     // Conservatively set memVT to the entire set of vectors stored.
4723     unsigned NumElts = 0;
4724     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
4725       Type *ArgTy = I.getArgOperand(ArgI)->getType();
4726       if (!ArgTy->isVectorTy())
4727         break;
4728       NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
4729     }
4730     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
4731     Info.ptrVal = I.getArgOperand(0);
4732     Info.offset = 0;
4733     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
4734     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
4735     Info.vol = false; // volatile stores with NEON intrinsics not supported
4736     Info.readMem = false;
4737     Info.writeMem = true;
4738     return true;
4739   }
4740   default:
4741     break;
4742   }
4743
4744   return false;
4745 }