lib/Target/AArch64/AArch64ISelLowering.cpp

   1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that AArch64 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #define DEBUG_TYPE "aarch64-isel"
  16 #include "AArch64.h"
  17 #include "AArch64ISelLowering.h"
  18 #include "AArch64MachineFunctionInfo.h"
  19 #include "AArch64TargetMachine.h"
  20 #include "AArch64TargetObjectFile.h"
  21 #include "Utils/AArch64BaseInfo.h"
  22 #include "llvm/CodeGen/Analysis.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  28 #include "llvm/IR/CallingConv.h"
  29 #include "llvm/Support/MathExtras.h"
  30
  31 using namespace llvm;
  32
  33 static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
  34   assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() &&
  35           "unknown subtarget type");
  36   return new AArch64ElfTargetObjectFile();
  37 }
  38
  39 AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
  40   : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
  41
  42   const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
  43
  44   // SIMD compares set the entire lane's bits to 1
  45   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  46
  47   // Scalar register <-> type mapping
  48   addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
  49   addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
  50
  51   if (Subtarget->hasFPARMv8()) {
  52     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
  53     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
  54     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
  55     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  56   }
  57
  58   if (Subtarget->hasNEON()) {
  59     // And the vectors
  60     addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
  61     addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
  62     addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
  63     addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
  64     addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
  65     addRegisterClass(MVT::v8i8,  &AArch64::FPR64RegClass);
  66     addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
  67     addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
  68     addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
  69     addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
  70     addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
  71     addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
  72     addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
  73     addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
  74     addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
  75     addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
  76   }
  77
  78   computeRegisterProperties();
  79
  80   // We combine OR nodes for bitfield and NEON BSL operations.
  81   setTargetDAGCombine(ISD::OR);
  82
  83   setTargetDAGCombine(ISD::AND);
  84   setTargetDAGCombine(ISD::SRA);
  85   setTargetDAGCombine(ISD::SRL);
  86   setTargetDAGCombine(ISD::SHL);
  87
  88   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  89   setTargetDAGCombine(ISD::INTRINSIC_VOID);
  90   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  91
  92   // AArch64 does not have i1 loads, or much of anything for i1 really.
  93   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  94   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
  95   setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
  96
  97   setStackPointerRegisterToSaveRestore(AArch64::XSP);
  98   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  99   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 100   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
 101
 102   // We'll lower globals to wrappers for selection.
 103   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
 104   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 105
 106   // A64 instructions have the comparison predicate attached to the user of the
 107   // result, but having a separate comparison is valuable for matching.
 108   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
 109   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
 110   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
 111   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
 112
 113   setOperationAction(ISD::SELECT, MVT::i32, Custom);
 114   setOperationAction(ISD::SELECT, MVT::i64, Custom);
 115   setOperationAction(ISD::SELECT, MVT::f32, Custom);
 116   setOperationAction(ISD::SELECT, MVT::f64, Custom);
 117
 118   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 119   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
 120   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 121   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
 122
 123   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 124
 125   setOperationAction(ISD::SETCC, MVT::i32, Custom);
 126   setOperationAction(ISD::SETCC, MVT::i64, Custom);
 127   setOperationAction(ISD::SETCC, MVT::f32, Custom);
 128   setOperationAction(ISD::SETCC, MVT::f64, Custom);
 129
 130   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 131   setOperationAction(ISD::JumpTable, MVT::i32, Custom);
 132   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 133
 134   setOperationAction(ISD::VASTART, MVT::Other, Custom);
 135   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
 136   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 137   setOperationAction(ISD::VAARG, MVT::Other, Expand);
 138
 139   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
 140   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
 141
 142   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 143   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 144
 145   setOperationAction(ISD::UREM, MVT::i32, Expand);
 146   setOperationAction(ISD::UREM, MVT::i64, Expand);
 147   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 148   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
 149
 150   setOperationAction(ISD::SREM, MVT::i32, Expand);
 151   setOperationAction(ISD::SREM, MVT::i64, Expand);
 152   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
 153   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
 154
 155   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 156   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 157   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 158   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 159
 160   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 161   setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 162
 163   // Legal floating-point operations.
 164   setOperationAction(ISD::FABS, MVT::f32, Legal);
 165   setOperationAction(ISD::FABS, MVT::f64, Legal);
 166
 167   setOperationAction(ISD::FCEIL, MVT::f32, Legal);
 168   setOperationAction(ISD::FCEIL, MVT::f64, Legal);
 169
 170   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 171   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
 172
 173   setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
 174   setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
 175
 176   setOperationAction(ISD::FNEG, MVT::f32, Legal);
 177   setOperationAction(ISD::FNEG, MVT::f64, Legal);
 178
 179   setOperationAction(ISD::FRINT, MVT::f32, Legal);
 180   setOperationAction(ISD::FRINT, MVT::f64, Legal);
 181
 182   setOperationAction(ISD::FSQRT, MVT::f32, Legal);
 183   setOperationAction(ISD::FSQRT, MVT::f64, Legal);
 184
 185   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 186   setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
 187
 188   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 189   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 190   setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
 191
 192   // Illegal floating-point operations.
 193   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 194   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 195
 196   setOperationAction(ISD::FCOS, MVT::f32, Expand);
 197   setOperationAction(ISD::FCOS, MVT::f64, Expand);
 198
 199   setOperationAction(ISD::FEXP, MVT::f32, Expand);
 200   setOperationAction(ISD::FEXP, MVT::f64, Expand);
 201
 202   setOperationAction(ISD::FEXP2, MVT::f32, Expand);
 203   setOperationAction(ISD::FEXP2, MVT::f64, Expand);
 204
 205   setOperationAction(ISD::FLOG, MVT::f32, Expand);
 206   setOperationAction(ISD::FLOG, MVT::f64, Expand);
 207
 208   setOperationAction(ISD::FLOG2, MVT::f32, Expand);
 209   setOperationAction(ISD::FLOG2, MVT::f64, Expand);
 210
 211   setOperationAction(ISD::FLOG10, MVT::f32, Expand);
 212   setOperationAction(ISD::FLOG10, MVT::f64, Expand);
 213
 214   setOperationAction(ISD::FPOW, MVT::f32, Expand);
 215   setOperationAction(ISD::FPOW, MVT::f64, Expand);
 216
 217   setOperationAction(ISD::FPOWI, MVT::f32, Expand);
 218   setOperationAction(ISD::FPOWI, MVT::f64, Expand);
 219
 220   setOperationAction(ISD::FREM, MVT::f32, Expand);
 221   setOperationAction(ISD::FREM, MVT::f64, Expand);
 222
 223   setOperationAction(ISD::FSIN, MVT::f32, Expand);
 224   setOperationAction(ISD::FSIN, MVT::f64, Expand);
 225
 226   setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 227   setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 228
 229   // Virtually no operation on f128 is legal, but LLVM can't expand them when
 230   // there's a valid register class, so we need custom operations in most cases.
 231   setOperationAction(ISD::FABS,       MVT::f128, Expand);
 232   setOperationAction(ISD::FADD,       MVT::f128, Custom);
 233   setOperationAction(ISD::FCOPYSIGN,  MVT::f128, Expand);
 234   setOperationAction(ISD::FCOS,       MVT::f128, Expand);
 235   setOperationAction(ISD::FDIV,       MVT::f128, Custom);
 236   setOperationAction(ISD::FMA,        MVT::f128, Expand);
 237   setOperationAction(ISD::FMUL,       MVT::f128, Custom);
 238   setOperationAction(ISD::FNEG,       MVT::f128, Expand);
 239   setOperationAction(ISD::FP_EXTEND,  MVT::f128, Expand);
 240   setOperationAction(ISD::FP_ROUND,   MVT::f128, Expand);
 241   setOperationAction(ISD::FPOW,       MVT::f128, Expand);
 242   setOperationAction(ISD::FREM,       MVT::f128, Expand);
 243   setOperationAction(ISD::FRINT,      MVT::f128, Expand);
 244   setOperationAction(ISD::FSIN,       MVT::f128, Expand);
 245   setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
 246   setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
 247   setOperationAction(ISD::FSUB,       MVT::f128, Custom);
 248   setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
 249   setOperationAction(ISD::SETCC,      MVT::f128, Custom);
 250   setOperationAction(ISD::BR_CC,      MVT::f128, Custom);
 251   setOperationAction(ISD::SELECT,     MVT::f128, Expand);
 252   setOperationAction(ISD::SELECT_CC,  MVT::f128, Custom);
 253   setOperationAction(ISD::FP_EXTEND,  MVT::f128, Custom);
 254
 255   // Lowering for many of the conversions is actually specified by the non-f128
 256   // type. The LowerXXX function will be trivial when f128 isn't involved.
 257   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 258   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 259   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
 260   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 261   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 262   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
 263   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 264   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 265   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
 266   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 267   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 268   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
 269   setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
 270   setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
 271
 272   // This prevents LLVM trying to compress double constants into a floating
 273   // constant-pool entry and trying to load from there. It's of doubtful benefit
 274   // for A64: we'd need LDR followed by FCVT, I believe.
 275   setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
 276   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
 277   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
 278
 279   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
 280   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
 281   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 282   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 283   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 284   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 285
 286   setExceptionPointerRegister(AArch64::X0);
 287   setExceptionSelectorRegister(AArch64::X1);
 288
 289   if (Subtarget->hasNEON()) {
 290     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand);
 291     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 292     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 293     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand);
 294     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand);
 295     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand);
 296     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 297     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand);
 298
 299     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
 300     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
 301     setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
 302     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
 303     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
 304     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
 305     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
 306     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
 307     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
 308     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
 309     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
 310     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
 311     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
 312     setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
 313     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 314
 315     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
 316     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
 317     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
 318     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
 319     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
 320     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
 321     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
 322     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
 323     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
 324     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
 325     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
 326     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 327
 328     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32, Legal);
 329     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
 330     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
 331     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
 332     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
 333     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
 334     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
 335
 336     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i8, Custom);
 337     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16, Custom);
 338     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
 339     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
 340     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
 341
 342     setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
 343     setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
 344     setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
 345     setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
 346     setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
 347     setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
 348     setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
 349     setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
 350     setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
 351     setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
 352     setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
 353     setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
 354
 355     setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
 356     setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
 357     setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
 358     setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
 359
 360     setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
 361     setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
 362     setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
 363     setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
 364
 365     setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
 366     setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
 367     setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
 368     setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
 369
 370     setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
 371     setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
 372     setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
 373     setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
 374
 375     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
 376     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
 377     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
 378     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
 379
 380     setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
 381     setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
 382     setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
 383     setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
 384
 385     setOperationAction(ISD::SINT_TO_FP, MVT::v1i8, Custom);
 386     setOperationAction(ISD::SINT_TO_FP, MVT::v1i16, Custom);
 387     setOperationAction(ISD::SINT_TO_FP, MVT::v1i32, Custom);
 388     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
 389     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
 390     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
 391
 392     setOperationAction(ISD::UINT_TO_FP, MVT::v1i8, Custom);
 393     setOperationAction(ISD::UINT_TO_FP, MVT::v1i16, Custom);
 394     setOperationAction(ISD::UINT_TO_FP, MVT::v1i32, Custom);
 395     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
 396     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
 397     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
 398
 399     setOperationAction(ISD::FP_TO_SINT, MVT::v1i8, Custom);
 400     setOperationAction(ISD::FP_TO_SINT, MVT::v1i16, Custom);
 401     setOperationAction(ISD::FP_TO_SINT, MVT::v1i32, Custom);
 402     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
 403     setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
 404     setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Custom);
 405
 406     setOperationAction(ISD::FP_TO_UINT, MVT::v1i8, Custom);
 407     setOperationAction(ISD::FP_TO_UINT, MVT::v1i16, Custom);
 408     setOperationAction(ISD::FP_TO_UINT, MVT::v1i32, Custom);
 409     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
 410     setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
 411     setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Custom);
 412
 413     // Neon does not support vector divide/remainder operations except
 414     // floating-point divide.
 415     setOperationAction(ISD::SDIV, MVT::v1i8, Expand);
 416     setOperationAction(ISD::SDIV, MVT::v8i8, Expand);
 417     setOperationAction(ISD::SDIV, MVT::v16i8, Expand);
 418     setOperationAction(ISD::SDIV, MVT::v1i16, Expand);
 419     setOperationAction(ISD::SDIV, MVT::v4i16, Expand);
 420     setOperationAction(ISD::SDIV, MVT::v8i16, Expand);
 421     setOperationAction(ISD::SDIV, MVT::v1i32, Expand);
 422     setOperationAction(ISD::SDIV, MVT::v2i32, Expand);
 423     setOperationAction(ISD::SDIV, MVT::v4i32, Expand);
 424     setOperationAction(ISD::SDIV, MVT::v1i64, Expand);
 425     setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
 426
 427     setOperationAction(ISD::UDIV, MVT::v1i8, Expand);
 428     setOperationAction(ISD::UDIV, MVT::v8i8, Expand);
 429     setOperationAction(ISD::UDIV, MVT::v16i8, Expand);
 430     setOperationAction(ISD::UDIV, MVT::v1i16, Expand);
 431     setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
 432     setOperationAction(ISD::UDIV, MVT::v8i16, Expand);
 433     setOperationAction(ISD::UDIV, MVT::v1i32, Expand);
 434     setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
 435     setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
 436     setOperationAction(ISD::UDIV, MVT::v1i64, Expand);
 437     setOperationAction(ISD::UDIV, MVT::v2i64, Expand);
 438
 439     setOperationAction(ISD::SREM, MVT::v1i8, Expand);
 440     setOperationAction(ISD::SREM, MVT::v8i8, Expand);
 441     setOperationAction(ISD::SREM, MVT::v16i8, Expand);
 442     setOperationAction(ISD::SREM, MVT::v1i16, Expand);
 443     setOperationAction(ISD::SREM, MVT::v4i16, Expand);
 444     setOperationAction(ISD::SREM, MVT::v8i16, Expand);
 445     setOperationAction(ISD::SREM, MVT::v1i32, Expand);
 446     setOperationAction(ISD::SREM, MVT::v2i32, Expand);
 447     setOperationAction(ISD::SREM, MVT::v4i32, Expand);
 448     setOperationAction(ISD::SREM, MVT::v1i64, Expand);
 449     setOperationAction(ISD::SREM, MVT::v2i64, Expand);
 450
 451     setOperationAction(ISD::UREM, MVT::v1i8, Expand);
 452     setOperationAction(ISD::UREM, MVT::v8i8, Expand);
 453     setOperationAction(ISD::UREM, MVT::v16i8, Expand);
 454     setOperationAction(ISD::UREM, MVT::v1i16, Expand);
 455     setOperationAction(ISD::UREM, MVT::v4i16, Expand);
 456     setOperationAction(ISD::UREM, MVT::v8i16, Expand);
 457     setOperationAction(ISD::UREM, MVT::v1i32, Expand);
 458     setOperationAction(ISD::UREM, MVT::v2i32, Expand);
 459     setOperationAction(ISD::UREM, MVT::v4i32, Expand);
 460     setOperationAction(ISD::UREM, MVT::v1i64, Expand);
 461     setOperationAction(ISD::UREM, MVT::v2i64, Expand);
 462
 463     setOperationAction(ISD::FREM, MVT::v2f32, Expand);
 464     setOperationAction(ISD::FREM, MVT::v4f32, Expand);
 465     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
 466     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
 467
 468     setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
 469     setOperationAction(ISD::SELECT, MVT::v16i8, Expand);
 470     setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
 471     setOperationAction(ISD::SELECT, MVT::v8i16, Expand);
 472     setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
 473     setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
 474     setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
 475     setOperationAction(ISD::SELECT, MVT::v2i64, Expand);
 476     setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
 477     setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
 478     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
 479     setOperationAction(ISD::SELECT, MVT::v2f64, Expand);
 480
 481     setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom);
 482     setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom);
 483     setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom);
 484     setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom);
 485     setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom);
 486     setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom);
 487     setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom);
 488     setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom);
 489     setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom);
 490     setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom);
 491     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom);
 492     setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom);
 493
 494     // Vector ExtLoad and TruncStore are expanded.
 495     for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
 496          I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
 497       MVT VT = (MVT::SimpleValueType) I;
 498       setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
 499       setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
 500       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
 501       for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE;
 502            II <= MVT::LAST_VECTOR_VALUETYPE; ++II) {
 503         MVT VT1 = (MVT::SimpleValueType) II;
 504         // A TruncStore has two vector types of the same number of elements
 505         // and different element sizes.
 506         if (VT.getVectorNumElements() == VT1.getVectorNumElements() &&
 507             VT.getVectorElementType().getSizeInBits()
 508                 > VT1.getVectorElementType().getSizeInBits())
 509           setTruncStoreAction(VT, VT1, Expand);
 510       }
 511     }
 512
 513     // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
 514     // FIXME: For a v2i64 multiply, we copy VPR to GPR and do 2 i64 multiplies,
 515     // and then copy back to VPR. This solution may be optimized by Following 3
 516     // NEON instructions:
 517     //        pmull  v2.1q, v0.1d, v1.1d
 518     //        pmull2 v3.1q, v0.2d, v1.2d
 519     //        ins    v2.d[1], v3.d[0]
 520     // As currently we can't verify the correctness of such assumption, we can
 521     // do such optimization in the future.
 522     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 523     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
 524
 525     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
 526     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
 527     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
 528     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
 529     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
 530     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
 531     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
 532     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
 533     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
 534   }
 535
 536   setTargetDAGCombine(ISD::SETCC);
 537   setTargetDAGCombine(ISD::SIGN_EXTEND);
 538   setTargetDAGCombine(ISD::VSELECT);
 539 }
 540
 541 EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
 542   // It's reasonably important that this value matches the "natural" legal
 543   // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
 544   // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
 545   if (!VT.isVector()) return MVT::i32;
 546   return VT.changeVectorElementTypeToInteger();
 547 }
 548
 549 static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
 550                                   unsigned &LdrOpc,
 551                                   unsigned &StrOpc) {
 552   static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
 553                                        AArch64::LDXR_word, AArch64::LDXR_dword};
 554   static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
 555                                      AArch64::LDAXR_word, AArch64::LDAXR_dword};
 556   static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
 557                                        AArch64::STXR_word, AArch64::STXR_dword};
 558   static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
 559                                      AArch64::STLXR_word, AArch64::STLXR_dword};
 560
 561   const unsigned *LoadOps, *StoreOps;
 562   if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
 563     LoadOps = LoadAcqs;
 564   else
 565     LoadOps = LoadBares;
 566
 567   if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
 568     StoreOps = StoreRels;
 569   else
 570     StoreOps = StoreBares;
 571
 572   assert(isPowerOf2_32(Size) && Size <= 8 &&
 573          "unsupported size for atomic binary op!");
 574
 575   LdrOpc = LoadOps[Log2_32(Size)];
 576   StrOpc = StoreOps[Log2_32(Size)];
 577 }
 578
 579 // FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really
 580 // have value type mapped, and they are both being defined as MVT::untyped.
 581 // Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost
 582 // would fail to figure out the register pressure correctly.
 583 std::pair<const TargetRegisterClass*, uint8_t>
 584 AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
 585   const TargetRegisterClass *RRC = 0;
 586   uint8_t Cost = 1;
 587   switch (VT.SimpleTy) {
 588   default:
 589     return TargetLowering::findRepresentativeClass(VT);
 590   case MVT::v4i64:
 591     RRC = &AArch64::QPairRegClass;
 592     Cost = 2;
 593     break;
 594   case MVT::v8i64:
 595     RRC = &AArch64::QQuadRegClass;
 596     Cost = 4;
 597     break;
 598   }
 599   return std::make_pair(RRC, Cost);
 600 }
 601
 602 MachineBasicBlock *
 603 AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 604                                         unsigned Size,
 605                                         unsigned BinOpcode) const {
 606   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
 607   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 608
 609   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 610   MachineFunction *MF = BB->getParent();
 611   MachineFunction::iterator It = BB;
 612   ++It;
 613
 614   unsigned dest = MI->getOperand(0).getReg();
 615   unsigned ptr = MI->getOperand(1).getReg();
 616   unsigned incr = MI->getOperand(2).getReg();
 617   AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
 618   DebugLoc dl = MI->getDebugLoc();
 619
 620   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 621
 622   unsigned ldrOpc, strOpc;
 623   getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
 624
 625   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 626   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 627   MF->insert(It, loopMBB);
 628   MF->insert(It, exitMBB);
 629
 630   // Transfer the remainder of BB and its successor edges to exitMBB.
 631   exitMBB->splice(exitMBB->begin(), BB,
 632                   llvm::next(MachineBasicBlock::iterator(MI)),
 633                   BB->end());
 634   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 635
 636   const TargetRegisterClass *TRC
 637     = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
 638   unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
 639
 640   //  thisMBB:
 641   //   ...
 642   //   fallthrough --> loopMBB
 643   BB->addSuccessor(loopMBB);
 644
 645   //  loopMBB:
 646   //   ldxr dest, ptr
 647   //   <binop> scratch, dest, incr
 648   //   stxr stxr_status, scratch, ptr
 649   //   cbnz stxr_status, loopMBB
 650   //   fallthrough --> exitMBB
 651   BB = loopMBB;
 652   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
 653   if (BinOpcode) {
 654     // All arithmetic operations we'll be creating are designed to take an extra
 655     // shift or extend operand, which we can conveniently set to zero.
 656
 657     // Operand order needs to go the other way for NAND.
 658     if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
 659       BuildMI(BB, dl, TII->get(BinOpcode), scratch)
 660         .addReg(incr).addReg(dest).addImm(0);
 661     else
 662       BuildMI(BB, dl, TII->get(BinOpcode), scratch)
 663         .addReg(dest).addReg(incr).addImm(0);
 664   }
 665
 666   // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
 667   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
 668   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 669
 670   BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
 671   BuildMI(BB, dl, TII->get(AArch64::CBNZw))
 672     .addReg(stxr_status).addMBB(loopMBB);
 673
 674   BB->addSuccessor(loopMBB);
 675   BB->addSuccessor(exitMBB);
 676
 677   //  exitMBB:
 678   //   ...
 679   BB = exitMBB;
 680
 681   MI->eraseFromParent();   // The instruction is gone now.
 682
 683   return BB;
 684 }
 685
 686 MachineBasicBlock *
 687 AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
 688                                               MachineBasicBlock *BB,
 689                                               unsigned Size,
 690                                               unsigned CmpOp,
 691                                               A64CC::CondCodes Cond) const {
 692   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 693
 694   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 695   MachineFunction *MF = BB->getParent();
 696   MachineFunction::iterator It = BB;
 697   ++It;
 698
 699   unsigned dest = MI->getOperand(0).getReg();
 700   unsigned ptr = MI->getOperand(1).getReg();
 701   unsigned incr = MI->getOperand(2).getReg();
 702   AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
 703
 704   unsigned oldval = dest;
 705   DebugLoc dl = MI->getDebugLoc();
 706
 707   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 708   const TargetRegisterClass *TRC, *TRCsp;
 709   if (Size == 8) {
 710     TRC = &AArch64::GPR64RegClass;
 711     TRCsp = &AArch64::GPR64xspRegClass;
 712   } else {
 713     TRC = &AArch64::GPR32RegClass;
 714     TRCsp = &AArch64::GPR32wspRegClass;
 715   }
 716
 717   unsigned ldrOpc, strOpc;
 718   getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
 719
 720   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 721   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 722   MF->insert(It, loopMBB);
 723   MF->insert(It, exitMBB);
 724
 725   // Transfer the remainder of BB and its successor edges to exitMBB.
 726   exitMBB->splice(exitMBB->begin(), BB,
 727                   llvm::next(MachineBasicBlock::iterator(MI)),
 728                   BB->end());
 729   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 730
 731   unsigned scratch = MRI.createVirtualRegister(TRC);
 732   MRI.constrainRegClass(scratch, TRCsp);
 733
 734   //  thisMBB:
 735   //   ...
 736   //   fallthrough --> loopMBB
 737   BB->addSuccessor(loopMBB);
 738
 739   //  loopMBB:
 740   //   ldxr dest, ptr
 741   //   cmp incr, dest (, sign extend if necessary)
 742   //   csel scratch, dest, incr, cond
 743   //   stxr stxr_status, scratch, ptr
 744   //   cbnz stxr_status, loopMBB
 745   //   fallthrough --> exitMBB
 746   BB = loopMBB;
 747   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
 748
 749   // Build compare and cmov instructions.
 750   MRI.constrainRegClass(incr, TRCsp);
 751   BuildMI(BB, dl, TII->get(CmpOp))
 752     .addReg(incr).addReg(oldval).addImm(0);
 753
 754   BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
 755           scratch)
 756     .addReg(oldval).addReg(incr).addImm(Cond);
 757
 758   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
 759   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 760
 761   BuildMI(BB, dl, TII->get(strOpc), stxr_status)
 762     .addReg(scratch).addReg(ptr);
 763   BuildMI(BB, dl, TII->get(AArch64::CBNZw))
 764     .addReg(stxr_status).addMBB(loopMBB);
 765
 766   BB->addSuccessor(loopMBB);
 767   BB->addSuccessor(exitMBB);
 768
 769   //  exitMBB:
 770   //   ...
 771   BB = exitMBB;
 772
 773   MI->eraseFromParent();   // The instruction is gone now.
 774
 775   return BB;
 776 }
 777
 778 MachineBasicBlock *
 779 AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
 780                                          MachineBasicBlock *BB,
 781                                          unsigned Size) const {
 782   unsigned dest    = MI->getOperand(0).getReg();
 783   unsigned ptr     = MI->getOperand(1).getReg();
 784   unsigned oldval  = MI->getOperand(2).getReg();
 785   unsigned newval  = MI->getOperand(3).getReg();
 786   AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
 787   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 788   DebugLoc dl = MI->getDebugLoc();
 789
 790   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 791   const TargetRegisterClass *TRCsp;
 792   TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
 793
 794   unsigned ldrOpc, strOpc;
 795   getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
 796
 797   MachineFunction *MF = BB->getParent();
 798   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 799   MachineFunction::iterator It = BB;
 800   ++It; // insert the new blocks after the current block
 801
 802   MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
 803   MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
 804   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 805   MF->insert(It, loop1MBB);
 806   MF->insert(It, loop2MBB);
 807   MF->insert(It, exitMBB);
 808
 809   // Transfer the remainder of BB and its successor edges to exitMBB.
 810   exitMBB->splice(exitMBB->begin(), BB,
 811                   llvm::next(MachineBasicBlock::iterator(MI)),
 812                   BB->end());
 813   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 814
 815   //  thisMBB:
 816   //   ...
 817   //   fallthrough --> loop1MBB
 818   BB->addSuccessor(loop1MBB);
 819
 820   // loop1MBB:
 821   //   ldxr dest, [ptr]
 822   //   cmp dest, oldval
 823   //   b.ne exitMBB
 824   BB = loop1MBB;
 825   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
 826
 827   unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
 828   MRI.constrainRegClass(dest, TRCsp);
 829   BuildMI(BB, dl, TII->get(CmpOp))
 830     .addReg(dest).addReg(oldval).addImm(0);
 831   BuildMI(BB, dl, TII->get(AArch64::Bcc))
 832     .addImm(A64CC::NE).addMBB(exitMBB);
 833   BB->addSuccessor(loop2MBB);
 834   BB->addSuccessor(exitMBB);
 835
 836   // loop2MBB:
 837   //   strex stxr_status, newval, [ptr]
 838   //   cbnz stxr_status, loop1MBB
 839   BB = loop2MBB;
 840   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
 841   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 842
 843   BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
 844   BuildMI(BB, dl, TII->get(AArch64::CBNZw))
 845     .addReg(stxr_status).addMBB(loop1MBB);
 846   BB->addSuccessor(loop1MBB);
 847   BB->addSuccessor(exitMBB);
 848
 849   //  exitMBB:
 850   //   ...
 851   BB = exitMBB;
 852
 853   MI->eraseFromParent();   // The instruction is gone now.
 854
 855   return BB;
 856 }
 857
 858 MachineBasicBlock *
 859 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
 860                                     MachineBasicBlock *MBB) const {
 861   // We materialise the F128CSEL pseudo-instruction using conditional branches
 862   // and loads, giving an instruciton sequence like:
 863   //     str q0, [sp]
 864   //     b.ne IfTrue
 865   //     b Finish
 866   // IfTrue:
 867   //     str q1, [sp]
 868   // Finish:
 869   //     ldr q0, [sp]
 870   //
 871   // Using virtual registers would probably not be beneficial since COPY
 872   // instructions are expensive for f128 (there's no actual instruction to
 873   // implement them).
 874   //
 875   // An alternative would be to do an integer-CSEL on some address. E.g.:
 876   //     mov x0, sp
 877   //     add x1, sp, #16
 878   //     str q0, [x0]
 879   //     str q1, [x1]
 880   //     csel x0, x0, x1, ne
 881   //     ldr q0, [x0]
 882   //
 883   // It's unclear which approach is actually optimal.
 884   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 885   MachineFunction *MF = MBB->getParent();
 886   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
 887   DebugLoc DL = MI->getDebugLoc();
 888   MachineFunction::iterator It = MBB;
 889   ++It;
 890
 891   unsigned DestReg = MI->getOperand(0).getReg();
 892   unsigned IfTrueReg = MI->getOperand(1).getReg();
 893   unsigned IfFalseReg = MI->getOperand(2).getReg();
 894   unsigned CondCode = MI->getOperand(3).getImm();
 895   bool NZCVKilled = MI->getOperand(4).isKill();
 896
 897   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
 898   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
 899   MF->insert(It, TrueBB);
 900   MF->insert(It, EndBB);
 901
 902   // Transfer rest of current basic-block to EndBB
 903   EndBB->splice(EndBB->begin(), MBB,
 904                 llvm::next(MachineBasicBlock::iterator(MI)),
 905                 MBB->end());
 906   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
 907
 908   // We need somewhere to store the f128 value needed.
 909   int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
 910
 911   //     [... start of incoming MBB ...]
 912   //     str qIFFALSE, [sp]
 913   //     b.cc IfTrue
 914   //     b Done
 915   BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
 916     .addReg(IfFalseReg)
 917     .addFrameIndex(ScratchFI)
 918     .addImm(0);
 919   BuildMI(MBB, DL, TII->get(AArch64::Bcc))
 920     .addImm(CondCode)
 921     .addMBB(TrueBB);
 922   BuildMI(MBB, DL, TII->get(AArch64::Bimm))
 923     .addMBB(EndBB);
 924   MBB->addSuccessor(TrueBB);
 925   MBB->addSuccessor(EndBB);
 926
 927   if (!NZCVKilled) {
 928     // NZCV is live-through TrueBB.
 929     TrueBB->addLiveIn(AArch64::NZCV);
 930     EndBB->addLiveIn(AArch64::NZCV);
 931   }
 932
 933   // IfTrue:
 934   //     str qIFTRUE, [sp]
 935   BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
 936     .addReg(IfTrueReg)
 937     .addFrameIndex(ScratchFI)
 938     .addImm(0);
 939
 940   // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
 941   // blocks.
 942   TrueBB->addSuccessor(EndBB);
 943
 944   // Done:
 945   //     ldr qDEST, [sp]
 946   //     [... rest of incoming MBB ...]
 947   MachineInstr *StartOfEnd = EndBB->begin();
 948   BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
 949     .addFrameIndex(ScratchFI)
 950     .addImm(0);
 951
 952   MI->eraseFromParent();
 953   return EndBB;
 954 }
 955
 956 MachineBasicBlock *
 957 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 958                                                  MachineBasicBlock *MBB) const {
 959   switch (MI->getOpcode()) {
 960   default: llvm_unreachable("Unhandled instruction with custom inserter");
 961   case AArch64::F128CSEL:
 962     return EmitF128CSEL(MI, MBB);
 963   case AArch64::ATOMIC_LOAD_ADD_I8:
 964     return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
 965   case AArch64::ATOMIC_LOAD_ADD_I16:
 966     return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
 967   case AArch64::ATOMIC_LOAD_ADD_I32:
 968     return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
 969   case AArch64::ATOMIC_LOAD_ADD_I64:
 970     return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
 971
 972   case AArch64::ATOMIC_LOAD_SUB_I8:
 973     return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
 974   case AArch64::ATOMIC_LOAD_SUB_I16:
 975     return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
 976   case AArch64::ATOMIC_LOAD_SUB_I32:
 977     return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
 978   case AArch64::ATOMIC_LOAD_SUB_I64:
 979     return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
 980
 981   case AArch64::ATOMIC_LOAD_AND_I8:
 982     return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
 983   case AArch64::ATOMIC_LOAD_AND_I16:
 984     return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
 985   case AArch64::ATOMIC_LOAD_AND_I32:
 986     return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
 987   case AArch64::ATOMIC_LOAD_AND_I64:
 988     return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
 989
 990   case AArch64::ATOMIC_LOAD_OR_I8:
 991     return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
 992   case AArch64::ATOMIC_LOAD_OR_I16:
 993     return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
 994   case AArch64::ATOMIC_LOAD_OR_I32:
 995     return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
 996   case AArch64::ATOMIC_LOAD_OR_I64:
 997     return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
 998
 999   case AArch64::ATOMIC_LOAD_XOR_I8:
1000     return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
1001   case AArch64::ATOMIC_LOAD_XOR_I16:
1002     return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
1003   case AArch64::ATOMIC_LOAD_XOR_I32:
1004     return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
1005   case AArch64::ATOMIC_LOAD_XOR_I64:
1006     return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
1007
1008   case AArch64::ATOMIC_LOAD_NAND_I8:
1009     return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
1010   case AArch64::ATOMIC_LOAD_NAND_I16:
1011     return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
1012   case AArch64::ATOMIC_LOAD_NAND_I32:
1013     return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
1014   case AArch64::ATOMIC_LOAD_NAND_I64:
1015     return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
1016
1017   case AArch64::ATOMIC_LOAD_MIN_I8:
1018     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
1019   case AArch64::ATOMIC_LOAD_MIN_I16:
1020     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
1021   case AArch64::ATOMIC_LOAD_MIN_I32:
1022     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
1023   case AArch64::ATOMIC_LOAD_MIN_I64:
1024     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
1025
1026   case AArch64::ATOMIC_LOAD_MAX_I8:
1027     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
1028   case AArch64::ATOMIC_LOAD_MAX_I16:
1029     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
1030   case AArch64::ATOMIC_LOAD_MAX_I32:
1031     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
1032   case AArch64::ATOMIC_LOAD_MAX_I64:
1033     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
1034
1035   case AArch64::ATOMIC_LOAD_UMIN_I8:
1036     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
1037   case AArch64::ATOMIC_LOAD_UMIN_I16:
1038     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
1039   case AArch64::ATOMIC_LOAD_UMIN_I32:
1040     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
1041   case AArch64::ATOMIC_LOAD_UMIN_I64:
1042     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
1043
1044   case AArch64::ATOMIC_LOAD_UMAX_I8:
1045     return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
1046   case AArch64::ATOMIC_LOAD_UMAX_I16:
1047     return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
1048   case AArch64::ATOMIC_LOAD_UMAX_I32:
1049     return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
1050   case AArch64::ATOMIC_LOAD_UMAX_I64:
1051     return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
1052
1053   case AArch64::ATOMIC_SWAP_I8:
1054     return emitAtomicBinary(MI, MBB, 1, 0);
1055   case AArch64::ATOMIC_SWAP_I16:
1056     return emitAtomicBinary(MI, MBB, 2, 0);
1057   case AArch64::ATOMIC_SWAP_I32:
1058     return emitAtomicBinary(MI, MBB, 4, 0);
1059   case AArch64::ATOMIC_SWAP_I64:
1060     return emitAtomicBinary(MI, MBB, 8, 0);
1061
1062   case AArch64::ATOMIC_CMP_SWAP_I8:
1063     return emitAtomicCmpSwap(MI, MBB, 1);
1064   case AArch64::ATOMIC_CMP_SWAP_I16:
1065     return emitAtomicCmpSwap(MI, MBB, 2);
1066   case AArch64::ATOMIC_CMP_SWAP_I32:
1067     return emitAtomicCmpSwap(MI, MBB, 4);
1068   case AArch64::ATOMIC_CMP_SWAP_I64:
1069     return emitAtomicCmpSwap(MI, MBB, 8);
1070   }
1071 }
1072
1073
1074 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1075   switch (Opcode) {
1076   case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
1077   case AArch64ISD::Call:           return "AArch64ISD::Call";
1078   case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
1079   case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
1080   case AArch64ISD::BFI:            return "AArch64ISD::BFI";
1081   case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
1082   case AArch64ISD::Ret:            return "AArch64ISD::Ret";
1083   case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
1084   case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
1085   case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
1086   case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
1087   case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1088   case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
1089   case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
1090   case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
1091
1092   case AArch64ISD::NEON_MOVIMM:
1093     return "AArch64ISD::NEON_MOVIMM";
1094   case AArch64ISD::NEON_MVNIMM:
1095     return "AArch64ISD::NEON_MVNIMM";
1096   case AArch64ISD::NEON_FMOVIMM:
1097     return "AArch64ISD::NEON_FMOVIMM";
1098   case AArch64ISD::NEON_CMP:
1099     return "AArch64ISD::NEON_CMP";
1100   case AArch64ISD::NEON_CMPZ:
1101     return "AArch64ISD::NEON_CMPZ";
1102   case AArch64ISD::NEON_TST:
1103     return "AArch64ISD::NEON_TST";
1104   case AArch64ISD::NEON_QSHLs:
1105     return "AArch64ISD::NEON_QSHLs";
1106   case AArch64ISD::NEON_QSHLu:
1107     return "AArch64ISD::NEON_QSHLu";
1108   case AArch64ISD::NEON_VDUP:
1109     return "AArch64ISD::NEON_VDUP";
1110   case AArch64ISD::NEON_VDUPLANE:
1111     return "AArch64ISD::NEON_VDUPLANE";
1112   case AArch64ISD::NEON_REV16:
1113     return "AArch64ISD::NEON_REV16";
1114   case AArch64ISD::NEON_REV32:
1115     return "AArch64ISD::NEON_REV32";
1116   case AArch64ISD::NEON_REV64:
1117     return "AArch64ISD::NEON_REV64";
1118   case AArch64ISD::NEON_UZP1:
1119     return "AArch64ISD::NEON_UZP1";
1120   case AArch64ISD::NEON_UZP2:
1121     return "AArch64ISD::NEON_UZP2";
1122   case AArch64ISD::NEON_ZIP1:
1123     return "AArch64ISD::NEON_ZIP1";
1124   case AArch64ISD::NEON_ZIP2:
1125     return "AArch64ISD::NEON_ZIP2";
1126   case AArch64ISD::NEON_TRN1:
1127     return "AArch64ISD::NEON_TRN1";
1128   case AArch64ISD::NEON_TRN2:
1129     return "AArch64ISD::NEON_TRN2";
1130   case AArch64ISD::NEON_LD1_UPD:
1131     return "AArch64ISD::NEON_LD1_UPD";
1132   case AArch64ISD::NEON_LD2_UPD:
1133     return "AArch64ISD::NEON_LD2_UPD";
1134   case AArch64ISD::NEON_LD3_UPD:
1135     return "AArch64ISD::NEON_LD3_UPD";
1136   case AArch64ISD::NEON_LD4_UPD:
1137     return "AArch64ISD::NEON_LD4_UPD";
1138   case AArch64ISD::NEON_ST1_UPD:
1139     return "AArch64ISD::NEON_ST1_UPD";
1140   case AArch64ISD::NEON_ST2_UPD:
1141     return "AArch64ISD::NEON_ST2_UPD";
1142   case AArch64ISD::NEON_ST3_UPD:
1143     return "AArch64ISD::NEON_ST3_UPD";
1144   case AArch64ISD::NEON_ST4_UPD:
1145     return "AArch64ISD::NEON_ST4_UPD";
1146   case AArch64ISD::NEON_LD1x2_UPD:
1147     return "AArch64ISD::NEON_LD1x2_UPD";
1148   case AArch64ISD::NEON_LD1x3_UPD:
1149     return "AArch64ISD::NEON_LD1x3_UPD";
1150   case AArch64ISD::NEON_LD1x4_UPD:
1151     return "AArch64ISD::NEON_LD1x4_UPD";
1152   case AArch64ISD::NEON_ST1x2_UPD:
1153     return "AArch64ISD::NEON_ST1x2_UPD";
1154   case AArch64ISD::NEON_ST1x3_UPD:
1155     return "AArch64ISD::NEON_ST1x3_UPD";
1156   case AArch64ISD::NEON_ST1x4_UPD:
1157     return "AArch64ISD::NEON_ST1x4_UPD";
1158   case AArch64ISD::NEON_LD2DUP:
1159     return "AArch64ISD::NEON_LD2DUP";
1160   case AArch64ISD::NEON_LD3DUP:
1161     return "AArch64ISD::NEON_LD3DUP";
1162   case AArch64ISD::NEON_LD4DUP:
1163     return "AArch64ISD::NEON_LD4DUP";
1164   case AArch64ISD::NEON_LD2DUP_UPD:
1165     return "AArch64ISD::NEON_LD2DUP_UPD";
1166   case AArch64ISD::NEON_LD3DUP_UPD:
1167     return "AArch64ISD::NEON_LD3DUP_UPD";
1168   case AArch64ISD::NEON_LD4DUP_UPD:
1169     return "AArch64ISD::NEON_LD4DUP_UPD";
1170   case AArch64ISD::NEON_LD2LN_UPD:
1171     return "AArch64ISD::NEON_LD2LN_UPD";
1172   case AArch64ISD::NEON_LD3LN_UPD:
1173     return "AArch64ISD::NEON_LD3LN_UPD";
1174   case AArch64ISD::NEON_LD4LN_UPD:
1175     return "AArch64ISD::NEON_LD4LN_UPD";
1176   case AArch64ISD::NEON_ST2LN_UPD:
1177     return "AArch64ISD::NEON_ST2LN_UPD";
1178   case AArch64ISD::NEON_ST3LN_UPD:
1179     return "AArch64ISD::NEON_ST3LN_UPD";
1180   case AArch64ISD::NEON_ST4LN_UPD:
1181     return "AArch64ISD::NEON_ST4LN_UPD";
1182   case AArch64ISD::NEON_VEXTRACT:
1183     return "AArch64ISD::NEON_VEXTRACT";
1184   default:
1185     return NULL;
1186   }
1187 }
1188
1189 static const uint16_t AArch64FPRArgRegs[] = {
1190   AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
1191   AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
1192 };
1193 static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);
1194
1195 static const uint16_t AArch64ArgRegs[] = {
1196   AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
1197   AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
1198 };
1199 static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
1200
1201 static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
1202                                  CCValAssign::LocInfo LocInfo,
1203                                  ISD::ArgFlagsTy ArgFlags, CCState &State) {
1204   // Mark all remaining general purpose registers as allocated. We don't
1205   // backtrack: if (for example) an i128 gets put on the stack, no subsequent
1206   // i64 will go in registers (C.11).
1207   for (unsigned i = 0; i < NumArgRegs; ++i)
1208     State.AllocateReg(AArch64ArgRegs[i]);
1209
1210   return false;
1211 }
1212
1213 #include "AArch64GenCallingConv.inc"
1214
1215 CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1216
1217   switch(CC) {
1218   default: llvm_unreachable("Unsupported calling convention");
1219   case CallingConv::Fast:
1220   case CallingConv::C:
1221     return CC_A64_APCS;
1222   }
1223 }
1224
1225 void
1226 AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
1227                                            SDLoc DL, SDValue &Chain) const {
1228   MachineFunction &MF = DAG.getMachineFunction();
1229   MachineFrameInfo *MFI = MF.getFrameInfo();
1230   AArch64MachineFunctionInfo *FuncInfo
1231     = MF.getInfo<AArch64MachineFunctionInfo>();
1232
1233   SmallVector<SDValue, 8> MemOps;
1234
1235   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
1236                                                          NumArgRegs);
1237   unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
1238                                                          NumFPRArgRegs);
1239
1240   unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
1241   int GPRIdx = 0;
1242   if (GPRSaveSize != 0) {
1243     GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
1244
1245     SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
1246
1247     for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
1248       unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
1249       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
1250       SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1251                                    MachinePointerInfo::getStack(i * 8),
1252                                    false, false, 0);
1253       MemOps.push_back(Store);
1254       FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1255                         DAG.getConstant(8, getPointerTy()));
1256     }
1257   }
1258
1259   if (getSubtarget()->hasFPARMv8()) {
1260   unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
1261   int FPRIdx = 0;
1262     // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
1263     // can omit a register save area if we know we'll never use registers of
1264     // that class.
1265     if (FPRSaveSize != 0) {
1266       FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
1267
1268       SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
1269
1270       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
1271         unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
1272             &AArch64::FPR128RegClass);
1273         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
1274         SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1275             MachinePointerInfo::getStack(i * 16),
1276             false, false, 0);
1277         MemOps.push_back(Store);
1278         FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1279             DAG.getConstant(16, getPointerTy()));
1280       }
1281     }
1282     FuncInfo->setVariadicFPRIdx(FPRIdx);
1283     FuncInfo->setVariadicFPRSize(FPRSaveSize);
1284   }
1285
1286   unsigned StackOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), 8);
1287   int StackIdx = MFI->CreateFixedObject(8, StackOffset, true);
1288
1289   FuncInfo->setVariadicStackIdx(StackIdx);
1290   FuncInfo->setVariadicGPRIdx(GPRIdx);
1291   FuncInfo->setVariadicGPRSize(GPRSaveSize);
1292
1293   if (!MemOps.empty()) {
1294     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
1295                         MemOps.size());
1296   }
1297 }
1298
1299
1300 SDValue
1301 AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
1302                                       CallingConv::ID CallConv, bool isVarArg,
1303                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1304                                       SDLoc dl, SelectionDAG &DAG,
1305                                       SmallVectorImpl<SDValue> &InVals) const {
1306   MachineFunction &MF = DAG.getMachineFunction();
1307   AArch64MachineFunctionInfo *FuncInfo
1308     = MF.getInfo<AArch64MachineFunctionInfo>();
1309   MachineFrameInfo *MFI = MF.getFrameInfo();
1310   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1311
1312   SmallVector<CCValAssign, 16> ArgLocs;
1313   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1314                  getTargetMachine(), ArgLocs, *DAG.getContext());
1315   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1316
1317   SmallVector<SDValue, 16> ArgValues;
1318
1319   SDValue ArgValue;
1320   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1321     CCValAssign &VA = ArgLocs[i];
1322     ISD::ArgFlagsTy Flags = Ins[i].Flags;
1323
1324     if (Flags.isByVal()) {
1325       // Byval is used for small structs and HFAs in the PCS, but the system
1326       // should work in a non-compliant manner for larger structs.
1327       EVT PtrTy = getPointerTy();
1328       int Size = Flags.getByValSize();
1329       unsigned NumRegs = (Size + 7) / 8;
1330
1331       unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
1332                                                  VA.getLocMemOffset(),
1333                                                  false);
1334       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
1335       InVals.push_back(FrameIdxN);
1336
1337       continue;
1338     } else if (VA.isRegLoc()) {
1339       MVT RegVT = VA.getLocVT();
1340       const TargetRegisterClass *RC = getRegClassFor(RegVT);
1341       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1342
1343       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1344     } else { // VA.isRegLoc()
1345       assert(VA.isMemLoc());
1346
1347       int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
1348                                       VA.getLocMemOffset(), true);
1349
1350       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1351       ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
1352                              MachinePointerInfo::getFixedStack(FI),
1353                              false, false, false, 0);
1354
1355
1356     }
1357
1358     switch (VA.getLocInfo()) {
1359     default: llvm_unreachable("Unknown loc info!");
1360     case CCValAssign::Full: break;
1361     case CCValAssign::BCvt:
1362       ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
1363       break;
1364     case CCValAssign::SExt:
1365     case CCValAssign::ZExt:
1366     case CCValAssign::AExt:
1367     case CCValAssign::FPExt: {
1368       unsigned DestSize = VA.getValVT().getSizeInBits();
1369       unsigned DestSubReg;
1370
1371       switch (DestSize) {
1372       case 8: DestSubReg = AArch64::sub_8; break;
1373       case 16: DestSubReg = AArch64::sub_16; break;
1374       case 32: DestSubReg = AArch64::sub_32; break;
1375       case 64: DestSubReg = AArch64::sub_64; break;
1376       default: llvm_unreachable("Unexpected argument promotion");
1377       }
1378
1379       ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
1380                                    VA.getValVT(), ArgValue,
1381                                    DAG.getTargetConstant(DestSubReg, MVT::i32)),
1382                          0);
1383       break;
1384     }
1385     }
1386
1387     InVals.push_back(ArgValue);
1388   }
1389
1390   if (isVarArg)
1391     SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
1392
1393   unsigned StackArgSize = CCInfo.getNextStackOffset();
1394   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
1395     // This is a non-standard ABI so by fiat I say we're allowed to make full
1396     // use of the stack area to be popped, which must be aligned to 16 bytes in
1397     // any case:
1398     StackArgSize = RoundUpToAlignment(StackArgSize, 16);
1399
1400     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
1401     // a multiple of 16.
1402     FuncInfo->setArgumentStackToRestore(StackArgSize);
1403
1404     // This realignment carries over to the available bytes below. Our own
1405     // callers will guarantee the space is free by giving an aligned value to
1406     // CALLSEQ_START.
1407   }
1408   // Even if we're not expected to free up the space, it's useful to know how
1409   // much is there while considering tail calls (because we can reuse it).
1410   FuncInfo->setBytesInStackArgArea(StackArgSize);
1411
1412   return Chain;
1413 }
1414
1415 SDValue
1416 AArch64TargetLowering::LowerReturn(SDValue Chain,
1417                                    CallingConv::ID CallConv, bool isVarArg,
1418                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1419                                    const SmallVectorImpl<SDValue> &OutVals,
1420                                    SDLoc dl, SelectionDAG &DAG) const {
1421   // CCValAssign - represent the assignment of the return value to a location.
1422   SmallVector<CCValAssign, 16> RVLocs;
1423
1424   // CCState - Info about the registers and stack slots.
1425   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1426                  getTargetMachine(), RVLocs, *DAG.getContext());
1427
1428   // Analyze outgoing return values.
1429   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
1430
1431   SDValue Flag;
1432   SmallVector<SDValue, 4> RetOps(1, Chain);
1433
1434   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1435     // PCS: "If the type, T, of the result of a function is such that
1436     // void func(T arg) would require that arg be passed as a value in a
1437     // register (or set of registers) according to the rules in 5.4, then the
1438     // result is returned in the same registers as would be used for such an
1439     // argument.
1440     //
1441     // Otherwise, the caller shall reserve a block of memory of sufficient
1442     // size and alignment to hold the result. The address of the memory block
1443     // shall be passed as an additional argument to the function in x8."
1444     //
1445     // This is implemented in two places. The register-return values are dealt
1446     // with here, more complex returns are passed as an sret parameter, which
1447     // means we don't have to worry about it during actual return.
1448     CCValAssign &VA = RVLocs[i];
1449     assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
1450
1451
1452     SDValue Arg = OutVals[i];
1453
1454     // There's no convenient note in the ABI about this as there is for normal
1455     // arguments, but it says return values are passed in the same registers as
1456     // an argument would be. I believe that includes the comments about
1457     // unspecified higher bits, putting the burden of widening on the *caller*
1458     // for return values.
1459     switch (VA.getLocInfo()) {
1460     default: llvm_unreachable("Unknown loc info");
1461     case CCValAssign::Full: break;
1462     case CCValAssign::SExt:
1463     case CCValAssign::ZExt:
1464     case CCValAssign::AExt:
1465       // Floating-point values should only be extended when they're going into
1466       // memory, which can't happen here so an integer extend is acceptable.
1467       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1468       break;
1469     case CCValAssign::BCvt:
1470       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1471       break;
1472     }
1473
1474     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
1475     Flag = Chain.getValue(1);
1476     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1477   }
1478
1479   RetOps[0] = Chain;  // Update chain.
1480
1481   // Add the flag if we have it.
1482   if (Flag.getNode())
1483     RetOps.push_back(Flag);
1484
1485   return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
1486                      &RetOps[0], RetOps.size());
1487 }
1488
1489 unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const {
1490   // This is a new backend. For anything more precise than this a FE should
1491   // set an explicit alignment.
1492   return 4;
1493 }
1494
1495 SDValue
1496 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
1497                                  SmallVectorImpl<SDValue> &InVals) const {
1498   SelectionDAG &DAG                     = CLI.DAG;
1499   SDLoc &dl                             = CLI.DL;
1500   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1501   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1502   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1503   SDValue Chain                         = CLI.Chain;
1504   SDValue Callee                        = CLI.Callee;
1505   bool &IsTailCall                      = CLI.IsTailCall;
1506   CallingConv::ID CallConv              = CLI.CallConv;
1507   bool IsVarArg                         = CLI.IsVarArg;
1508
1509   MachineFunction &MF = DAG.getMachineFunction();
1510   AArch64MachineFunctionInfo *FuncInfo
1511     = MF.getInfo<AArch64MachineFunctionInfo>();
1512   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1513   bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
1514   bool IsSibCall = false;
1515
1516   if (IsTailCall) {
1517     IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1518                     IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1519                                                    Outs, OutVals, Ins, DAG);
1520
1521     // A sibling call is one where we're under the usual C ABI and not planning
1522     // to change that but can still do a tail call:
1523     if (!TailCallOpt && IsTailCall)
1524       IsSibCall = true;
1525   }
1526
1527   SmallVector<CCValAssign, 16> ArgLocs;
1528   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1529                  getTargetMachine(), ArgLocs, *DAG.getContext());
1530   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1531
1532   // On AArch64 (and all other architectures I'm aware of) the most this has to
1533   // do is adjust the stack pointer.
1534   unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
1535   if (IsSibCall) {
1536     // Since we're not changing the ABI to make this a tail call, the memory
1537     // operands are already available in the caller's incoming argument space.
1538     NumBytes = 0;
1539   }
1540
1541   // FPDiff is the byte offset of the call's argument area from the callee's.
1542   // Stores to callee stack arguments will be placed in FixedStackSlots offset
1543   // by this amount for a tail call. In a sibling call it must be 0 because the
1544   // caller will deallocate the entire stack and the callee still expects its
1545   // arguments to begin at SP+0. Completely unused for non-tail calls.
1546   int FPDiff = 0;
1547
1548   if (IsTailCall && !IsSibCall) {
1549     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1550
1551     // FPDiff will be negative if this tail call requires more space than we
1552     // would automatically have in our incoming argument space. Positive if we
1553     // can actually shrink the stack.
1554     FPDiff = NumReusableBytes - NumBytes;
1555
1556     // The stack pointer must be 16-byte aligned at all times it's used for a
1557     // memory operation, which in practice means at *all* times and in
1558     // particular across call boundaries. Therefore our own arguments started at
1559     // a 16-byte aligned SP and the delta applied for the tail call should
1560     // satisfy the same constraint.
1561     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
1562   }
1563
1564   if (!IsSibCall)
1565     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
1566                                  dl);
1567
1568   SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
1569                                         getPointerTy());
1570
1571   SmallVector<SDValue, 8> MemOpChains;
1572   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1573
1574   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1575     CCValAssign &VA = ArgLocs[i];
1576     ISD::ArgFlagsTy Flags = Outs[i].Flags;
1577     SDValue Arg = OutVals[i];
1578
1579     // Callee does the actual widening, so all extensions just use an implicit
1580     // definition of the rest of the Loc. Aesthetically, this would be nicer as
1581     // an ANY_EXTEND, but that isn't valid for floating-point types and this
1582     // alternative works on integer types too.
1583     switch (VA.getLocInfo()) {
1584     default: llvm_unreachable("Unknown loc info!");
1585     case CCValAssign::Full: break;
1586     case CCValAssign::SExt:
1587     case CCValAssign::ZExt:
1588     case CCValAssign::AExt:
1589     case CCValAssign::FPExt: {
1590       unsigned SrcSize = VA.getValVT().getSizeInBits();
1591       unsigned SrcSubReg;
1592
1593       switch (SrcSize) {
1594       case 8: SrcSubReg = AArch64::sub_8; break;
1595       case 16: SrcSubReg = AArch64::sub_16; break;
1596       case 32: SrcSubReg = AArch64::sub_32; break;
1597       case 64: SrcSubReg = AArch64::sub_64; break;
1598       default: llvm_unreachable("Unexpected argument promotion");
1599       }
1600
1601       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
1602                                     VA.getLocVT(),
1603                                     DAG.getUNDEF(VA.getLocVT()),
1604                                     Arg,
1605                                     DAG.getTargetConstant(SrcSubReg, MVT::i32)),
1606                     0);
1607
1608       break;
1609     }
1610     case CCValAssign::BCvt:
1611       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1612       break;
1613     }
1614
1615     if (VA.isRegLoc()) {
1616       // A normal register (sub-) argument. For now we just note it down because
1617       // we want to copy things into registers as late as possible to avoid
1618       // register-pressure (and possibly worse).
1619       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1620       continue;
1621     }
1622
1623     assert(VA.isMemLoc() && "unexpected argument location");
1624
1625     SDValue DstAddr;
1626     MachinePointerInfo DstInfo;
1627     if (IsTailCall) {
1628       uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
1629                                           VA.getLocVT().getSizeInBits();
1630       OpSize = (OpSize + 7) / 8;
1631       int32_t Offset = VA.getLocMemOffset() + FPDiff;
1632       int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
1633
1634       DstAddr = DAG.getFrameIndex(FI, getPointerTy());
1635       DstInfo = MachinePointerInfo::getFixedStack(FI);
1636
1637       // Make sure any stack arguments overlapping with where we're storing are
1638       // loaded before this eventual operation. Otherwise they'll be clobbered.
1639       Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
1640     } else {
1641       SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());
1642
1643       DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1644       DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
1645     }
1646
1647     if (Flags.isByVal()) {
1648       SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
1649       SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
1650                                   Flags.getByValAlign(),
1651                                   /*isVolatile = */ false,
1652                                   /*alwaysInline = */ false,
1653                                   DstInfo, MachinePointerInfo(0));
1654       MemOpChains.push_back(Cpy);
1655     } else {
1656       // Normal stack argument, put it where it's needed.
1657       SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
1658                                    false, false, 0);
1659       MemOpChains.push_back(Store);
1660     }
1661   }
1662
1663   // The loads and stores generated above shouldn't clash with each
1664   // other. Combining them with this TokenFactor notes that fact for the rest of
1665   // the backend.
1666   if (!MemOpChains.empty())
1667     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1668                         &MemOpChains[0], MemOpChains.size());
1669
1670   // Most of the rest of the instructions need to be glued together; we don't
1671   // want assignments to actual registers used by a call to be rearranged by a
1672   // well-meaning scheduler.
1673   SDValue InFlag;
1674
1675   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1676     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1677                              RegsToPass[i].second, InFlag);
1678     InFlag = Chain.getValue(1);
1679   }
1680
1681   // The linker is responsible for inserting veneers when necessary to put a
1682   // function call destination in range, so we don't need to bother with a
1683   // wrapper here.
1684   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1685     const GlobalValue *GV = G->getGlobal();
1686     Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
1687   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1688     const char *Sym = S->getSymbol();
1689     Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
1690   }
1691
1692   // We don't usually want to end the call-sequence here because we would tidy
1693   // the frame up *after* the call, however in the ABI-changing tail-call case
1694   // we've carefully laid out the parameters so that when sp is reset they'll be
1695   // in the correct location.
1696   if (IsTailCall && !IsSibCall) {
1697     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1698                                DAG.getIntPtrConstant(0, true), InFlag, dl);
1699     InFlag = Chain.getValue(1);
1700   }
1701
1702   // We produce the following DAG scheme for the actual call instruction:
1703   //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?
1704   //
1705   // Most arguments aren't going to be used and just keep the values live as
1706   // far as LLVM is concerned. It's expected to be selected as simply "bl
1707   // callee" (for a direct, non-tail call).
1708   std::vector<SDValue> Ops;
1709   Ops.push_back(Chain);
1710   Ops.push_back(Callee);
1711
1712   if (IsTailCall) {
1713     // Each tail call may have to adjust the stack by a different amount, so
1714     // this information must travel along with the operation for eventual
1715     // consumption by emitEpilogue.
1716     Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
1717   }
1718
1719   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1720     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1721                                   RegsToPass[i].second.getValueType()));
1722
1723
1724   // Add a register mask operand representing the call-preserved registers. This
1725   // is used later in codegen to constrain register-allocation.
1726   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1727   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
1728   assert(Mask && "Missing call preserved mask for calling convention");
1729   Ops.push_back(DAG.getRegisterMask(Mask));
1730
1731   // If we needed glue, put it in as the last argument.
1732   if (InFlag.getNode())
1733     Ops.push_back(InFlag);
1734
1735   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1736
1737   if (IsTailCall) {
1738     return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1739   }
1740
1741   Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
1742   InFlag = Chain.getValue(1);
1743
1744   // Now we can reclaim the stack, just as well do it before working out where
1745   // our return value is.
1746   if (!IsSibCall) {
1747     uint64_t CalleePopBytes
1748       = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
1749
1750     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1751                                DAG.getIntPtrConstant(CalleePopBytes, true),
1752                                InFlag, dl);
1753     InFlag = Chain.getValue(1);
1754   }
1755
1756   return LowerCallResult(Chain, InFlag, CallConv,
1757                          IsVarArg, Ins, dl, DAG, InVals);
1758 }
1759
1760 SDValue
1761 AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1762                                       CallingConv::ID CallConv, bool IsVarArg,
1763                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1764                                       SDLoc dl, SelectionDAG &DAG,
1765                                       SmallVectorImpl<SDValue> &InVals) const {
1766   // Assign locations to each value returned by this call.
1767   SmallVector<CCValAssign, 16> RVLocs;
1768   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1769                  getTargetMachine(), RVLocs, *DAG.getContext());
1770   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
1771
1772   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1773     CCValAssign VA = RVLocs[i];
1774
1775     // Return values that are too big to fit into registers should use an sret
1776     // pointer, so this can be a lot simpler than the main argument code.
1777     assert(VA.isRegLoc() && "Memory locations not expected for call return");
1778
1779     SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1780                                      InFlag);
1781     Chain = Val.getValue(1);
1782     InFlag = Val.getValue(2);
1783
1784     switch (VA.getLocInfo()) {
1785     default: llvm_unreachable("Unknown loc info!");
1786     case CCValAssign::Full: break;
1787     case CCValAssign::BCvt:
1788       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1789       break;
1790     case CCValAssign::ZExt:
1791     case CCValAssign::SExt:
1792     case CCValAssign::AExt:
1793       // Floating-point arguments only get extended/truncated if they're going
1794       // in memory, so using the integer operation is acceptable here.
1795       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1796       break;
1797     }
1798
1799     InVals.push_back(Val);
1800   }
1801
1802   return Chain;
1803 }
1804
1805 bool
1806 AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1807                                     CallingConv::ID CalleeCC,
1808                                     bool IsVarArg,
1809                                     bool IsCalleeStructRet,
1810                                     bool IsCallerStructRet,
1811                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
1812                                     const SmallVectorImpl<SDValue> &OutVals,
1813                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1814                                     SelectionDAG& DAG) const {
1815
1816   // For CallingConv::C this function knows whether the ABI needs
1817   // changing. That's not true for other conventions so they will have to opt in
1818   // manually.
1819   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1820     return false;
1821
1822   const MachineFunction &MF = DAG.getMachineFunction();
1823   const Function *CallerF = MF.getFunction();
1824   CallingConv::ID CallerCC = CallerF->getCallingConv();
1825   bool CCMatch = CallerCC == CalleeCC;
1826
1827   // Byval parameters hand the function a pointer directly into the stack area
1828   // we want to reuse during a tail call. Working around this *is* possible (see
1829   // X86) but less efficient and uglier in LowerCall.
1830   for (Function::const_arg_iterator i = CallerF->arg_begin(),
1831          e = CallerF->arg_end(); i != e; ++i)
1832     if (i->hasByValAttr())
1833       return false;
1834
1835   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
1836     if (IsTailCallConvention(CalleeCC) && CCMatch)
1837       return true;
1838     return false;
1839   }
1840
1841   // Now we search for cases where we can use a tail call without changing the
1842   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
1843   // concept.
1844
1845   // I want anyone implementing a new calling convention to think long and hard
1846   // about this assert.
1847   assert((!IsVarArg || CalleeCC == CallingConv::C)
1848          && "Unexpected variadic calling convention");
1849
1850   if (IsVarArg && !Outs.empty()) {
1851     // At least two cases here: if caller is fastcc then we can't have any
1852     // memory arguments (we'd be expected to clean up the stack afterwards). If
1853     // caller is C then we could potentially use its argument area.
1854
1855     // FIXME: for now we take the most conservative of these in both cases:
1856     // disallow all variadic memory operands.
1857     SmallVector<CCValAssign, 16> ArgLocs;
1858     CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1859                    getTargetMachine(), ArgLocs, *DAG.getContext());
1860
1861     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1862     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
1863       if (!ArgLocs[i].isRegLoc())
1864         return false;
1865   }
1866
1867   // If the calling conventions do not match, then we'd better make sure the
1868   // results are returned in the same way as what the caller expects.
1869   if (!CCMatch) {
1870     SmallVector<CCValAssign, 16> RVLocs1;
1871     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1872                     getTargetMachine(), RVLocs1, *DAG.getContext());
1873     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
1874
1875     SmallVector<CCValAssign, 16> RVLocs2;
1876     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1877                     getTargetMachine(), RVLocs2, *DAG.getContext());
1878     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
1879
1880     if (RVLocs1.size() != RVLocs2.size())
1881       return false;
1882     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1883       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1884         return false;
1885       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1886         return false;
1887       if (RVLocs1[i].isRegLoc()) {
1888         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1889           return false;
1890       } else {
1891         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1892           return false;
1893       }
1894     }
1895   }
1896
1897   // Nothing more to check if the callee is taking no arguments
1898   if (Outs.empty())
1899     return true;
1900
1901   SmallVector<CCValAssign, 16> ArgLocs;
1902   CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1903                  getTargetMachine(), ArgLocs, *DAG.getContext());
1904
1905   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1906
1907   const AArch64MachineFunctionInfo *FuncInfo
1908     = MF.getInfo<AArch64MachineFunctionInfo>();
1909
1910   // If the stack arguments for this call would fit into our own save area then
1911   // the call can be made tail.
1912   return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
1913 }
1914
1915 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
1916                                                    bool TailCallOpt) const {
1917   return CallCC == CallingConv::Fast && TailCallOpt;
1918 }
1919
1920 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
1921   return CallCC == CallingConv::Fast;
1922 }
1923
1924 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
1925                                                    SelectionDAG &DAG,
1926                                                    MachineFrameInfo *MFI,
1927                                                    int ClobberedFI) const {
1928   SmallVector<SDValue, 8> ArgChains;
1929   int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
1930   int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
1931
1932   // Include the original chain at the beginning of the list. When this is
1933   // used by target LowerCall hooks, this helps legalize find the
1934   // CALLSEQ_BEGIN node.
1935   ArgChains.push_back(Chain);
1936
1937   // Add a chain value for each stack argument corresponding
1938   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1939          UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
1940     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
1941       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
1942         if (FI->getIndex() < 0) {
1943           int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
1944           int64_t InLastByte = InFirstByte;
1945           InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
1946
1947           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1948               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1949             ArgChains.push_back(SDValue(L, 1));
1950         }
1951
1952    // Build a tokenfactor for all the chains.
1953    return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
1954                       &ArgChains[0], ArgChains.size());
1955 }
1956
1957 static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
1958   switch (CC) {
1959   case ISD::SETEQ:  return A64CC::EQ;
1960   case ISD::SETGT:  return A64CC::GT;
1961   case ISD::SETGE:  return A64CC::GE;
1962   case ISD::SETLT:  return A64CC::LT;
1963   case ISD::SETLE:  return A64CC::LE;
1964   case ISD::SETNE:  return A64CC::NE;
1965   case ISD::SETUGT: return A64CC::HI;
1966   case ISD::SETUGE: return A64CC::HS;
1967   case ISD::SETULT: return A64CC::LO;
1968   case ISD::SETULE: return A64CC::LS;
1969   default: llvm_unreachable("Unexpected condition code");
1970   }
1971 }
1972
1973 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
1974   // icmp is implemented using adds/subs immediate, which take an unsigned
1975   // 12-bit immediate, optionally shifted left by 12 bits.
1976
1977   // Symmetric by using adds/subs
1978   if (Val < 0)
1979     Val = -Val;
1980
1981   return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
1982 }
1983
1984 SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
1985                                         ISD::CondCode CC, SDValue &A64cc,
1986                                         SelectionDAG &DAG, SDLoc &dl) const {
1987   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1988     int64_t C = 0;
1989     EVT VT = RHSC->getValueType(0);
1990     bool knownInvalid = false;
1991
1992     // I'm not convinced the rest of LLVM handles these edge cases properly, but
1993     // we can at least get it right.
1994     if (isSignedIntSetCC(CC)) {
1995       C = RHSC->getSExtValue();
1996     } else if (RHSC->getZExtValue() > INT64_MAX) {
1997       // A 64-bit constant not representable by a signed 64-bit integer is far
1998       // too big to fit into a SUBS immediate anyway.
1999       knownInvalid = true;
2000     } else {
2001       C = RHSC->getZExtValue();
2002     }
2003
2004     if (!knownInvalid && !isLegalICmpImmediate(C)) {
2005       // Constant does not fit, try adjusting it by one?
2006       switch (CC) {
2007       default: break;
2008       case ISD::SETLT:
2009       case ISD::SETGE:
2010         if (isLegalICmpImmediate(C-1)) {
2011           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2012           RHS = DAG.getConstant(C-1, VT);
2013         }
2014         break;
2015       case ISD::SETULT:
2016       case ISD::SETUGE:
2017         if (isLegalICmpImmediate(C-1)) {
2018           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2019           RHS = DAG.getConstant(C-1, VT);
2020         }
2021         break;
2022       case ISD::SETLE:
2023       case ISD::SETGT:
2024         if (isLegalICmpImmediate(C+1)) {
2025           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2026           RHS = DAG.getConstant(C+1, VT);
2027         }
2028         break;
2029       case ISD::SETULE:
2030       case ISD::SETUGT:
2031         if (isLegalICmpImmediate(C+1)) {
2032           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2033           RHS = DAG.getConstant(C+1, VT);
2034         }
2035         break;
2036       }
2037     }
2038   }
2039
2040   A64CC::CondCodes CondCode = IntCCToA64CC(CC);
2041   A64cc = DAG.getConstant(CondCode, MVT::i32);
2042   return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2043                      DAG.getCondCode(CC));
2044 }
2045
2046 static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
2047                                     A64CC::CondCodes &Alternative) {
2048   A64CC::CondCodes CondCode = A64CC::Invalid;
2049   Alternative = A64CC::Invalid;
2050
2051   switch (CC) {
2052   default: llvm_unreachable("Unknown FP condition!");
2053   case ISD::SETEQ:
2054   case ISD::SETOEQ: CondCode = A64CC::EQ; break;
2055   case ISD::SETGT:
2056   case ISD::SETOGT: CondCode = A64CC::GT; break;
2057   case ISD::SETGE:
2058   case ISD::SETOGE: CondCode = A64CC::GE; break;
2059   case ISD::SETOLT: CondCode = A64CC::MI; break;
2060   case ISD::SETOLE: CondCode = A64CC::LS; break;
2061   case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
2062   case ISD::SETO:   CondCode = A64CC::VC; break;
2063   case ISD::SETUO:  CondCode = A64CC::VS; break;
2064   case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
2065   case ISD::SETUGT: CondCode = A64CC::HI; break;
2066   case ISD::SETUGE: CondCode = A64CC::PL; break;
2067   case ISD::SETLT:
2068   case ISD::SETULT: CondCode = A64CC::LT; break;
2069   case ISD::SETLE:
2070   case ISD::SETULE: CondCode = A64CC::LE; break;
2071   case ISD::SETNE:
2072   case ISD::SETUNE: CondCode = A64CC::NE; break;
2073   }
2074   return CondCode;
2075 }
2076
2077 SDValue
2078 AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
2079   SDLoc DL(Op);
2080   EVT PtrVT = getPointerTy();
2081   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2082
2083   switch(getTargetMachine().getCodeModel()) {
2084   case CodeModel::Small:
2085     // The most efficient code is PC-relative anyway for the small memory model,
2086     // so we don't need to worry about relocation model.
2087     return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2088                        DAG.getTargetBlockAddress(BA, PtrVT, 0,
2089                                                  AArch64II::MO_NO_FLAG),
2090                        DAG.getTargetBlockAddress(BA, PtrVT, 0,
2091                                                  AArch64II::MO_LO12),
2092                        DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
2093   case CodeModel::Large:
2094     return DAG.getNode(
2095       AArch64ISD::WrapperLarge, DL, PtrVT,
2096       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
2097       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
2098       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
2099       DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
2100   default:
2101     llvm_unreachable("Only small and large code models supported now");
2102   }
2103 }
2104
2105
2106 // (BRCOND chain, val, dest)
2107 SDValue
2108 AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
2109   SDLoc dl(Op);
2110   SDValue Chain = Op.getOperand(0);
2111   SDValue TheBit = Op.getOperand(1);
2112   SDValue DestBB = Op.getOperand(2);
2113
2114   // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
2115   // that as the consumer we are responsible for ignoring rubbish in higher
2116   // bits.
2117   TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
2118                        DAG.getConstant(1, MVT::i32));
2119
2120   SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
2121                                DAG.getConstant(0, TheBit.getValueType()),
2122                                DAG.getCondCode(ISD::SETNE));
2123
2124   return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
2125                      A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
2126                      DestBB);
2127 }
2128
2129 // (BR_CC chain, condcode, lhs, rhs, dest)
2130 SDValue
2131 AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
2132   SDLoc dl(Op);
2133   SDValue Chain = Op.getOperand(0);
2134   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
2135   SDValue LHS = Op.getOperand(2);
2136   SDValue RHS = Op.getOperand(3);
2137   SDValue DestBB = Op.getOperand(4);
2138
2139   if (LHS.getValueType() == MVT::f128) {
2140     // f128 comparisons are lowered to runtime calls by a routine which sets
2141     // LHS, RHS and CC appropriately for the rest of this function to continue.
2142     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2143
2144     // If softenSetCCOperands returned a scalar, we need to compare the result
2145     // against zero to select between true and false values.
2146     if (RHS.getNode() == 0) {
2147       RHS = DAG.getConstant(0, LHS.getValueType());
2148       CC = ISD::SETNE;
2149     }
2150   }
2151
2152   if (LHS.getValueType().isInteger()) {
2153     SDValue A64cc;
2154
2155     // Integers are handled in a separate function because the combinations of
2156     // immediates and tests can get hairy and we may want to fiddle things.
2157     SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2158
2159     return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
2160                        Chain, CmpOp, A64cc, DestBB);
2161   }
2162
2163   // Note that some LLVM floating-point CondCodes can't be lowered to a single
2164   // conditional branch, hence FPCCToA64CC can set a second test, where either
2165   // passing is sufficient.
2166   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2167   CondCode = FPCCToA64CC(CC, Alternative);
2168   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2169   SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2170                               DAG.getCondCode(CC));
2171   SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
2172                                  Chain, SetCC, A64cc, DestBB);
2173
2174   if (Alternative != A64CC::Invalid) {
2175     A64cc = DAG.getConstant(Alternative, MVT::i32);
2176     A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
2177                            A64BR_CC, SetCC, A64cc, DestBB);
2178
2179   }
2180
2181   return A64BR_CC;
2182 }
2183
2184 SDValue
2185 AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
2186                                        RTLIB::Libcall Call) const {
2187   ArgListTy Args;
2188   ArgListEntry Entry;
2189   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
2190     EVT ArgVT = Op.getOperand(i).getValueType();
2191     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2192     Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
2193     Entry.isSExt = false;
2194     Entry.isZExt = false;
2195     Args.push_back(Entry);
2196   }
2197   SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
2198
2199   Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
2200
2201   // By default, the input chain to this libcall is the entry node of the
2202   // function. If the libcall is going to be emitted as a tail call then
2203   // isUsedByReturnOnly will change it to the right chain if the return
2204   // node which is being folded has a non-entry input chain.
2205   SDValue InChain = DAG.getEntryNode();
2206
2207   // isTailCall may be true since the callee does not reference caller stack
2208   // frame. Check if it's in the right position.
2209   SDValue TCChain = InChain;
2210   bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
2211   if (isTailCall)
2212     InChain = TCChain;
2213
2214   TargetLowering::
2215   CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
2216                     0, getLibcallCallingConv(Call), isTailCall,
2217                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
2218                     Callee, Args, DAG, SDLoc(Op));
2219   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
2220
2221   if (!CallInfo.second.getNode())
2222     // It's a tailcall, return the chain (which is the DAG root).
2223     return DAG.getRoot();
2224
2225   return CallInfo.first;
2226 }
2227
2228 SDValue
2229 AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
2230   if (Op.getOperand(0).getValueType() != MVT::f128) {
2231     // It's legal except when f128 is involved
2232     return Op;
2233   }
2234
2235   RTLIB::Libcall LC;
2236   LC  = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2237
2238   SDValue SrcVal = Op.getOperand(0);
2239   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
2240                      /*isSigned*/ false, SDLoc(Op)).first;
2241 }
2242
2243 SDValue
2244 AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
2245   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2246
2247   RTLIB::Libcall LC;
2248   LC  = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2249
2250   return LowerF128ToCall(Op, DAG, LC);
2251 }
2252
2253 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2254                                     bool IsSigned) {
2255   SDLoc dl(Op);
2256   EVT VT = Op.getValueType();
2257   SDValue Vec = Op.getOperand(0);
2258   EVT OpVT = Vec.getValueType();
2259   unsigned Opc = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2260
2261   if (VT.getVectorNumElements() == 1) {
2262     assert(OpVT == MVT::v1f64 && "Unexpected vector type!");
2263     if (VT.getSizeInBits() == OpVT.getSizeInBits())
2264       return Op;
2265     return DAG.UnrollVectorOp(Op.getNode());
2266   }
2267
2268   if (VT.getSizeInBits() > OpVT.getSizeInBits()) {
2269     assert(Vec.getValueType() == MVT::v2f32 && VT == MVT::v2i64 &&
2270            "Unexpected vector type!");
2271     Vec = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Vec);
2272     return DAG.getNode(Opc, dl, VT, Vec);
2273   } else if (VT.getSizeInBits() < OpVT.getSizeInBits()) {
2274     EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
2275                                    OpVT.getVectorElementType().getSizeInBits());
2276     CastVT =
2277         EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
2278     Vec = DAG.getNode(Opc, dl, CastVT, Vec);
2279     return DAG.getNode(ISD::TRUNCATE, dl, VT, Vec);
2280   }
2281   return DAG.getNode(Opc, dl, VT, Vec);
2282 }
2283
2284 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
2285   // We custom lower concat_vectors with 4, 8, or 16 operands that are all the
2286   // same operand and of type v1* using the DUP instruction.
2287   unsigned NumOps = Op->getNumOperands();
2288   if (NumOps != 4 && NumOps != 8 && NumOps != 16)
2289     return Op;
2290
2291   // Must be a single value for VDUP.
2292   bool isConstant = true;
2293   SDValue Op0 = Op.getOperand(0);
2294   for (unsigned i = 1; i < NumOps; ++i) {
2295     SDValue OpN = Op.getOperand(i);
2296     if (Op0 != OpN)
2297       return Op;
2298
2299     if (!isa<ConstantSDNode>(OpN->getOperand(0)))
2300       isConstant = false;
2301   }
2302
2303   // Verify the value type.
2304   EVT EltVT = Op0.getValueType();
2305   switch (NumOps) {
2306   default: llvm_unreachable("Unexpected number of operands");
2307   case 4:
2308     if (EltVT != MVT::v1i16 && EltVT != MVT::v1i32)
2309       return Op;
2310     break;
2311   case 8:
2312     if (EltVT != MVT::v1i8 && EltVT != MVT::v1i16)
2313       return Op;
2314     break;
2315   case 16:
2316     if (EltVT != MVT::v1i8)
2317       return Op;
2318     break;
2319   }
2320
2321   SDLoc DL(Op);
2322   EVT VT = Op.getValueType();
2323   // VDUP produces better code for constants.
2324   if (isConstant)
2325     return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Op0->getOperand(0));
2326   return DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, Op0,
2327                      DAG.getConstant(0, MVT::i64));
2328 }
2329
2330 SDValue
2331 AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2332                                       bool IsSigned) const {
2333   if (Op.getValueType().isVector())
2334     return LowerVectorFP_TO_INT(Op, DAG, IsSigned);
2335   if (Op.getOperand(0).getValueType() != MVT::f128) {
2336     // It's legal except when f128 is involved
2337     return Op;
2338   }
2339
2340   RTLIB::Libcall LC;
2341   if (IsSigned)
2342     LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2343   else
2344     LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2345
2346   return LowerF128ToCall(Op, DAG, LC);
2347 }
2348
2349 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
2350   MachineFunction &MF = DAG.getMachineFunction();
2351   MachineFrameInfo *MFI = MF.getFrameInfo();
2352   MFI->setReturnAddressIsTaken(true);
2353
2354   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
2355     return SDValue();
2356
2357   EVT VT = Op.getValueType();
2358   SDLoc dl(Op);
2359   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2360   if (Depth) {
2361     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
2362     SDValue Offset = DAG.getConstant(8, MVT::i64);
2363     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
2364                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
2365                        MachinePointerInfo(), false, false, false, 0);
2366   }
2367
2368   // Return X30, which contains the return address. Mark it an implicit live-in.
2369   unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
2370   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
2371 }
2372
2373
2374 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
2375                                               const {
2376   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
2377   MFI->setFrameAddressIsTaken(true);
2378
2379   EVT VT = Op.getValueType();
2380   SDLoc dl(Op);
2381   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2382   unsigned FrameReg = AArch64::X29;
2383   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
2384   while (Depth--)
2385     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
2386                             MachinePointerInfo(),
2387                             false, false, false, 0);
2388   return FrameAddr;
2389 }
2390
2391 SDValue
2392 AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
2393                                                   SelectionDAG &DAG) const {
2394   assert(getTargetMachine().getCodeModel() == CodeModel::Large);
2395   assert(getTargetMachine().getRelocationModel() == Reloc::Static);
2396
2397   EVT PtrVT = getPointerTy();
2398   SDLoc dl(Op);
2399   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2400   const GlobalValue *GV = GN->getGlobal();
2401
2402   SDValue GlobalAddr = DAG.getNode(
2403       AArch64ISD::WrapperLarge, dl, PtrVT,
2404       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
2405       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
2406       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
2407       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
2408
2409   if (GN->getOffset() != 0)
2410     return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2411                        DAG.getConstant(GN->getOffset(), PtrVT));
2412
2413   return GlobalAddr;
2414 }
2415
2416 SDValue
2417 AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
2418                                                   SelectionDAG &DAG) const {
2419   assert(getTargetMachine().getCodeModel() == CodeModel::Small);
2420
2421   EVT PtrVT = getPointerTy();
2422   SDLoc dl(Op);
2423   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2424   const GlobalValue *GV = GN->getGlobal();
2425   unsigned Alignment = GV->getAlignment();
2426   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2427   if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
2428     // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
2429     // to zero when they remain undefined. In PIC mode the GOT can take care of
2430     // this, but in absolute mode we use a constant pool load.
2431     SDValue PoolAddr;
2432     PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2433                            DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2434                                                      AArch64II::MO_NO_FLAG),
2435                            DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2436                                                      AArch64II::MO_LO12),
2437                            DAG.getConstant(8, MVT::i32));
2438     SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
2439                                      MachinePointerInfo::getConstantPool(),
2440                                      /*isVolatile=*/ false,
2441                                      /*isNonTemporal=*/ true,
2442                                      /*isInvariant=*/ true, 8);
2443     if (GN->getOffset() != 0)
2444       return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2445                          DAG.getConstant(GN->getOffset(), PtrVT));
2446
2447     return GlobalAddr;
2448   }
2449
2450   if (Alignment == 0) {
2451     const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
2452     if (GVPtrTy->getElementType()->isSized()) {
2453       Alignment
2454         = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
2455     } else {
2456       // Be conservative if we can't guess, not that it really matters:
2457       // functions and labels aren't valid for loads, and the methods used to
2458       // actually calculate an address work with any alignment.
2459       Alignment = 1;
2460     }
2461   }
2462
2463   unsigned char HiFixup, LoFixup;
2464   bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
2465
2466   if (UseGOT) {
2467     HiFixup = AArch64II::MO_GOT;
2468     LoFixup = AArch64II::MO_GOT_LO12;
2469     Alignment = 8;
2470   } else {
2471     HiFixup = AArch64II::MO_NO_FLAG;
2472     LoFixup = AArch64II::MO_LO12;
2473   }
2474
2475   // AArch64's small model demands the following sequence:
2476   // ADRP x0, somewhere
2477   // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
2478   SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2479                                   DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2480                                                              HiFixup),
2481                                   DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2482                                                              LoFixup),
2483                                   DAG.getConstant(Alignment, MVT::i32));
2484
2485   if (UseGOT) {
2486     GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
2487                             GlobalRef);
2488   }
2489
2490   if (GN->getOffset() != 0)
2491     return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
2492                        DAG.getConstant(GN->getOffset(), PtrVT));
2493
2494   return GlobalRef;
2495 }
2496
2497 SDValue
2498 AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
2499                                              SelectionDAG &DAG) const {
2500   // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
2501   // we make those distinctions here.
2502
2503   switch (getTargetMachine().getCodeModel()) {
2504   case CodeModel::Small:
2505     return LowerGlobalAddressELFSmall(Op, DAG);
2506   case CodeModel::Large:
2507     return LowerGlobalAddressELFLarge(Op, DAG);
2508   default:
2509     llvm_unreachable("Only small and large code models supported now");
2510   }
2511 }
2512
2513 SDValue
2514 AArch64TargetLowering::LowerConstantPool(SDValue Op,
2515                                          SelectionDAG &DAG) const {
2516   SDLoc DL(Op);
2517   EVT PtrVT = getPointerTy();
2518   ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Op);
2519   const Constant *C = CN->getConstVal();
2520
2521   switch(getTargetMachine().getCodeModel()) {
2522   case CodeModel::Small:
2523     // The most efficient code is PC-relative anyway for the small memory model,
2524     // so we don't need to worry about relocation model.
2525     return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2526                        DAG.getTargetConstantPool(C, PtrVT, 0, 0,
2527                                                  AArch64II::MO_NO_FLAG),
2528                        DAG.getTargetConstantPool(C, PtrVT, 0, 0,
2529                                                  AArch64II::MO_LO12),
2530                        DAG.getConstant(CN->getAlignment(), MVT::i32));
2531   case CodeModel::Large:
2532     return DAG.getNode(
2533       AArch64ISD::WrapperLarge, DL, PtrVT,
2534       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
2535       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
2536       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
2537       DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC));
2538   default:
2539     llvm_unreachable("Only small and large code models supported now");
2540   }
2541 }
2542
2543 SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
2544                                                 SDValue DescAddr,
2545                                                 SDLoc DL,
2546                                                 SelectionDAG &DAG) const {
2547   EVT PtrVT = getPointerTy();
2548
2549   // The function we need to call is simply the first entry in the GOT for this
2550   // descriptor, load it in preparation.
2551   SDValue Func, Chain;
2552   Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2553                      DescAddr);
2554
2555   // The function takes only one argument: the address of the descriptor itself
2556   // in X0.
2557   SDValue Glue;
2558   Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
2559   Glue = Chain.getValue(1);
2560
2561   // Finally, there's a special calling-convention which means that the lookup
2562   // must preserve all registers (except X0, obviously).
2563   const TargetRegisterInfo *TRI  = getTargetMachine().getRegisterInfo();
2564   const AArch64RegisterInfo *A64RI
2565     = static_cast<const AArch64RegisterInfo *>(TRI);
2566   const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
2567
2568   // We're now ready to populate the argument list, as with a normal call:
2569   std::vector<SDValue> Ops;
2570   Ops.push_back(Chain);
2571   Ops.push_back(Func);
2572   Ops.push_back(SymAddr);
2573   Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
2574   Ops.push_back(DAG.getRegisterMask(Mask));
2575   Ops.push_back(Glue);
2576
2577   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2578   Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
2579                       Ops.size());
2580   Glue = Chain.getValue(1);
2581
2582   // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
2583   // back to the generic handling code.
2584   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
2585 }
2586
2587 SDValue
2588 AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
2589                                              SelectionDAG &DAG) const {
2590   assert(getSubtarget()->isTargetELF() &&
2591          "TLS not implemented for non-ELF targets");
2592   assert(getTargetMachine().getCodeModel() == CodeModel::Small
2593          && "TLS only supported in small memory model");
2594   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2595
2596   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
2597
2598   SDValue TPOff;
2599   EVT PtrVT = getPointerTy();
2600   SDLoc DL(Op);
2601   const GlobalValue *GV = GA->getGlobal();
2602
2603   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
2604
2605   if (Model == TLSModel::InitialExec) {
2606     TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2607                         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2608                                                    AArch64II::MO_GOTTPREL),
2609                         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2610                                                    AArch64II::MO_GOTTPREL_LO12),
2611                         DAG.getConstant(8, MVT::i32));
2612     TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2613                         TPOff);
2614   } else if (Model == TLSModel::LocalExec) {
2615     SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2616                                                AArch64II::MO_TPREL_G1);
2617     SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2618                                                AArch64II::MO_TPREL_G0_NC);
2619
2620     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2621                                        DAG.getTargetConstant(1, MVT::i32)), 0);
2622     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2623                                        TPOff, LoVar,
2624                                        DAG.getTargetConstant(0, MVT::i32)), 0);
2625   } else if (Model == TLSModel::GeneralDynamic) {
2626     // Accesses used in this sequence go via the TLS descriptor which lives in
2627     // the GOT. Prepare an address we can use to handle this.
2628     SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2629                                                 AArch64II::MO_TLSDESC);
2630     SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2631                                                 AArch64II::MO_TLSDESC_LO12);
2632     SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2633                                    HiDesc, LoDesc,
2634                                    DAG.getConstant(8, MVT::i32));
2635     SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
2636
2637     TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2638   } else if (Model == TLSModel::LocalDynamic) {
2639     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
2640     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
2641     // the beginning of the module's TLS region, followed by a DTPREL offset
2642     // calculation.
2643
2644     // These accesses will need deduplicating if there's more than one.
2645     AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction()
2646       .getInfo<AArch64MachineFunctionInfo>();
2647     MFI->incNumLocalDynamicTLSAccesses();
2648
2649
2650     // Get the location of _TLS_MODULE_BASE_:
2651     SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2652                                                 AArch64II::MO_TLSDESC);
2653     SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2654                                                 AArch64II::MO_TLSDESC_LO12);
2655     SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2656                                    HiDesc, LoDesc,
2657                                    DAG.getConstant(8, MVT::i32));
2658     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
2659
2660     ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2661
2662     // Get the variable's offset from _TLS_MODULE_BASE_
2663     SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2664                                                AArch64II::MO_DTPREL_G1);
2665     SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2666                                                AArch64II::MO_DTPREL_G0_NC);
2667
2668     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2669                                        DAG.getTargetConstant(0, MVT::i32)), 0);
2670     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2671                                        TPOff, LoVar,
2672                                        DAG.getTargetConstant(0, MVT::i32)), 0);
2673   } else
2674       llvm_unreachable("Unsupported TLS access model");
2675
2676
2677   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
2678 }
2679
2680 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2681                                     bool IsSigned) {
2682   SDLoc dl(Op);
2683   EVT VT = Op.getValueType();
2684   SDValue Vec = Op.getOperand(0);
2685   unsigned Opc = IsSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2686
2687   if (VT.getVectorNumElements() == 1) {
2688     assert(VT == MVT::v1f64 && "Unexpected vector type!");
2689     if (VT.getSizeInBits() == Vec.getValueSizeInBits())
2690       return Op;
2691     return DAG.UnrollVectorOp(Op.getNode());
2692   }
2693
2694   if (VT.getSizeInBits() < Vec.getValueSizeInBits()) {
2695     assert(Vec.getValueType() == MVT::v2i64 && VT == MVT::v2f32 &&
2696            "Unexpected vector type!");
2697     Vec = DAG.getNode(Opc, dl, MVT::v2f64, Vec);
2698     return DAG.getNode(ISD::FP_ROUND, dl, VT, Vec, DAG.getIntPtrConstant(0));
2699   } else if (VT.getSizeInBits() > Vec.getValueSizeInBits()) {
2700     unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2701     EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
2702                                    VT.getVectorElementType().getSizeInBits());
2703     CastVT =
2704         EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
2705     Vec = DAG.getNode(CastOpc, dl, CastVT, Vec);
2706   }
2707
2708   return DAG.getNode(Opc, dl, VT, Vec);
2709 }
2710
2711 SDValue
2712 AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2713                                       bool IsSigned) const {
2714   if (Op.getValueType().isVector())
2715     return LowerVectorINT_TO_FP(Op, DAG, IsSigned);
2716   if (Op.getValueType() != MVT::f128) {
2717     // Legal for everything except f128.
2718     return Op;
2719   }
2720
2721   RTLIB::Libcall LC;
2722   if (IsSigned)
2723     LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2724   else
2725     LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2726
2727   return LowerF128ToCall(Op, DAG, LC);
2728 }
2729
2730
2731 SDValue
2732 AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
2733   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
2734   SDLoc dl(JT);
2735   EVT PtrVT = getPointerTy();
2736
2737   // When compiling PIC, jump tables get put in the code section so a static
2738   // relocation-style is acceptable for both cases.
2739   switch (getTargetMachine().getCodeModel()) {
2740   case CodeModel::Small:
2741     return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2742                        DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
2743                        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
2744                                               AArch64II::MO_LO12),
2745                        DAG.getConstant(1, MVT::i32));
2746   case CodeModel::Large:
2747     return DAG.getNode(
2748       AArch64ISD::WrapperLarge, dl, PtrVT,
2749       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
2750       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
2751       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
2752       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
2753   default:
2754     llvm_unreachable("Only small and large code models supported now");
2755   }
2756 }
2757
2758 // (SELECT testbit, iftrue, iffalse)
2759 SDValue
2760 AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2761   SDLoc dl(Op);
2762   SDValue TheBit = Op.getOperand(0);
2763   SDValue IfTrue = Op.getOperand(1);
2764   SDValue IfFalse = Op.getOperand(2);
2765
2766   // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
2767   // that as the consumer we are responsible for ignoring rubbish in higher
2768   // bits.
2769   TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
2770                        DAG.getConstant(1, MVT::i32));
2771   SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
2772                                DAG.getConstant(0, TheBit.getValueType()),
2773                                DAG.getCondCode(ISD::SETNE));
2774
2775   return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2776                      A64CMP, IfTrue, IfFalse,
2777                      DAG.getConstant(A64CC::NE, MVT::i32));
2778 }
2779
2780 static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
2781   SDLoc DL(Op);
2782   SDValue LHS = Op.getOperand(0);
2783   SDValue RHS = Op.getOperand(1);
2784   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2785   EVT VT = Op.getValueType();
2786   bool Invert = false;
2787   SDValue Op0, Op1;
2788   unsigned Opcode;
2789
2790   if (LHS.getValueType().isInteger()) {
2791
2792     // Attempt to use Vector Integer Compare Mask Test instruction.
2793     // TST = icmp ne (and (op0, op1), zero).
2794     if (CC == ISD::SETNE) {
2795       if (((LHS.getOpcode() == ISD::AND) &&
2796            ISD::isBuildVectorAllZeros(RHS.getNode())) ||
2797           ((RHS.getOpcode() == ISD::AND) &&
2798            ISD::isBuildVectorAllZeros(LHS.getNode()))) {
2799
2800         SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
2801         SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
2802         SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
2803         return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
2804       }
2805     }
2806
2807     // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
2808     // Note: Compare against Zero does not support unsigned predicates.
2809     if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2810          ISD::isBuildVectorAllZeros(LHS.getNode())) &&
2811         !isUnsignedIntSetCC(CC)) {
2812
2813       // If LHS is the zero value, swap operands and CondCode.
2814       if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2815         CC = getSetCCSwappedOperands(CC);
2816         Op0 = RHS;
2817       } else
2818         Op0 = LHS;
2819
2820       // Ensure valid CondCode for Compare Mask against Zero instruction:
2821       // EQ, GE, GT, LE, LT.
2822       if (ISD::SETNE == CC) {
2823         Invert = true;
2824         CC = ISD::SETEQ;
2825       }
2826
2827       // Using constant type to differentiate integer and FP compares with zero.
2828       Op1 = DAG.getConstant(0, MVT::i32);
2829       Opcode = AArch64ISD::NEON_CMPZ;
2830
2831     } else {
2832       // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
2833       // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
2834       bool Swap = false;
2835       switch (CC) {
2836       default:
2837         llvm_unreachable("Illegal integer comparison.");
2838       case ISD::SETEQ:
2839       case ISD::SETGT:
2840       case ISD::SETGE:
2841       case ISD::SETUGT:
2842       case ISD::SETUGE:
2843         break;
2844       case ISD::SETNE:
2845         Invert = true;
2846         CC = ISD::SETEQ;
2847         break;
2848       case ISD::SETULT:
2849       case ISD::SETULE:
2850       case ISD::SETLT:
2851       case ISD::SETLE:
2852         Swap = true;
2853         CC = getSetCCSwappedOperands(CC);
2854       }
2855
2856       if (Swap)
2857         std::swap(LHS, RHS);
2858
2859       Opcode = AArch64ISD::NEON_CMP;
2860       Op0 = LHS;
2861       Op1 = RHS;
2862     }
2863
2864     // Generate Compare Mask instr or Compare Mask against Zero instr.
2865     SDValue NeonCmp =
2866         DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2867
2868     if (Invert)
2869       NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2870
2871     return NeonCmp;
2872   }
2873
2874   // Now handle Floating Point cases.
2875   // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
2876   if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2877       ISD::isBuildVectorAllZeros(LHS.getNode())) {
2878
2879     // If LHS is the zero value, swap operands and CondCode.
2880     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2881       CC = getSetCCSwappedOperands(CC);
2882       Op0 = RHS;
2883     } else
2884       Op0 = LHS;
2885
2886     // Using constant type to differentiate integer and FP compares with zero.
2887     Op1 = DAG.getConstantFP(0, MVT::f32);
2888     Opcode = AArch64ISD::NEON_CMPZ;
2889   } else {
2890     // Attempt to use Vector Floating Point Compare Mask instruction.
2891     Op0 = LHS;
2892     Op1 = RHS;
2893     Opcode = AArch64ISD::NEON_CMP;
2894   }
2895
2896   SDValue NeonCmpAlt;
2897   // Some register compares have to be implemented with swapped CC and operands,
2898   // e.g.: OLT implemented as OGT with swapped operands.
2899   bool SwapIfRegArgs = false;
2900
2901   // Ensure valid CondCode for FP Compare Mask against Zero instruction:
2902   // EQ, GE, GT, LE, LT.
2903   // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
2904   switch (CC) {
2905   default:
2906     llvm_unreachable("Illegal FP comparison");
2907   case ISD::SETUNE:
2908   case ISD::SETNE:
2909     Invert = true; // Fallthrough
2910   case ISD::SETOEQ:
2911   case ISD::SETEQ:
2912     CC = ISD::SETEQ;
2913     break;
2914   case ISD::SETOLT:
2915   case ISD::SETLT:
2916     CC = ISD::SETLT;
2917     SwapIfRegArgs = true;
2918     break;
2919   case ISD::SETOGT:
2920   case ISD::SETGT:
2921     CC = ISD::SETGT;
2922     break;
2923   case ISD::SETOLE:
2924   case ISD::SETLE:
2925     CC = ISD::SETLE;
2926     SwapIfRegArgs = true;
2927     break;
2928   case ISD::SETOGE:
2929   case ISD::SETGE:
2930     CC = ISD::SETGE;
2931     break;
2932   case ISD::SETUGE:
2933     Invert = true;
2934     CC = ISD::SETLT;
2935     SwapIfRegArgs = true;
2936     break;
2937   case ISD::SETULE:
2938     Invert = true;
2939     CC = ISD::SETGT;
2940     break;
2941   case ISD::SETUGT:
2942     Invert = true;
2943     CC = ISD::SETLE;
2944     SwapIfRegArgs = true;
2945     break;
2946   case ISD::SETULT:
2947     Invert = true;
2948     CC = ISD::SETGE;
2949     break;
2950   case ISD::SETUEQ:
2951     Invert = true; // Fallthrough
2952   case ISD::SETONE:
2953     // Expand this to (OGT |OLT).
2954     NeonCmpAlt =
2955         DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
2956     CC = ISD::SETLT;
2957     SwapIfRegArgs = true;
2958     break;
2959   case ISD::SETUO:
2960     Invert = true; // Fallthrough
2961   case ISD::SETO:
2962     // Expand this to (OGE | OLT).
2963     NeonCmpAlt =
2964         DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
2965     CC = ISD::SETLT;
2966     SwapIfRegArgs = true;
2967     break;
2968   }
2969
2970   if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
2971     CC = getSetCCSwappedOperands(CC);
2972     std::swap(Op0, Op1);
2973   }
2974
2975   // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
2976   SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2977
2978   if (NeonCmpAlt.getNode())
2979     NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
2980
2981   if (Invert)
2982     NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2983
2984   return NeonCmp;
2985 }
2986
2987 // (SETCC lhs, rhs, condcode)
2988 SDValue
2989 AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
2990   SDLoc dl(Op);
2991   SDValue LHS = Op.getOperand(0);
2992   SDValue RHS = Op.getOperand(1);
2993   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2994   EVT VT = Op.getValueType();
2995
2996   if (VT.isVector())
2997     return LowerVectorSETCC(Op, DAG);
2998
2999   if (LHS.getValueType() == MVT::f128) {
3000     // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
3001     // for the rest of the function (some i32 or i64 values).
3002     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3003
3004     // If softenSetCCOperands returned a scalar, use it.
3005     if (RHS.getNode() == 0) {
3006       assert(LHS.getValueType() == Op.getValueType() &&
3007              "Unexpected setcc expansion!");
3008       return LHS;
3009     }
3010   }
3011
3012   if (LHS.getValueType().isInteger()) {
3013     SDValue A64cc;
3014
3015     // Integers are handled in a separate function because the combinations of
3016     // immediates and tests can get hairy and we may want to fiddle things.
3017     SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
3018
3019     return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
3020                        CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
3021                        A64cc);
3022   }
3023
3024   // Note that some LLVM floating-point CondCodes can't be lowered to a single
3025   // conditional branch, hence FPCCToA64CC can set a second test, where either
3026   // passing is sufficient.
3027   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
3028   CondCode = FPCCToA64CC(CC, Alternative);
3029   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
3030   SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
3031                               DAG.getCondCode(CC));
3032   SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
3033                                      CmpOp, DAG.getConstant(1, VT),
3034                                      DAG.getConstant(0, VT), A64cc);
3035
3036   if (Alternative != A64CC::Invalid) {
3037     A64cc = DAG.getConstant(Alternative, MVT::i32);
3038     A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
3039                                DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
3040   }
3041
3042   return A64SELECT_CC;
3043 }
3044
3045 static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) {
3046   SDLoc dl(Op);
3047   SDValue LHS = Op.getOperand(0);
3048   SDValue RHS = Op.getOperand(1);
3049   SDValue IfTrue = Op.getOperand(2);
3050   SDValue IfFalse = Op.getOperand(3);
3051   EVT IfTrueVT = IfTrue.getValueType();
3052   EVT CondVT = IfTrueVT.changeVectorElementTypeToInteger();
3053   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3054
3055   // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will
3056   // use NEON compare.
3057   if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) {
3058     EVT EltVT = LHS.getValueType();
3059     unsigned EltNum = 128 / EltVT.getSizeInBits();
3060     EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum);
3061     unsigned SubConstant =
3062         (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 :AArch64::sub_64;
3063     EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64;
3064     EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum);
3065
3066     LHS
3067       = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
3068                   VT, DAG.getTargetConstant(0, MVT::i32), LHS,
3069                   DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
3070     RHS
3071       = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
3072                   VT, DAG.getTargetConstant(0, MVT::i32), RHS,
3073                   DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
3074
3075     SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC);
3076     SDValue ResCC = LowerVectorSETCC(VSetCC, DAG);
3077     if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) {
3078       EVT DUPVT =
3079           EVT::getVectorVT(*DAG.getContext(), CEltT,
3080                            IfTrueVT.getSizeInBits() / CEltT.getSizeInBits());
3081       ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC,
3082                           DAG.getConstant(0, MVT::i64, false));
3083
3084       ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
3085     } else {
3086       // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function
3087       // can't handle them and will hit this assert.
3088       assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() &&
3089              "Vector of IfTrue & IfFalse is too small.");
3090
3091       unsigned ExEltNum =
3092           EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits();
3093       EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum);
3094       ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC,
3095                           DAG.getConstant(0, MVT::i64, false));
3096       ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
3097     }
3098     SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
3099                                   ResCC, IfTrue, IfFalse);
3100     return VSelect;
3101   }
3102
3103   // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are
3104   // vectors.
3105   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
3106   CondCode = FPCCToA64CC(CC, Alternative);
3107   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
3108   SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
3109                               DAG.getCondCode(CC));
3110   EVT SEVT = MVT::i32;
3111   if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32)
3112     SEVT = MVT::i64;
3113   SDValue AllOne = DAG.getConstant(-1, SEVT);
3114   SDValue AllZero = DAG.getConstant(0, SEVT);
3115   SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC,
3116                                      AllOne, AllZero, A64cc);
3117
3118   if (Alternative != A64CC::Invalid) {
3119     A64cc = DAG.getConstant(Alternative, MVT::i32);
3120     A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
3121                                SetCC, AllOne, A64SELECT_CC, A64cc);
3122   }
3123   SDValue VDup;
3124   if (IfTrue.getValueType().getVectorNumElements() == 1)
3125     VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, CondVT, A64SELECT_CC);
3126   else
3127     VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, CondVT, A64SELECT_CC);
3128   SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
3129                                 VDup, IfTrue, IfFalse);
3130   return VSelect;
3131 }
3132
3133 // (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
3134 SDValue
3135 AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
3136   SDLoc dl(Op);
3137   SDValue LHS = Op.getOperand(0);
3138   SDValue RHS = Op.getOperand(1);
3139   SDValue IfTrue = Op.getOperand(2);
3140   SDValue IfFalse = Op.getOperand(3);
3141   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3142
3143   if (IfTrue.getValueType().isVector())
3144     return LowerVectorSELECT_CC(Op, DAG);
3145
3146   if (LHS.getValueType() == MVT::f128) {
3147     // f128 comparisons are lowered to libcalls, but slot in nicely here
3148     // afterwards.
3149     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3150
3151     // If softenSetCCOperands returned a scalar, we need to compare the result
3152     // against zero to select between true and false values.
3153     if (RHS.getNode() == 0) {
3154       RHS = DAG.getConstant(0, LHS.getValueType());
3155       CC = ISD::SETNE;
3156     }
3157   }
3158
3159   if (LHS.getValueType().isInteger()) {
3160     SDValue A64cc;
3161
3162     // Integers are handled in a separate function because the combinations of
3163     // immediates and tests can get hairy and we may want to fiddle things.
3164     SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
3165
3166     return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp,
3167                        IfTrue, IfFalse, A64cc);
3168   }
3169
3170   // Note that some LLVM floating-point CondCodes can't be lowered to a single
3171   // conditional branch, hence FPCCToA64CC can set a second test, where either
3172   // passing is sufficient.
3173   A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
3174   CondCode = FPCCToA64CC(CC, Alternative);
3175   SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
3176   SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
3177                               DAG.getCondCode(CC));
3178   SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
3179                                      Op.getValueType(),
3180                                      SetCC, IfTrue, IfFalse, A64cc);
3181
3182   if (Alternative != A64CC::Invalid) {
3183     A64cc = DAG.getConstant(Alternative, MVT::i32);
3184     A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
3185                                SetCC, IfTrue, A64SELECT_CC, A64cc);
3186
3187   }
3188
3189   return A64SELECT_CC;
3190 }
3191
3192 SDValue
3193 AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
3194   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
3195   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
3196
3197   // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
3198   // rather than just 8.
3199   return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
3200                        Op.getOperand(1), Op.getOperand(2),
3201                        DAG.getConstant(32, MVT::i32), 8, false, false,
3202                        MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
3203 }
3204
3205 SDValue
3206 AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3207   // The layout of the va_list struct is specified in the AArch64 Procedure Call
3208   // Standard, section B.3.
3209   MachineFunction &MF = DAG.getMachineFunction();
3210   AArch64MachineFunctionInfo *FuncInfo
3211     = MF.getInfo<AArch64MachineFunctionInfo>();
3212   SDLoc DL(Op);
3213
3214   SDValue Chain = Op.getOperand(0);
3215   SDValue VAList = Op.getOperand(1);
3216   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3217   SmallVector<SDValue, 4> MemOps;
3218
3219   // void *__stack at offset 0
3220   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
3221                                     getPointerTy());
3222   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
3223                                 MachinePointerInfo(SV), false, false, 0));
3224
3225   // void *__gr_top at offset 8
3226   int GPRSize = FuncInfo->getVariadicGPRSize();
3227   if (GPRSize > 0) {
3228     SDValue GRTop, GRTopAddr;
3229
3230     GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3231                             DAG.getConstant(8, getPointerTy()));
3232
3233     GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
3234     GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
3235                         DAG.getConstant(GPRSize, getPointerTy()));
3236
3237     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
3238                                   MachinePointerInfo(SV, 8),
3239                                   false, false, 0));
3240   }
3241
3242   // void *__vr_top at offset 16
3243   int FPRSize = FuncInfo->getVariadicFPRSize();
3244   if (FPRSize > 0) {
3245     SDValue VRTop, VRTopAddr;
3246     VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3247                             DAG.getConstant(16, getPointerTy()));
3248
3249     VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
3250     VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
3251                         DAG.getConstant(FPRSize, getPointerTy()));
3252
3253     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
3254                                   MachinePointerInfo(SV, 16),
3255                                   false, false, 0));
3256   }
3257
3258   // int __gr_offs at offset 24
3259   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3260                                    DAG.getConstant(24, getPointerTy()));
3261   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
3262                                 GROffsAddr, MachinePointerInfo(SV, 24),
3263                                 false, false, 0));
3264
3265   // int __vr_offs at offset 28
3266   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3267                                    DAG.getConstant(28, getPointerTy()));
3268   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
3269                                 VROffsAddr, MachinePointerInfo(SV, 28),
3270                                 false, false, 0));
3271
3272   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
3273                      MemOps.size());
3274 }
3275
3276 SDValue
3277 AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3278   switch (Op.getOpcode()) {
3279   default: llvm_unreachable("Don't know how to custom lower this!");
3280   case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
3281   case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
3282   case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
3283   case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
3284   case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
3285   case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
3286   case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
3287   case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
3288   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
3289   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
3290   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
3291   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
3292
3293   case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
3294   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3295   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
3296   case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
3297   case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
3298   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
3299   case ISD::JumpTable: return LowerJumpTable(Op, DAG);
3300   case ISD::SELECT: return LowerSELECT(Op, DAG);
3301   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
3302   case ISD::SETCC: return LowerSETCC(Op, DAG);
3303   case ISD::VACOPY: return LowerVACOPY(Op, DAG);
3304   case ISD::VASTART: return LowerVASTART(Op, DAG);
3305   case ISD::BUILD_VECTOR:
3306     return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
3307   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
3308   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
3309   }
3310
3311   return SDValue();
3312 }
3313
3314 /// Check if the specified splat value corresponds to a valid vector constant
3315 /// for a Neon instruction with a "modified immediate" operand (e.g., MOVI).  If
3316 /// so, return the encoded 8-bit immediate and the OpCmode instruction fields
3317 /// values.
3318 static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
3319                               unsigned SplatBitSize, SelectionDAG &DAG,
3320                               bool is128Bits, NeonModImmType type, EVT &VT,
3321                               unsigned &Imm, unsigned &OpCmode) {
3322   switch (SplatBitSize) {
3323   default:
3324     llvm_unreachable("unexpected size for isNeonModifiedImm");
3325   case 8: {
3326     if (type != Neon_Mov_Imm)
3327       return false;
3328     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
3329     // Neon movi per byte: Op=0, Cmode=1110.
3330     OpCmode = 0xe;
3331     Imm = SplatBits;
3332     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
3333     break;
3334   }
3335   case 16: {
3336     // Neon move inst per halfword
3337     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
3338     if ((SplatBits & ~0xff) == 0) {
3339       // Value = 0x00nn is 0x00nn LSL 0
3340       // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
3341       // bic:  Op=1, Cmode=1001;  orr:  Op=0, Cmode=1001
3342       // Op=x, Cmode=100y
3343       Imm = SplatBits;
3344       OpCmode = 0x8;
3345       break;
3346     }
3347     if ((SplatBits & ~0xff00) == 0) {
3348       // Value = 0xnn00 is 0x00nn LSL 8
3349       // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
3350       // bic:  Op=1, Cmode=1011;  orr:  Op=0, Cmode=1011
3351       // Op=x, Cmode=101x
3352       Imm = SplatBits >> 8;
3353       OpCmode = 0xa;
3354       break;
3355     }
3356     // can't handle any other
3357     return false;
3358   }
3359
3360   case 32: {
3361     // First the LSL variants (MSL is unusable by some interested instructions).
3362
3363     // Neon move instr per word, shift zeros
3364     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
3365     if ((SplatBits & ~0xff) == 0) {
3366       // Value = 0x000000nn is 0x000000nn LSL 0
3367       // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
3368       // bic:  Op=1, Cmode= 0001; orr:  Op=0, Cmode= 0001
3369       // Op=x, Cmode=000x
3370       Imm = SplatBits;
3371       OpCmode = 0;
3372       break;
3373     }
3374     if ((SplatBits & ~0xff00) == 0) {
3375       // Value = 0x0000nn00 is 0x000000nn LSL 8
3376       // movi: Op=0, Cmode= 0010;  mvni: Op=1, Cmode= 0010
3377       // bic:  Op=1, Cmode= 0011;  orr : Op=0, Cmode= 0011
3378       // Op=x, Cmode=001x
3379       Imm = SplatBits >> 8;
3380       OpCmode = 0x2;
3381       break;
3382     }
3383     if ((SplatBits & ~0xff0000) == 0) {
3384       // Value = 0x00nn0000 is 0x000000nn LSL 16
3385       // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
3386       // bic:  Op=1, Cmode= 0101; orr:  Op=0, Cmode= 0101
3387       // Op=x, Cmode=010x
3388       Imm = SplatBits >> 16;
3389       OpCmode = 0x4;
3390       break;
3391     }
3392     if ((SplatBits & ~0xff000000) == 0) {
3393       // Value = 0xnn000000 is 0x000000nn LSL 24
3394       // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
3395       // bic:  Op=1, Cmode= 0111; orr:  Op=0, Cmode= 0111
3396       // Op=x, Cmode=011x
3397       Imm = SplatBits >> 24;
3398       OpCmode = 0x6;
3399       break;
3400     }
3401
3402     // Now the MSL immediates.
3403
3404     // Neon move instr per word, shift ones
3405     if ((SplatBits & ~0xffff) == 0 &&
3406         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
3407       // Value = 0x0000nnff is 0x000000nn MSL 8
3408       // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
3409       // Op=x, Cmode=1100
3410       Imm = SplatBits >> 8;
3411       OpCmode = 0xc;
3412       break;
3413     }
3414     if ((SplatBits & ~0xffffff) == 0 &&
3415         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
3416       // Value = 0x00nnffff is 0x000000nn MSL 16
3417       // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101
3418       // Op=x, Cmode=1101
3419       Imm = SplatBits >> 16;
3420       OpCmode = 0xd;
3421       break;
3422     }
3423     // can't handle any other
3424     return false;
3425   }
3426
3427   case 64: {
3428     if (type != Neon_Mov_Imm)
3429       return false;
3430     // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
3431     // movi Op=1, Cmode=1110.
3432     OpCmode = 0x1e;
3433     uint64_t BitMask = 0xff;
3434     uint64_t Val = 0;
3435     unsigned ImmMask = 1;
3436     Imm = 0;
3437     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
3438       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
3439         Val |= BitMask;
3440         Imm |= ImmMask;
3441       } else if ((SplatBits & BitMask) != 0) {
3442         return false;
3443       }
3444       BitMask <<= 8;
3445       ImmMask <<= 1;
3446     }
3447     SplatBits = Val;
3448     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
3449     break;
3450   }
3451   }
3452
3453   return true;
3454 }
3455
3456 static SDValue PerformANDCombine(SDNode *N,
3457                                  TargetLowering::DAGCombinerInfo &DCI) {
3458
3459   SelectionDAG &DAG = DCI.DAG;
3460   SDLoc DL(N);
3461   EVT VT = N->getValueType(0);
3462
3463   // We're looking for an SRA/SHL pair which form an SBFX.
3464
3465   if (VT != MVT::i32 && VT != MVT::i64)
3466     return SDValue();
3467
3468   if (!isa<ConstantSDNode>(N->getOperand(1)))
3469     return SDValue();
3470
3471   uint64_t TruncMask = N->getConstantOperandVal(1);
3472   if (!isMask_64(TruncMask))
3473     return SDValue();
3474
3475   uint64_t Width = CountPopulation_64(TruncMask);
3476   SDValue Shift = N->getOperand(0);
3477
3478   if (Shift.getOpcode() != ISD::SRL)
3479     return SDValue();
3480
3481   if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3482     return SDValue();
3483   uint64_t LSB = Shift->getConstantOperandVal(1);
3484
3485   if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3486     return SDValue();
3487
3488   return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
3489                      DAG.getConstant(LSB, MVT::i64),
3490                      DAG.getConstant(LSB + Width - 1, MVT::i64));
3491 }
3492
3493 /// For a true bitfield insert, the bits getting into that contiguous mask
3494 /// should come from the low part of an existing value: they must be formed from
3495 /// a compatible SHL operation (unless they're already low). This function
3496 /// checks that condition and returns the least-significant bit that's
3497 /// intended. If the operation not a field preparation, -1 is returned.
3498 static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
3499                             SDValue &MaskedVal, uint64_t Mask) {
3500   if (!isShiftedMask_64(Mask))
3501     return -1;
3502
3503   // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
3504   // instruction. BFI will do a left-shift by LSB before applying the mask we've
3505   // spotted, so in general we should pre-emptively "undo" that by making sure
3506   // the incoming bits have had a right-shift applied to them.
3507   //
3508   // This right shift, however, will combine with existing left/right shifts. In
3509   // the simplest case of a completely straight bitfield operation, it will be
3510   // expected to completely cancel out with an existing SHL. More complicated
3511   // cases (e.g. bitfield to bitfield copy) may still need a real shift before
3512   // the BFI.
3513
3514   uint64_t LSB = countTrailingZeros(Mask);
3515   int64_t ShiftRightRequired = LSB;
3516   if (MaskedVal.getOpcode() == ISD::SHL &&
3517       isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3518     ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
3519     MaskedVal = MaskedVal.getOperand(0);
3520   } else if (MaskedVal.getOpcode() == ISD::SRL &&
3521              isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3522     ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
3523     MaskedVal = MaskedVal.getOperand(0);
3524   }
3525
3526   if (ShiftRightRequired > 0)
3527     MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
3528                             DAG.getConstant(ShiftRightRequired, MVT::i64));
3529   else if (ShiftRightRequired < 0) {
3530     // We could actually end up with a residual left shift, for example with
3531     // "struc.bitfield = val << 1".
3532     MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
3533                             DAG.getConstant(-ShiftRightRequired, MVT::i64));
3534   }
3535
3536   return LSB;
3537 }
3538
3539 /// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
3540 /// a mask and an extension. Returns true if a BFI was found and provides
3541 /// information on its surroundings.
3542 static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
3543                           bool &Extended) {
3544   Extended = false;
3545   if (N.getOpcode() == ISD::ZERO_EXTEND) {
3546     Extended = true;
3547     N = N.getOperand(0);
3548   }
3549
3550   if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
3551     Mask = N->getConstantOperandVal(1);
3552     N = N.getOperand(0);
3553   } else {
3554     // Mask is the whole width.
3555     Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
3556   }
3557
3558   if (N.getOpcode() == AArch64ISD::BFI) {
3559     BFI = N;
3560     return true;
3561   }
3562
3563   return false;
3564 }
3565
3566 /// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
3567 /// is roughly equivalent to (and (BFI ...), mask). This form is used because it
3568 /// can often be further combined with a larger mask. Ultimately, we want mask
3569 /// to be 2^32-1 or 2^64-1 so the AND can be skipped.
3570 static SDValue tryCombineToBFI(SDNode *N,
3571                                TargetLowering::DAGCombinerInfo &DCI,
3572                                const AArch64Subtarget *Subtarget) {
3573   SelectionDAG &DAG = DCI.DAG;
3574   SDLoc DL(N);
3575   EVT VT = N->getValueType(0);
3576
3577   assert(N->getOpcode() == ISD::OR && "Unexpected root");
3578
3579   // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
3580   // abandon the effort.
3581   SDValue LHS = N->getOperand(0);
3582   if (LHS.getOpcode() != ISD::AND)
3583     return SDValue();
3584
3585   uint64_t LHSMask;
3586   if (isa<ConstantSDNode>(LHS.getOperand(1)))
3587     LHSMask = LHS->getConstantOperandVal(1);
3588   else
3589     return SDValue();
3590
3591   // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
3592   // is or abandon the effort.
3593   SDValue RHS = N->getOperand(1);
3594   if (RHS.getOpcode() != ISD::AND)
3595     return SDValue();
3596
3597   uint64_t RHSMask;
3598   if (isa<ConstantSDNode>(RHS.getOperand(1)))
3599     RHSMask = RHS->getConstantOperandVal(1);
3600   else
3601     return SDValue();
3602
3603   // Can't do anything if the masks are incompatible.
3604   if (LHSMask & RHSMask)
3605     return SDValue();
3606
3607   // Now we need one of the masks to be a contiguous field. Without loss of
3608   // generality that should be the RHS one.
3609   SDValue Bitfield = LHS.getOperand(0);
3610   if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
3611     // We know that LHS is a candidate new value, and RHS isn't already a better
3612     // one.
3613     std::swap(LHS, RHS);
3614     std::swap(LHSMask, RHSMask);
3615   }
3616
3617   // We've done our best to put the right operands in the right places, all we
3618   // can do now is check whether a BFI exists.
3619   Bitfield = RHS.getOperand(0);
3620   int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
3621   if (LSB == -1)
3622     return SDValue();
3623
3624   uint32_t Width = CountPopulation_64(RHSMask);
3625   assert(Width && "Expected non-zero bitfield width");
3626
3627   SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3628                             LHS.getOperand(0), Bitfield,
3629                             DAG.getConstant(LSB, MVT::i64),
3630                             DAG.getConstant(Width, MVT::i64));
3631
3632   // Mask is trivial
3633   if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3634     return BFI;
3635
3636   return DAG.getNode(ISD::AND, DL, VT, BFI,
3637                      DAG.getConstant(LHSMask | RHSMask, VT));
3638 }
3639
3640 /// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
3641 /// original input. This is surprisingly common because SROA splits things up
3642 /// into i8 chunks, so the originally detected MaskedBFI may actually only act
3643 /// on the low (say) byte of a word. This is then orred into the rest of the
3644 /// word afterwards.
3645 ///
3646 /// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
3647 ///
3648 /// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
3649 /// MaskedBFI. We can also deal with a certain amount of extend/truncate being
3650 /// involved.
3651 static SDValue tryCombineToLargerBFI(SDNode *N,
3652                                      TargetLowering::DAGCombinerInfo &DCI,
3653                                      const AArch64Subtarget *Subtarget) {
3654   SelectionDAG &DAG = DCI.DAG;
3655   SDLoc DL(N);
3656   EVT VT = N->getValueType(0);
3657
3658   // First job is to hunt for a MaskedBFI on either the left or right. Swap
3659   // operands if it's actually on the right.
3660   SDValue BFI;
3661   SDValue PossExtraMask;
3662   uint64_t ExistingMask = 0;
3663   bool Extended = false;
3664   if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
3665     PossExtraMask = N->getOperand(1);
3666   else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
3667     PossExtraMask = N->getOperand(0);
3668   else
3669     return SDValue();
3670
3671   // We can only combine a BFI with another compatible mask.
3672   if (PossExtraMask.getOpcode() != ISD::AND ||
3673       !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
3674     return SDValue();
3675
3676   uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
3677
3678   // Masks must be compatible.
3679   if (ExtraMask & ExistingMask)
3680     return SDValue();
3681
3682   SDValue OldBFIVal = BFI.getOperand(0);
3683   SDValue NewBFIVal = BFI.getOperand(1);
3684   if (Extended) {
3685     // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
3686     // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
3687     // need to be made compatible.
3688     assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
3689            && "Invalid types for BFI");
3690     OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
3691     NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
3692   }
3693
3694   // We need the MaskedBFI to be combined with a mask of the *same* value.
3695   if (PossExtraMask.getOperand(0) != OldBFIVal)
3696     return SDValue();
3697
3698   BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3699                     OldBFIVal, NewBFIVal,
3700                     BFI.getOperand(2), BFI.getOperand(3));
3701
3702   // If the masking is trivial, we don't need to create it.
3703   if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3704     return BFI;
3705
3706   return DAG.getNode(ISD::AND, DL, VT, BFI,
3707                      DAG.getConstant(ExtraMask | ExistingMask, VT));
3708 }
3709
3710 /// An EXTR instruction is made up of two shifts, ORed together. This helper
3711 /// searches for and classifies those shifts.
3712 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
3713                          bool &FromHi) {
3714   if (N.getOpcode() == ISD::SHL)
3715     FromHi = false;
3716   else if (N.getOpcode() == ISD::SRL)
3717     FromHi = true;
3718   else
3719     return false;
3720
3721   if (!isa<ConstantSDNode>(N.getOperand(1)))
3722     return false;
3723
3724   ShiftAmount = N->getConstantOperandVal(1);
3725   Src = N->getOperand(0);
3726   return true;
3727 }
3728
3729 /// EXTR instruction extracts a contiguous chunk of bits from two existing
3730 /// registers viewed as a high/low pair. This function looks for the pattern:
3731 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
3732 /// EXTR. Can't quite be done in TableGen because the two immediates aren't
3733 /// independent.
3734 static SDValue tryCombineToEXTR(SDNode *N,
3735                                 TargetLowering::DAGCombinerInfo &DCI) {
3736   SelectionDAG &DAG = DCI.DAG;
3737   SDLoc DL(N);
3738   EVT VT = N->getValueType(0);
3739
3740   assert(N->getOpcode() == ISD::OR && "Unexpected root");
3741
3742   if (VT != MVT::i32 && VT != MVT::i64)
3743     return SDValue();
3744
3745   SDValue LHS;
3746   uint32_t ShiftLHS = 0;
3747   bool LHSFromHi = 0;
3748   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
3749     return SDValue();
3750
3751   SDValue RHS;
3752   uint32_t ShiftRHS = 0;
3753   bool RHSFromHi = 0;
3754   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
3755     return SDValue();
3756
3757   // If they're both trying to come from the high part of the register, they're
3758   // not really an EXTR.
3759   if (LHSFromHi == RHSFromHi)
3760     return SDValue();
3761
3762   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
3763     return SDValue();
3764
3765   if (LHSFromHi) {
3766     std::swap(LHS, RHS);
3767     std::swap(ShiftLHS, ShiftRHS);
3768   }
3769
3770   return DAG.getNode(AArch64ISD::EXTR, DL, VT,
3771                      LHS, RHS,
3772                      DAG.getConstant(ShiftRHS, MVT::i64));
3773 }
3774
3775 /// Target-specific dag combine xforms for ISD::OR
3776 static SDValue PerformORCombine(SDNode *N,
3777                                 TargetLowering::DAGCombinerInfo &DCI,
3778                                 const AArch64Subtarget *Subtarget) {
3779
3780   SelectionDAG &DAG = DCI.DAG;
3781   SDLoc DL(N);
3782   EVT VT = N->getValueType(0);
3783
3784   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3785     return SDValue();
3786
3787   // Attempt to recognise bitfield-insert operations.
3788   SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
3789   if (Res.getNode())
3790     return Res;
3791
3792   // Attempt to combine an existing MaskedBFI operation into one with a larger
3793   // mask.
3794   Res = tryCombineToLargerBFI(N, DCI, Subtarget);
3795   if (Res.getNode())
3796     return Res;
3797
3798   Res = tryCombineToEXTR(N, DCI);
3799   if (Res.getNode())
3800     return Res;
3801
3802   if (!Subtarget->hasNEON())
3803     return SDValue();
3804
3805   // Attempt to use vector immediate-form BSL
3806   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
3807
3808   SDValue N0 = N->getOperand(0);
3809   if (N0.getOpcode() != ISD::AND)
3810     return SDValue();
3811
3812   SDValue N1 = N->getOperand(1);
3813   if (N1.getOpcode() != ISD::AND)
3814     return SDValue();
3815
3816   if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
3817     APInt SplatUndef;
3818     unsigned SplatBitSize;
3819     bool HasAnyUndefs;
3820     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
3821     APInt SplatBits0;
3822     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
3823                                       HasAnyUndefs) &&
3824         !HasAnyUndefs) {
3825       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
3826       APInt SplatBits1;
3827       if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
3828                                         HasAnyUndefs) && !HasAnyUndefs &&
3829           SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
3830           SplatBits0 == ~SplatBits1) {
3831
3832         return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1),
3833                            N0->getOperand(0), N1->getOperand(0));
3834       }
3835     }
3836   }
3837
3838   return SDValue();
3839 }
3840
3841 /// Target-specific dag combine xforms for ISD::SRA
3842 static SDValue PerformSRACombine(SDNode *N,
3843                                  TargetLowering::DAGCombinerInfo &DCI) {
3844
3845   SelectionDAG &DAG = DCI.DAG;
3846   SDLoc DL(N);
3847   EVT VT = N->getValueType(0);
3848
3849   // We're looking for an SRA/SHL pair which form an SBFX.
3850
3851   if (VT != MVT::i32 && VT != MVT::i64)
3852     return SDValue();
3853
3854   if (!isa<ConstantSDNode>(N->getOperand(1)))
3855     return SDValue();
3856
3857   uint64_t ExtraSignBits = N->getConstantOperandVal(1);
3858   SDValue Shift = N->getOperand(0);
3859
3860   if (Shift.getOpcode() != ISD::SHL)
3861     return SDValue();
3862
3863   if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3864     return SDValue();
3865
3866   uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
3867   uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
3868   uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
3869
3870   if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3871     return SDValue();
3872
3873   return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
3874                      DAG.getConstant(LSB, MVT::i64),
3875                      DAG.getConstant(LSB + Width - 1, MVT::i64));
3876 }
3877
3878 /// Check if this is a valid build_vector for the immediate operand of
3879 /// a vector shift operation, where all the elements of the build_vector
3880 /// must have the same constant integer value.
3881 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
3882   // Ignore bit_converts.
3883   while (Op.getOpcode() == ISD::BITCAST)
3884     Op = Op.getOperand(0);
3885   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
3886   APInt SplatBits, SplatUndef;
3887   unsigned SplatBitSize;
3888   bool HasAnyUndefs;
3889   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
3890                                       HasAnyUndefs, ElementBits) ||
3891       SplatBitSize > ElementBits)
3892     return false;
3893   Cnt = SplatBits.getSExtValue();
3894   return true;
3895 }
3896
3897 /// Check if this is a valid build_vector for the immediate operand of
3898 /// a vector shift left operation.  That value must be in the range:
3899 /// 0 <= Value < ElementBits
3900 static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
3901   assert(VT.isVector() && "vector shift count is not a vector type");
3902   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3903   if (!getVShiftImm(Op, ElementBits, Cnt))
3904     return false;
3905   return (Cnt >= 0 && Cnt < ElementBits);
3906 }
3907
3908 /// Check if this is a valid build_vector for the immediate operand of a
3909 /// vector shift right operation. The value must be in the range:
3910 ///   1 <= Value <= ElementBits
3911 static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
3912   assert(VT.isVector() && "vector shift count is not a vector type");
3913   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3914   if (!getVShiftImm(Op, ElementBits, Cnt))
3915     return false;
3916   return (Cnt >= 1 && Cnt <= ElementBits);
3917 }
3918
3919 static SDValue GenForSextInreg(SDNode *N,
3920                                TargetLowering::DAGCombinerInfo &DCI,
3921                                EVT SrcVT, EVT DestVT, EVT SubRegVT,
3922                                const int *Mask, SDValue Src) {
3923   SelectionDAG &DAG = DCI.DAG;
3924   SDValue Bitcast
3925     = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src);
3926   SDValue Sext
3927     = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast);
3928   SDValue ShuffleVec
3929     = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask);
3930   SDValue ExtractSubreg
3931     = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N),
3932                 SubRegVT, ShuffleVec,
3933                 DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0);
3934   return ExtractSubreg;
3935 }
3936
3937 /// Checks for vector shifts and lowers them.
3938 static SDValue PerformShiftCombine(SDNode *N,
3939                                    TargetLowering::DAGCombinerInfo &DCI,
3940                                    const AArch64Subtarget *ST) {
3941   SelectionDAG &DAG = DCI.DAG;
3942   EVT VT = N->getValueType(0);
3943   if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
3944     return PerformSRACombine(N, DCI);
3945
3946   // We're looking for an SRA/SHL pair to help generating instruction
3947   //   sshll  v0.8h, v0.8b, #0
3948   // The instruction STXL is also the alias of this instruction.
3949   //
3950   // For example, for DAG like below,
3951   //   v2i32 = sra (v2i32 (shl v2i32, 16)), 16
3952   // we can transform it into
3953   //   v2i32 = EXTRACT_SUBREG
3954   //             (v4i32 (suffle_vector
3955   //                       (v4i32 (sext (v4i16 (bitcast v2i32))),
3956   //                       undef, (0, 2, u, u)),
3957   //             sub_64
3958   //
3959   // With this transformation we expect to generate "SSHLL + UZIP1"
3960   // Sometimes UZIP1 can be optimized away by combining with other context.
3961   int64_t ShrCnt, ShlCnt;
3962   if (N->getOpcode() == ISD::SRA
3963       && (VT == MVT::v2i32 || VT == MVT::v4i16)
3964       && isVShiftRImm(N->getOperand(1), VT, ShrCnt)
3965       && N->getOperand(0).getOpcode() == ISD::SHL
3966       && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) {
3967     SDValue Src = N->getOperand(0).getOperand(0);
3968     if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) {
3969       // sext_inreg(v2i32, v2i16)
3970       // We essentially only care the Mask {0, 2, u, u}
3971       int Mask[4] = {0, 2, 4, 6};
3972       return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32,
3973                              Mask, Src);
3974     }
3975     else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) {
3976       // sext_inreg(v2i16, v2i8)
3977       // We essentially only care the Mask {0, u, 4, u, u, u, u, u, u, u, u, u}
3978       int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3979       return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32,
3980                              Mask, Src);
3981     }
3982     else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) {
3983       // sext_inreg(v4i16, v4i8)
3984       // We essentially only care the Mask {0, 2, 4, 6, u, u, u, u, u, u, u, u}
3985       int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3986       return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16,
3987                              Mask, Src);
3988     }
3989   }
3990
3991   // Nothing to be done for scalar shifts.
3992   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3993   if (!VT.isVector() || !TLI.isTypeLegal(VT))
3994     return SDValue();
3995
3996   assert(ST->hasNEON() && "unexpected vector shift");
3997   int64_t Cnt;
3998
3999   switch (N->getOpcode()) {
4000   default:
4001     llvm_unreachable("unexpected shift opcode");
4002
4003   case ISD::SHL:
4004     if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
4005       SDValue RHS =
4006           DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
4007                       DAG.getConstant(Cnt, MVT::i32));
4008       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
4009     }
4010     break;
4011
4012   case ISD::SRA:
4013   case ISD::SRL:
4014     if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
4015       SDValue RHS =
4016           DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
4017                       DAG.getConstant(Cnt, MVT::i32));
4018       return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
4019     }
4020     break;
4021   }
4022
4023   return SDValue();
4024 }
4025
4026 /// ARM-specific DAG combining for intrinsics.
4027 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
4028   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
4029
4030   switch (IntNo) {
4031   default:
4032     // Don't do anything for most intrinsics.
4033     break;
4034
4035   case Intrinsic::arm_neon_vqshifts:
4036   case Intrinsic::arm_neon_vqshiftu:
4037     EVT VT = N->getOperand(1).getValueType();
4038     int64_t Cnt;
4039     if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
4040       break;
4041     unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
4042                              ? AArch64ISD::NEON_QSHLs
4043                              : AArch64ISD::NEON_QSHLu;
4044     return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
4045                        N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
4046   }
4047
4048   return SDValue();
4049 }
4050
4051 /// Target-specific DAG combine function for NEON load/store intrinsics
4052 /// to merge base address updates.
4053 static SDValue CombineBaseUpdate(SDNode *N,
4054                                  TargetLowering::DAGCombinerInfo &DCI) {
4055   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
4056     return SDValue();
4057
4058   SelectionDAG &DAG = DCI.DAG;
4059   bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
4060                       N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
4061   unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
4062   SDValue Addr = N->getOperand(AddrOpIdx);
4063
4064   // Search for a use of the address operand that is an increment.
4065   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
4066        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
4067     SDNode *User = *UI;
4068     if (User->getOpcode() != ISD::ADD ||
4069         UI.getUse().getResNo() != Addr.getResNo())
4070       continue;
4071
4072     // Check that the add is independent of the load/store.  Otherwise, folding
4073     // it would create a cycle.
4074     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
4075       continue;
4076
4077     // Find the new opcode for the updating load/store.
4078     bool isLoad = true;
4079     bool isLaneOp = false;
4080     unsigned NewOpc = 0;
4081     unsigned NumVecs = 0;
4082     if (isIntrinsic) {
4083       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
4084       switch (IntNo) {
4085       default: llvm_unreachable("unexpected intrinsic for Neon base update");
4086       case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
4087         NumVecs = 1; break;
4088       case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
4089         NumVecs = 2; break;
4090       case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
4091         NumVecs = 3; break;
4092       case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
4093         NumVecs = 4; break;
4094       case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
4095         NumVecs = 1; isLoad = false; break;
4096       case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
4097         NumVecs = 2; isLoad = false; break;
4098       case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
4099         NumVecs = 3; isLoad = false; break;
4100       case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
4101         NumVecs = 4; isLoad = false; break;
4102       case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
4103         NumVecs = 2; break;
4104       case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
4105         NumVecs = 3; break;
4106       case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
4107         NumVecs = 4; break;
4108       case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
4109         NumVecs = 2; isLoad = false; break;
4110       case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
4111         NumVecs = 3; isLoad = false; break;
4112       case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
4113         NumVecs = 4; isLoad = false; break;
4114       case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
4115         NumVecs = 2; isLaneOp = true; break;
4116       case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
4117         NumVecs = 3; isLaneOp = true; break;
4118       case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
4119         NumVecs = 4; isLaneOp = true; break;
4120       case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
4121         NumVecs = 2; isLoad = false; isLaneOp = true; break;
4122       case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
4123         NumVecs = 3; isLoad = false; isLaneOp = true; break;
4124       case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
4125         NumVecs = 4; isLoad = false; isLaneOp = true; break;
4126       }
4127     } else {
4128       isLaneOp = true;
4129       switch (N->getOpcode()) {
4130       default: llvm_unreachable("unexpected opcode for Neon base update");
4131       case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
4132         NumVecs = 2; break;
4133       case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
4134         NumVecs = 3; break;
4135       case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
4136         NumVecs = 4; break;
4137       }
4138     }
4139
4140     // Find the size of memory referenced by the load/store.
4141     EVT VecTy;
4142     if (isLoad)
4143       VecTy = N->getValueType(0);
4144     else
4145       VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
4146     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
4147     if (isLaneOp)
4148       NumBytes /= VecTy.getVectorNumElements();
4149
4150     // If the increment is a constant, it must match the memory ref size.
4151     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
4152     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
4153       uint32_t IncVal = CInc->getZExtValue();
4154       if (IncVal != NumBytes)
4155         continue;
4156       Inc = DAG.getTargetConstant(IncVal, MVT::i32);
4157     }
4158
4159     // Create the new updating load/store node.
4160     EVT Tys[6];
4161     unsigned NumResultVecs = (isLoad ? NumVecs : 0);
4162     unsigned n;
4163     for (n = 0; n < NumResultVecs; ++n)
4164       Tys[n] = VecTy;
4165     Tys[n++] = MVT::i64;
4166     Tys[n] = MVT::Other;
4167     SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
4168     SmallVector<SDValue, 8> Ops;
4169     Ops.push_back(N->getOperand(0)); // incoming chain
4170     Ops.push_back(N->getOperand(AddrOpIdx));
4171     Ops.push_back(Inc);
4172     for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
4173       Ops.push_back(N->getOperand(i));
4174     }
4175     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
4176     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
4177                                            Ops.data(), Ops.size(),
4178                                            MemInt->getMemoryVT(),
4179                                            MemInt->getMemOperand());
4180
4181     // Update the uses.
4182     std::vector<SDValue> NewResults;
4183     for (unsigned i = 0; i < NumResultVecs; ++i) {
4184       NewResults.push_back(SDValue(UpdN.getNode(), i));
4185     }
4186     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
4187     DCI.CombineTo(N, NewResults);
4188     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
4189
4190     break;
4191   }
4192   return SDValue();
4193 }
4194
4195 /// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
4196 /// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
4197 /// If so, combine them to a vldN-dup operation and return true.
4198 static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
4199   SelectionDAG &DAG = DCI.DAG;
4200   EVT VT = N->getValueType(0);
4201
4202   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
4203   SDNode *VLD = N->getOperand(0).getNode();
4204   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
4205     return SDValue();
4206   unsigned NumVecs = 0;
4207   unsigned NewOpc = 0;
4208   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
4209   if (IntNo == Intrinsic::arm_neon_vld2lane) {
4210     NumVecs = 2;
4211     NewOpc = AArch64ISD::NEON_LD2DUP;
4212   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
4213     NumVecs = 3;
4214     NewOpc = AArch64ISD::NEON_LD3DUP;
4215   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
4216     NumVecs = 4;
4217     NewOpc = AArch64ISD::NEON_LD4DUP;
4218   } else {
4219     return SDValue();
4220   }
4221
4222   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
4223   // numbers match the load.
4224   unsigned VLDLaneNo =
4225       cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
4226   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
4227        UI != UE; ++UI) {
4228     // Ignore uses of the chain result.
4229     if (UI.getUse().getResNo() == NumVecs)
4230       continue;
4231     SDNode *User = *UI;
4232     if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
4233         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
4234       return SDValue();
4235   }
4236
4237   // Create the vldN-dup node.
4238   EVT Tys[5];
4239   unsigned n;
4240   for (n = 0; n < NumVecs; ++n)
4241     Tys[n] = VT;
4242   Tys[n] = MVT::Other;
4243   SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
4244   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
4245   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
4246   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
4247                                            VLDMemInt->getMemoryVT(),
4248                                            VLDMemInt->getMemOperand());
4249
4250   // Update the uses.
4251   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
4252        UI != UE; ++UI) {
4253     unsigned ResNo = UI.getUse().getResNo();
4254     // Ignore uses of the chain result.
4255     if (ResNo == NumVecs)
4256       continue;
4257     SDNode *User = *UI;
4258     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
4259   }
4260
4261   // Now the vldN-lane intrinsic is dead except for its chain result.
4262   // Update uses of the chain.
4263   std::vector<SDValue> VLDDupResults;
4264   for (unsigned n = 0; n < NumVecs; ++n)
4265     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
4266   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
4267   DCI.CombineTo(VLD, VLDDupResults);
4268
4269   return SDValue(N, 0);
4270 }
4271
4272 // v1i1 setcc ->
4273 //     v1i1 (bitcast (i1 setcc (extract_vector_elt, extract_vector_elt))
4274 // FIXME: Currently the type legalizer can't handle SETCC having v1i1 as result.
4275 // If it can legalize "v1i1 SETCC" correctly, no need to combine such SETCC.
4276 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
4277   EVT ResVT = N->getValueType(0);
4278
4279   if (!ResVT.isVector() || ResVT.getVectorNumElements() != 1 ||
4280       ResVT.getVectorElementType() != MVT::i1)
4281     return SDValue();
4282
4283   SDValue LHS = N->getOperand(0);
4284   SDValue RHS = N->getOperand(1);
4285   EVT CmpVT = LHS.getValueType();
4286   LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
4287                     CmpVT.getVectorElementType(), LHS,
4288                     DAG.getConstant(0, MVT::i64));
4289   RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
4290                     CmpVT.getVectorElementType(), RHS,
4291                     DAG.getConstant(0, MVT::i64));
4292   SDValue SetCC =
4293       DAG.getSetCC(SDLoc(N), MVT::i1, LHS, RHS,
4294                    cast<CondCodeSDNode>(N->getOperand(2))->get());
4295   return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, SetCC);
4296 }
4297
4298 // vselect (v1i1 setcc) ->
4299 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
4300 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
4301 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
4302 // such VSELECT.
4303 static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) {
4304   SDValue N0 = N->getOperand(0);
4305   EVT CCVT = N0.getValueType();
4306
4307   if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
4308       CCVT.getVectorElementType() != MVT::i1)
4309     return SDValue();
4310
4311   EVT ResVT = N->getValueType(0);
4312   EVT CmpVT = N0.getOperand(0).getValueType();
4313   // Only combine when the result type is of the same size as the compared
4314   // operands.
4315   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
4316     return SDValue();
4317
4318   SDValue IfTrue = N->getOperand(1);
4319   SDValue IfFalse = N->getOperand(2);
4320   SDValue SetCC =
4321       DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
4322                    N0.getOperand(0), N0.getOperand(1),
4323                    cast<CondCodeSDNode>(N0.getOperand(2))->get());
4324   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
4325                      IfTrue, IfFalse);
4326 }
4327
4328 // sign_extend (extract_vector_elt (v1i1 setcc)) ->
4329 //     extract_vector_elt (v1iXX setcc)
4330 // (XX is the size of the compared operand type)
4331 static SDValue PerformSignExtendCombine(SDNode *N, SelectionDAG &DAG) {
4332   SDValue N0 = N->getOperand(0);
4333   SDValue Vec = N0.getOperand(0);
4334
4335   if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4336       Vec.getOpcode() != ISD::SETCC)
4337     return SDValue();
4338
4339   EVT ResVT = N->getValueType(0);
4340   EVT CmpVT = Vec.getOperand(0).getValueType();
4341   // Only optimize when the result type is of the same size as the element
4342   // type of the compared operand.
4343   if (ResVT.getSizeInBits() != CmpVT.getVectorElementType().getSizeInBits())
4344     return SDValue();
4345
4346   SDValue Lane = N0.getOperand(1);
4347   SDValue SetCC =
4348       DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
4349                    Vec.getOperand(0), Vec.getOperand(1),
4350                    cast<CondCodeSDNode>(Vec.getOperand(2))->get());
4351   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ResVT,
4352                      SetCC, Lane);
4353 }
4354
4355 SDValue
4356 AArch64TargetLowering::PerformDAGCombine(SDNode *N,
4357                                          DAGCombinerInfo &DCI) const {
4358   switch (N->getOpcode()) {
4359   default: break;
4360   case ISD::AND: return PerformANDCombine(N, DCI);
4361   case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
4362   case ISD::SHL:
4363   case ISD::SRA:
4364   case ISD::SRL:
4365     return PerformShiftCombine(N, DCI, getSubtarget());
4366   case ISD::SETCC: return PerformSETCCCombine(N, DCI.DAG);
4367   case ISD::VSELECT: return PerformVSelectCombine(N, DCI.DAG);
4368   case ISD::SIGN_EXTEND: return PerformSignExtendCombine(N, DCI.DAG);
4369   case ISD::INTRINSIC_WO_CHAIN:
4370     return PerformIntrinsicCombine(N, DCI.DAG);
4371   case AArch64ISD::NEON_VDUPLANE:
4372     return CombineVLDDUP(N, DCI);
4373   case AArch64ISD::NEON_LD2DUP:
4374   case AArch64ISD::NEON_LD3DUP:
4375   case AArch64ISD::NEON_LD4DUP:
4376     return CombineBaseUpdate(N, DCI);
4377   case ISD::INTRINSIC_VOID:
4378   case ISD::INTRINSIC_W_CHAIN:
4379     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
4380     case Intrinsic::arm_neon_vld1:
4381     case Intrinsic::arm_neon_vld2:
4382     case Intrinsic::arm_neon_vld3:
4383     case Intrinsic::arm_neon_vld4:
4384     case Intrinsic::arm_neon_vst1:
4385     case Intrinsic::arm_neon_vst2:
4386     case Intrinsic::arm_neon_vst3:
4387     case Intrinsic::arm_neon_vst4:
4388     case Intrinsic::arm_neon_vld2lane:
4389     case Intrinsic::arm_neon_vld3lane:
4390     case Intrinsic::arm_neon_vld4lane:
4391     case Intrinsic::aarch64_neon_vld1x2:
4392     case Intrinsic::aarch64_neon_vld1x3:
4393     case Intrinsic::aarch64_neon_vld1x4:
4394     case Intrinsic::aarch64_neon_vst1x2:
4395     case Intrinsic::aarch64_neon_vst1x3:
4396     case Intrinsic::aarch64_neon_vst1x4:
4397     case Intrinsic::arm_neon_vst2lane:
4398     case Intrinsic::arm_neon_vst3lane:
4399     case Intrinsic::arm_neon_vst4lane:
4400       return CombineBaseUpdate(N, DCI);
4401     default:
4402       break;
4403     }
4404   }
4405   return SDValue();
4406 }
4407
4408 bool
4409 AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
4410   VT = VT.getScalarType();
4411
4412   if (!VT.isSimple())
4413     return false;
4414
4415   switch (VT.getSimpleVT().SimpleTy) {
4416   case MVT::f16:
4417   case MVT::f32:
4418   case MVT::f64:
4419     return true;
4420   case MVT::f128:
4421     return false;
4422   default:
4423     break;
4424   }
4425
4426   return false;
4427 }
4428 // Check whether a shuffle_vector could be presented as concat_vector.
4429 bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG,
4430                                            SDValue V0, SDValue V1,
4431                                            const int *Mask,
4432                                            SDValue &Res) const {
4433   SDLoc DL(Op);
4434   EVT VT = Op.getValueType();
4435   if (VT.getSizeInBits() != 128)
4436     return false;
4437   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
4438       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
4439     return false;
4440
4441   unsigned NumElts = VT.getVectorNumElements();
4442   bool isContactVector = true;
4443   bool splitV0 = false;
4444   if (V0.getValueType().getSizeInBits() == 128)
4445     splitV0 = true;
4446
4447   for (int I = 0, E = NumElts / 2; I != E; I++) {
4448     if (Mask[I] != I) {
4449       isContactVector = false;
4450       break;
4451     }
4452   }
4453
4454   if (isContactVector) {
4455     int offset = NumElts / 2;
4456     for (int I = NumElts / 2, E = NumElts; I != E; I++) {
4457       if (Mask[I] != I + splitV0 * offset) {
4458         isContactVector = false;
4459         break;
4460       }
4461     }
4462   }
4463
4464   if (isContactVector) {
4465     EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
4466                                   NumElts / 2);
4467     if (splitV0) {
4468       V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
4469                        DAG.getConstant(0, MVT::i64));
4470     }
4471     if (V1.getValueType().getSizeInBits() == 128) {
4472       V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
4473                        DAG.getConstant(0, MVT::i64));
4474     }
4475     Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
4476     return true;
4477   }
4478   return false;
4479 }
4480
4481 // Check whether a Build Vector could be presented as Shuffle Vector.
4482 // This Shuffle Vector maybe not legalized, so the length of its operand and
4483 // the length of result may not equal.
4484 bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
4485                                                  SDValue &V0, SDValue &V1,
4486                                                  int *Mask) const {
4487   SDLoc DL(Op);
4488   EVT VT = Op.getValueType();
4489   unsigned NumElts = VT.getVectorNumElements();
4490   unsigned V0NumElts = 0;
4491
4492   // Check if all elements are extracted from less than 3 vectors.
4493   for (unsigned i = 0; i < NumElts; ++i) {
4494     SDValue Elt = Op.getOperand(i);
4495     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4496         Elt.getOperand(0).getValueType().getVectorElementType() !=
4497             VT.getVectorElementType())
4498       return false;
4499
4500     if (V0.getNode() == 0) {
4501       V0 = Elt.getOperand(0);
4502       V0NumElts = V0.getValueType().getVectorNumElements();
4503     }
4504     if (Elt.getOperand(0) == V0) {
4505       Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
4506       continue;
4507     } else if (V1.getNode() == 0) {
4508       V1 = Elt.getOperand(0);
4509     }
4510     if (Elt.getOperand(0) == V1) {
4511       unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
4512       Mask[i] = (Lane + V0NumElts);
4513       continue;
4514     } else {
4515       return false;
4516     }
4517   }
4518   return true;
4519 }
4520
4521 // If this is a case we can't handle, return null and let the default
4522 // expansion code take care of it.
4523 SDValue
4524 AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
4525                                          const AArch64Subtarget *ST) const {
4526
4527   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
4528   SDLoc DL(Op);
4529   EVT VT = Op.getValueType();
4530
4531   APInt SplatBits, SplatUndef;
4532   unsigned SplatBitSize;
4533   bool HasAnyUndefs;
4534
4535   unsigned UseNeonMov = VT.getSizeInBits() >= 64;
4536
4537   // Note we favor lowering MOVI over MVNI.
4538   // This has implications on the definition of patterns in TableGen to select
4539   // BIC immediate instructions but not ORR immediate instructions.
4540   // If this lowering order is changed, TableGen patterns for BIC immediate and
4541   // ORR immediate instructions have to be updated.
4542   if (UseNeonMov &&
4543       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
4544     if (SplatBitSize <= 64) {
4545       // First attempt to use vector immediate-form MOVI
4546       EVT NeonMovVT;
4547       unsigned Imm = 0;
4548       unsigned OpCmode = 0;
4549
4550       if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
4551                             SplatBitSize, DAG, VT.is128BitVector(),
4552                             Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
4553         SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
4554         SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
4555
4556         if (ImmVal.getNode() && OpCmodeVal.getNode()) {
4557           SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
4558                                         ImmVal, OpCmodeVal);
4559           return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
4560         }
4561       }
4562
4563       // Then attempt to use vector immediate-form MVNI
4564       uint64_t NegatedImm = (~SplatBits).getZExtValue();
4565       if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
4566                             DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
4567                             Imm, OpCmode)) {
4568         SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
4569         SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
4570         if (ImmVal.getNode() && OpCmodeVal.getNode()) {
4571           SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
4572                                         ImmVal, OpCmodeVal);
4573           return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
4574         }
4575       }
4576
4577       // Attempt to use vector immediate-form FMOV
4578       if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
4579           (VT == MVT::v2f64 && SplatBitSize == 64)) {
4580         APFloat RealVal(
4581             SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
4582             SplatBits);
4583         uint32_t ImmVal;
4584         if (A64Imms::isFPImm(RealVal, ImmVal)) {
4585           SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
4586           return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
4587         }
4588       }
4589     }
4590   }
4591
4592   unsigned NumElts = VT.getVectorNumElements();
4593   bool isOnlyLowElement = true;
4594   bool usesOnlyOneValue = true;
4595   bool hasDominantValue = false;
4596   bool isConstant = true;
4597
4598   // Map of the number of times a particular SDValue appears in the
4599   // element list.
4600   DenseMap<SDValue, unsigned> ValueCounts;
4601   SDValue Value;
4602   for (unsigned i = 0; i < NumElts; ++i) {
4603     SDValue V = Op.getOperand(i);
4604     if (V.getOpcode() == ISD::UNDEF)
4605       continue;
4606     if (i > 0)
4607       isOnlyLowElement = false;
4608     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
4609       isConstant = false;
4610
4611     ValueCounts.insert(std::make_pair(V, 0));
4612     unsigned &Count = ValueCounts[V];
4613
4614     // Is this value dominant? (takes up more than half of the lanes)
4615     if (++Count > (NumElts / 2)) {
4616       hasDominantValue = true;
4617       Value = V;
4618     }
4619   }
4620   if (ValueCounts.size() != 1)
4621     usesOnlyOneValue = false;
4622   if (!Value.getNode() && ValueCounts.size() > 0)
4623     Value = ValueCounts.begin()->first;
4624
4625   if (ValueCounts.size() == 0)
4626     return DAG.getUNDEF(VT);
4627
4628   if (isOnlyLowElement)
4629     return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
4630
4631   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4632   if (hasDominantValue && EltSize <= 64) {
4633     // Use VDUP for non-constant splats.
4634     if (!isConstant) {
4635       SDValue N;
4636
4637       // If we are DUPing a value that comes directly from a vector, we could
4638       // just use DUPLANE. We can only do this if the lane being extracted
4639       // is at a constant index, as the DUP from lane instructions only have
4640       // constant-index forms.
4641       //
4642       // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can
4643       // remove TRUNCATE for DUPLANE by apdating the source vector to
4644       // appropriate vector type and lane index.
4645       //
4646       // FIXME: for now we have v1i8, v1i16, v1i32 legal vector types, if they
4647       // are not legal any more, no need to check the type size in bits should
4648       // be large than 64.
4649       SDValue V = Value;
4650       if (Value->getOpcode() == ISD::TRUNCATE)
4651         V = Value->getOperand(0);
4652       if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4653           isa<ConstantSDNode>(V->getOperand(1)) &&
4654           V->getOperand(0).getValueType().getSizeInBits() >= 64) {
4655
4656         // If the element size of source vector is larger than DUPLANE
4657         // element size, we can do transformation by,
4658         // 1) bitcasting source register to smaller element vector
4659         // 2) mutiplying the lane index by SrcEltSize/ResEltSize
4660         // For example, we can lower
4661         //     "v8i16 vdup_lane(v4i32, 1)"
4662         // to be
4663         //     "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)".
4664         SDValue SrcVec = V->getOperand(0);
4665         unsigned SrcEltSize =
4666             SrcVec.getValueType().getVectorElementType().getSizeInBits();
4667         unsigned ResEltSize = VT.getVectorElementType().getSizeInBits();
4668         if (SrcEltSize > ResEltSize) {
4669           assert((SrcEltSize % ResEltSize == 0) && "Invalid element size");
4670           SDValue BitCast;
4671           unsigned SrcSize = SrcVec.getValueType().getSizeInBits();
4672           unsigned ResSize = VT.getSizeInBits();
4673
4674           if (SrcSize > ResSize) {
4675             assert((SrcSize % ResSize == 0) && "Invalid vector size");
4676             EVT CastVT =
4677                 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
4678                                  SrcSize / ResEltSize);
4679             BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec);
4680           } else {
4681             assert((SrcSize == ResSize) && "Invalid vector size of source vec");
4682             BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec);
4683           }
4684
4685           unsigned LaneIdx = V->getConstantOperandVal(1);
4686           SDValue Lane =
4687               DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64);
4688           N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane);
4689         } else {
4690           assert((SrcEltSize == ResEltSize) &&
4691                  "Invalid element size of source vec");
4692           N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0),
4693                           V->getOperand(1));
4694         }
4695       } else
4696         N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
4697
4698       if (!usesOnlyOneValue) {
4699         // The dominant value was splatted as 'N', but we now have to insert
4700         // all differing elements.
4701         for (unsigned I = 0; I < NumElts; ++I) {
4702           if (Op.getOperand(I) == Value)
4703             continue;
4704           SmallVector<SDValue, 3> Ops;
4705           Ops.push_back(N);
4706           Ops.push_back(Op.getOperand(I));
4707           Ops.push_back(DAG.getConstant(I, MVT::i64));
4708           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
4709         }
4710       }
4711       return N;
4712     }
4713     if (usesOnlyOneValue && isConstant) {
4714       return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
4715     }
4716   }
4717   // If all elements are constants and the case above didn't get hit, fall back
4718   // to the default expansion, which will generate a load from the constant
4719   // pool.
4720   if (isConstant)
4721     return SDValue();
4722
4723   // Try to lower this in lowering ShuffleVector way.
4724   SDValue V0, V1;
4725   int Mask[16];
4726   if (isKnownShuffleVector(Op, DAG, V0, V1, Mask)) {
4727     unsigned V0NumElts = V0.getValueType().getVectorNumElements();
4728     if (!V1.getNode() && V0NumElts == NumElts * 2) {
4729       V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
4730                        DAG.getConstant(NumElts, MVT::i64));
4731       V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
4732                        DAG.getConstant(0, MVT::i64));
4733       V0NumElts = V0.getValueType().getVectorNumElements();
4734     }
4735
4736     if (V1.getNode() && NumElts == V0NumElts &&
4737         V0NumElts == V1.getValueType().getVectorNumElements()) {
4738       SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
4739       if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
4740         return Shuffle;
4741       else
4742         return LowerVECTOR_SHUFFLE(Shuffle, DAG);
4743     } else {
4744       SDValue Res;
4745       if (isConcatVector(Op, DAG, V0, V1, Mask, Res))
4746         return Res;
4747     }
4748   }
4749
4750   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
4751   // know the default expansion would otherwise fall back on something even
4752   // worse. For a vector with one or two non-undef values, that's
4753   // scalar_to_vector for the elements followed by a shuffle (provided the
4754   // shuffle is valid for the target) and materialization element by element
4755   // on the stack followed by a load for everything else.
4756   if (!isConstant && !usesOnlyOneValue) {
4757     SDValue Vec = DAG.getUNDEF(VT);
4758     for (unsigned i = 0 ; i < NumElts; ++i) {
4759       SDValue V = Op.getOperand(i);
4760       if (V.getOpcode() == ISD::UNDEF)
4761         continue;
4762       SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
4763       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
4764     }
4765     return Vec;
4766   }
4767   return SDValue();
4768 }
4769
4770 /// isREVMask - Check if a vector shuffle corresponds to a REV
4771 /// instruction with the specified blocksize.  (The order of the elements
4772 /// within each block of the vector is reversed.)
4773 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4774   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
4775          "Only possible block sizes for REV are: 16, 32, 64");
4776
4777   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4778   if (EltSz == 64)
4779     return false;
4780
4781   unsigned NumElts = VT.getVectorNumElements();
4782   unsigned BlockElts = M[0] + 1;
4783   // If the first shuffle index is UNDEF, be optimistic.
4784   if (M[0] < 0)
4785     BlockElts = BlockSize / EltSz;
4786
4787   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4788     return false;
4789
4790   for (unsigned i = 0; i < NumElts; ++i) {
4791     if (M[i] < 0)
4792       continue; // ignore UNDEF indices
4793     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
4794       return false;
4795   }
4796
4797   return true;
4798 }
4799
4800 // isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
4801 // TRN instruction.
4802 static unsigned isPermuteMask(ArrayRef<int> M, EVT VT, bool isV2undef) {
4803   unsigned NumElts = VT.getVectorNumElements();
4804   if (NumElts < 4)
4805     return 0;
4806
4807   bool ismatch = true;
4808
4809   // Check UZP1
4810   for (unsigned i = 0; i < NumElts; ++i) {
4811     unsigned answer = i * 2;
4812     if (isV2undef && answer >= NumElts)
4813       answer -= NumElts;
4814     if (M[i] != -1 && (unsigned)M[i] != answer) {
4815       ismatch = false;
4816       break;
4817     }
4818   }
4819   if (ismatch)
4820     return AArch64ISD::NEON_UZP1;
4821
4822   // Check UZP2
4823   ismatch = true;
4824   for (unsigned i = 0; i < NumElts; ++i) {
4825     unsigned answer = i * 2 + 1;
4826     if (isV2undef && answer >= NumElts)
4827       answer -= NumElts;
4828     if (M[i] != -1 && (unsigned)M[i] != answer) {
4829       ismatch = false;
4830       break;
4831     }
4832   }
4833   if (ismatch)
4834     return AArch64ISD::NEON_UZP2;
4835
4836   // Check ZIP1
4837   ismatch = true;
4838   for (unsigned i = 0; i < NumElts; ++i) {
4839     unsigned answer = i / 2 + NumElts * (i % 2);
4840     if (isV2undef && answer >= NumElts)
4841       answer -= NumElts;
4842     if (M[i] != -1 && (unsigned)M[i] != answer) {
4843       ismatch = false;
4844       break;
4845     }
4846   }
4847   if (ismatch)
4848     return AArch64ISD::NEON_ZIP1;
4849
4850   // Check ZIP2
4851   ismatch = true;
4852   for (unsigned i = 0; i < NumElts; ++i) {
4853     unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2);
4854     if (isV2undef && answer >= NumElts)
4855       answer -= NumElts;
4856     if (M[i] != -1 && (unsigned)M[i] != answer) {
4857       ismatch = false;
4858       break;
4859     }
4860   }
4861   if (ismatch)
4862     return AArch64ISD::NEON_ZIP2;
4863
4864   // Check TRN1
4865   ismatch = true;
4866   for (unsigned i = 0; i < NumElts; ++i) {
4867     unsigned answer = i + (NumElts - 1) * (i % 2);
4868     if (isV2undef && answer >= NumElts)
4869       answer -= NumElts;
4870     if (M[i] != -1 && (unsigned)M[i] != answer) {
4871       ismatch = false;
4872       break;
4873     }
4874   }
4875   if (ismatch)
4876     return AArch64ISD::NEON_TRN1;
4877
4878   // Check TRN2
4879   ismatch = true;
4880   for (unsigned i = 0; i < NumElts; ++i) {
4881     unsigned answer = 1 + i + (NumElts - 1) * (i % 2);
4882     if (isV2undef && answer >= NumElts)
4883       answer -= NumElts;
4884     if (M[i] != -1 && (unsigned)M[i] != answer) {
4885       ismatch = false;
4886       break;
4887     }
4888   }
4889   if (ismatch)
4890     return AArch64ISD::NEON_TRN2;
4891
4892   return 0;
4893 }
4894
4895 SDValue
4896 AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
4897                                            SelectionDAG &DAG) const {
4898   SDValue V1 = Op.getOperand(0);
4899   SDValue V2 = Op.getOperand(1);
4900   SDLoc dl(Op);
4901   EVT VT = Op.getValueType();
4902   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4903
4904   // Convert shuffles that are directly supported on NEON to target-specific
4905   // DAG nodes, instead of keeping them as shuffles and matching them again
4906   // during code selection.  This is more efficient and avoids the possibility
4907   // of inconsistencies between legalization and selection.
4908   ArrayRef<int> ShuffleMask = SVN->getMask();
4909
4910   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4911   if (EltSize > 64)
4912     return SDValue();
4913
4914   if (isREVMask(ShuffleMask, VT, 64))
4915     return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
4916   if (isREVMask(ShuffleMask, VT, 32))
4917     return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
4918   if (isREVMask(ShuffleMask, VT, 16))
4919     return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
4920
4921   unsigned ISDNo;
4922   if (V2.getOpcode() == ISD::UNDEF)
4923     ISDNo = isPermuteMask(ShuffleMask, VT, true);
4924   else
4925     ISDNo = isPermuteMask(ShuffleMask, VT, false);
4926
4927   if (ISDNo) {
4928     if (V2.getOpcode() == ISD::UNDEF)
4929       return DAG.getNode(ISDNo, dl, VT, V1, V1);
4930     else
4931       return DAG.getNode(ISDNo, dl, VT, V1, V2);
4932   }
4933
4934   SDValue Res;
4935   if (isConcatVector(Op, DAG, V1, V2, &ShuffleMask[0], Res))
4936     return Res;
4937
4938   // If the element of shuffle mask are all the same constant, we can
4939   // transform it into either NEON_VDUP or NEON_VDUPLANE
4940   if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
4941     int Lane = SVN->getSplatIndex();
4942     // If this is undef splat, generate it via "just" vdup, if possible.
4943     if (Lane == -1) Lane = 0;
4944
4945     // Test if V1 is a SCALAR_TO_VECTOR.
4946     if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4947       return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
4948     }
4949     // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
4950     if (V1.getOpcode() == ISD::BUILD_VECTOR) {
4951       bool IsScalarToVector = true;
4952       for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
4953         if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
4954             i != (unsigned)Lane) {
4955           IsScalarToVector = false;
4956           break;
4957         }
4958       if (IsScalarToVector)
4959         return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
4960                            V1.getOperand(Lane));
4961     }
4962
4963     // Test if V1 is a EXTRACT_SUBVECTOR.
4964     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4965       int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
4966       return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
4967                          DAG.getConstant(Lane + ExtLane, MVT::i64));
4968     }
4969     // Test if V1 is a CONCAT_VECTORS.
4970     if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
4971         V1.getOperand(1).getOpcode() == ISD::UNDEF) {
4972       SDValue Op0 = V1.getOperand(0);
4973       assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
4974              "Invalid vector lane access");
4975       return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
4976                          DAG.getConstant(Lane, MVT::i64));
4977     }
4978
4979     return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
4980                        DAG.getConstant(Lane, MVT::i64));
4981   }
4982
4983   int Length = ShuffleMask.size();
4984   int V1EltNum = V1.getValueType().getVectorNumElements();
4985
4986   // If the number of v1 elements is the same as the number of shuffle mask
4987   // element and the shuffle masks are sequential values, we can transform
4988   // it into NEON_VEXTRACT.
4989   if (V1EltNum == Length) {
4990     // Check if the shuffle mask is sequential.
4991     int SkipUndef = 0;
4992     while (ShuffleMask[SkipUndef] == -1) {
4993       SkipUndef++;
4994     }
4995     int CurMask = ShuffleMask[SkipUndef];
4996     if (CurMask >= SkipUndef) {
4997       bool IsSequential = true;
4998       for (int I = SkipUndef; I < Length; ++I) {
4999         if (ShuffleMask[I] != -1 && ShuffleMask[I] != CurMask) {
5000           IsSequential = false;
5001           break;
5002         }
5003         CurMask++;
5004       }
5005       if (IsSequential) {
5006         assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
5007         unsigned VecSize = EltSize * V1EltNum;
5008         unsigned Index = (EltSize / 8) * (ShuffleMask[SkipUndef] - SkipUndef);
5009         if (VecSize == 64 || VecSize == 128)
5010           return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
5011                              DAG.getConstant(Index, MVT::i64));
5012       }
5013     }
5014   }
5015
5016   // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
5017   // by element from V2 to V1 .
5018   // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
5019   // better choice to be inserted than V1 as less insert needed, so we count
5020   // element to be inserted for both V1 and V2, and select less one as insert
5021   // target.
5022
5023   // Collect elements need to be inserted and their index.
5024   SmallVector<int, 8> NV1Elt;
5025   SmallVector<int, 8> N1Index;
5026   SmallVector<int, 8> NV2Elt;
5027   SmallVector<int, 8> N2Index;
5028   for (int I = 0; I != Length; ++I) {
5029     if (ShuffleMask[I] != I) {
5030       NV1Elt.push_back(ShuffleMask[I]);
5031       N1Index.push_back(I);
5032     }
5033   }
5034   for (int I = 0; I != Length; ++I) {
5035     if (ShuffleMask[I] != (I + V1EltNum)) {
5036       NV2Elt.push_back(ShuffleMask[I]);
5037       N2Index.push_back(I);
5038     }
5039   }
5040
5041   // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
5042   // will be inserted.
5043   SDValue InsV = V1;
5044   SmallVector<int, 8> InsMasks = NV1Elt;
5045   SmallVector<int, 8> InsIndex = N1Index;
5046   if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
5047     if (NV1Elt.size() > NV2Elt.size()) {
5048       InsV = V2;
5049       InsMasks = NV2Elt;
5050       InsIndex = N2Index;
5051     }
5052   } else {
5053     InsV = DAG.getNode(ISD::UNDEF, dl, VT);
5054   }
5055
5056   for (int I = 0, E = InsMasks.size(); I != E; ++I) {
5057     SDValue ExtV = V1;
5058     int Mask = InsMasks[I];
5059     if (Mask >= V1EltNum) {
5060       ExtV = V2;
5061       Mask -= V1EltNum;
5062     }
5063     // Any value type smaller than i32 is illegal in AArch64, and this lower
5064     // function is called after legalize pass, so we need to legalize
5065     // the result here.
5066     EVT EltVT;
5067     if (VT.getVectorElementType().isFloatingPoint())
5068       EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
5069     else
5070       EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
5071
5072     if (Mask >= 0) {
5073       ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
5074                          DAG.getConstant(Mask, MVT::i64));
5075       InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
5076                          DAG.getConstant(InsIndex[I], MVT::i64));
5077     }
5078   }
5079   return InsV;
5080 }
5081
5082 AArch64TargetLowering::ConstraintType
5083 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
5084   if (Constraint.size() == 1) {
5085     switch (Constraint[0]) {
5086     default: break;
5087     case 'w': // An FP/SIMD vector register
5088       return C_RegisterClass;
5089     case 'I': // Constant that can be used with an ADD instruction
5090     case 'J': // Constant that can be used with a SUB instruction
5091     case 'K': // Constant that can be used with a 32-bit logical instruction
5092     case 'L': // Constant that can be used with a 64-bit logical instruction
5093     case 'M': // Constant that can be used as a 32-bit MOV immediate
5094     case 'N': // Constant that can be used as a 64-bit MOV immediate
5095     case 'Y': // Floating point constant zero
5096     case 'Z': // Integer constant zero
5097       return C_Other;
5098     case 'Q': // A memory reference with base register and no offset
5099       return C_Memory;
5100     case 'S': // A symbolic address
5101       return C_Other;
5102     }
5103   }
5104
5105   // FIXME: Ump, Utf, Usa, Ush
5106   // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
5107   //      whatever they may be
5108   // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
5109   // Usa: An absolute symbolic address
5110   // Ush: The high part (bits 32:12) of a pc-relative symbolic address
5111   assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
5112          && Constraint != "Ush" && "Unimplemented constraints");
5113
5114   return TargetLowering::getConstraintType(Constraint);
5115 }
5116
5117 TargetLowering::ConstraintWeight
5118 AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
5119                                                 const char *Constraint) const {
5120
5121   llvm_unreachable("Constraint weight unimplemented");
5122 }
5123
5124 void
5125 AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
5126                                                     std::string &Constraint,
5127                                                     std::vector<SDValue> &Ops,
5128                                                     SelectionDAG &DAG) const {
5129   SDValue Result(0, 0);
5130
5131   // Only length 1 constraints are C_Other.
5132   if (Constraint.size() != 1) return;
5133
5134   // Only C_Other constraints get lowered like this. That means constants for us
5135   // so return early if there's no hope the constraint can be lowered.
5136
5137   switch(Constraint[0]) {
5138   default: break;
5139   case 'I': case 'J': case 'K': case 'L':
5140   case 'M': case 'N': case 'Z': {
5141     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
5142     if (!C)
5143       return;
5144
5145     uint64_t CVal = C->getZExtValue();
5146     uint32_t Bits;
5147
5148     switch (Constraint[0]) {
5149     default:
5150       // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
5151       // is a peculiarly useless SUB constraint.
5152       llvm_unreachable("Unimplemented C_Other constraint");
5153     case 'I':
5154       if (CVal <= 0xfff)
5155         break;
5156       return;
5157     case 'K':
5158       if (A64Imms::isLogicalImm(32, CVal, Bits))
5159         break;
5160       return;
5161     case 'L':
5162       if (A64Imms::isLogicalImm(64, CVal, Bits))
5163         break;
5164       return;
5165     case 'Z':
5166       if (CVal == 0)
5167         break;
5168       return;
5169     }
5170
5171     Result = DAG.getTargetConstant(CVal, Op.getValueType());
5172     break;
5173   }
5174   case 'S': {
5175     // An absolute symbolic address or label reference.
5176     if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
5177       Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
5178                                           GA->getValueType(0));
5179     } else if (const BlockAddressSDNode *BA
5180                  = dyn_cast<BlockAddressSDNode>(Op)) {
5181       Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
5182                                          BA->getValueType(0));
5183     } else if (const ExternalSymbolSDNode *ES
5184                  = dyn_cast<ExternalSymbolSDNode>(Op)) {
5185       Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
5186                                            ES->getValueType(0));
5187     } else
5188       return;
5189     break;
5190   }
5191   case 'Y':
5192     if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
5193       if (CFP->isExactlyValue(0.0)) {
5194         Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
5195         break;
5196       }
5197     }
5198     return;
5199   }
5200
5201   if (Result.getNode()) {
5202     Ops.push_back(Result);
5203     return;
5204   }
5205
5206   // It's an unknown constraint for us. Let generic code have a go.
5207   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
5208 }
5209
5210 std::pair<unsigned, const TargetRegisterClass*>
5211 AArch64TargetLowering::getRegForInlineAsmConstraint(
5212                                                   const std::string &Constraint,
5213                                                   MVT VT) const {
5214   if (Constraint.size() == 1) {
5215     switch (Constraint[0]) {
5216     case 'r':
5217       if (VT.getSizeInBits() <= 32)
5218         return std::make_pair(0U, &AArch64::GPR32RegClass);
5219       else if (VT == MVT::i64)
5220         return std::make_pair(0U, &AArch64::GPR64RegClass);
5221       break;
5222     case 'w':
5223       if (VT == MVT::f16)
5224         return std::make_pair(0U, &AArch64::FPR16RegClass);
5225       else if (VT == MVT::f32)
5226         return std::make_pair(0U, &AArch64::FPR32RegClass);
5227       else if (VT.getSizeInBits() == 64)
5228         return std::make_pair(0U, &AArch64::FPR64RegClass);
5229       else if (VT.getSizeInBits() == 128)
5230         return std::make_pair(0U, &AArch64::FPR128RegClass);
5231       break;
5232     }
5233   }
5234
5235   // Use the default implementation in TargetLowering to convert the register
5236   // constraint into a member of a register class.
5237   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
5238 }
5239
5240 /// Represent NEON load and store intrinsics as MemIntrinsicNodes.
5241 /// The associated MachineMemOperands record the alignment specified
5242 /// in the intrinsic calls.
5243 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5244                                                const CallInst &I,
5245                                                unsigned Intrinsic) const {
5246   switch (Intrinsic) {
5247   case Intrinsic::arm_neon_vld1:
5248   case Intrinsic::arm_neon_vld2:
5249   case Intrinsic::arm_neon_vld3:
5250   case Intrinsic::arm_neon_vld4:
5251   case Intrinsic::aarch64_neon_vld1x2:
5252   case Intrinsic::aarch64_neon_vld1x3:
5253   case Intrinsic::aarch64_neon_vld1x4:
5254   case Intrinsic::arm_neon_vld2lane:
5255   case Intrinsic::arm_neon_vld3lane:
5256   case Intrinsic::arm_neon_vld4lane: {
5257     Info.opc = ISD::INTRINSIC_W_CHAIN;
5258     // Conservatively set memVT to the entire set of vectors loaded.
5259     uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
5260     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
5261     Info.ptrVal = I.getArgOperand(0);
5262     Info.offset = 0;
5263     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
5264     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
5265     Info.vol = false; // volatile loads with NEON intrinsics not supported
5266     Info.readMem = true;
5267     Info.writeMem = false;
5268     return true;
5269   }
5270   case Intrinsic::arm_neon_vst1:
5271   case Intrinsic::arm_neon_vst2:
5272   case Intrinsic::arm_neon_vst3:
5273   case Intrinsic::arm_neon_vst4:
5274   case Intrinsic::aarch64_neon_vst1x2:
5275   case Intrinsic::aarch64_neon_vst1x3:
5276   case Intrinsic::aarch64_neon_vst1x4:
5277   case Intrinsic::arm_neon_vst2lane:
5278   case Intrinsic::arm_neon_vst3lane:
5279   case Intrinsic::arm_neon_vst4lane: {
5280     Info.opc = ISD::INTRINSIC_VOID;
5281     // Conservatively set memVT to the entire set of vectors stored.
5282     unsigned NumElts = 0;
5283     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
5284       Type *ArgTy = I.getArgOperand(ArgI)->getType();
5285       if (!ArgTy->isVectorTy())
5286         break;
5287       NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
5288     }
5289     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
5290     Info.ptrVal = I.getArgOperand(0);
5291     Info.offset = 0;
5292     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
5293     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
5294     Info.vol = false; // volatile stores with NEON intrinsics not supported
5295     Info.readMem = false;
5296     Info.writeMem = true;
5297     return true;
5298   }
5299   default:
5300     break;
5301   }
5302
5303   return false;
5304 }