Revert r240137 (Fixed/added namespace ending comments using clang-tidy. NFC)

[oota-llvm.git] / lib / Target / AArch64 / AArch64InstrFormats.td
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td

index 42326fc182bc7453ae44d94c1503e6a5202ef150..2c52f340d6d1501744e5d57116877f74b4002559 100644 (file)
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -441,20 +441,26 @@ def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
  // instructions for splatting repeating bit patterns across the immediate.
  def logical_imm32_XFORM : SDNodeXForm<imm, [{
    uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
-  return CurDAG->getTargetConstant(enc, MVT::i32);
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
  }]>;
  def logical_imm64_XFORM : SDNodeXForm<imm, [{
    uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
-  return CurDAG->getTargetConstant(enc, MVT::i32);
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
  }]>;
  
-def LogicalImm32Operand : AsmOperandClass {
-  let Name = "LogicalImm32";
-  let DiagnosticType = "LogicalSecondSource";
-}
-def LogicalImm64Operand : AsmOperandClass {
-  let Name = "LogicalImm64";
-  let DiagnosticType = "LogicalSecondSource";
+let DiagnosticType = "LogicalSecondSource" in {
+  def LogicalImm32Operand : AsmOperandClass {
+    let Name = "LogicalImm32";
+  }
+  def LogicalImm64Operand : AsmOperandClass {
+    let Name = "LogicalImm64";
+  }
+  def LogicalImm32NotOperand : AsmOperandClass {
+    let Name = "LogicalImm32Not";
+  }
+  def LogicalImm64NotOperand : AsmOperandClass {
+    let Name = "LogicalImm64Not";
+  }
  }
  def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
    return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
@@ -468,6 +474,12 @@ def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
    let PrintMethod = "printLogicalImm64";
    let ParserMatchClass = LogicalImm64Operand;
  }
+def logical_imm32_not : Operand<i32> {
+  let ParserMatchClass = LogicalImm32NotOperand;
+}
+def logical_imm64_not : Operand<i64> {
+  let ParserMatchClass = LogicalImm64NotOperand;
+}
  
  // imm0_65535 predicate - True if the immediate is in the range [0,65535].
  def Imm0_65535Operand : AsmImmRange<0, 65535>;
@@ -527,6 +539,11 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
    let ParserMatchClass = Imm0_7Operand;
  }
  
+// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
+def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 16;
+}]>;
+
  // An arithmetic shifter operand:
  //  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
  //  {5-0} - imm6
@@ -665,7 +682,7 @@ def fpimm32 : Operand<f32>,
      }], SDNodeXForm<fpimm, [{
        APFloat InVal = N->getValueAPF();
        uint32_t enc = AArch64_AM::getFP32Imm(InVal);
-      return CurDAG->getTargetConstant(enc, MVT::i32);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
      }]>> {
    let ParserMatchClass = FPImmOperand;
    let PrintMethod = "printFPImmOperand";
@@ -676,7 +693,7 @@ def fpimm64 : Operand<f64>,
      }], SDNodeXForm<fpimm, [{
        APFloat InVal = N->getValueAPF();
        uint32_t enc = AArch64_AM::getFP64Imm(InVal);
-      return CurDAG->getTargetConstant(enc, MVT::i32);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
      }]>> {
    let ParserMatchClass = FPImmOperand;
    let PrintMethod = "printFPImmOperand";
@@ -751,7 +768,7 @@ def simdimmtype10 : Operand<i32>,
        uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
                                                             .bitcastToAPInt()
                                                             .getZExtValue());
-      return CurDAG->getTargetConstant(enc, MVT::i32);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
      }]>> {
    let ParserMatchClass = SIMDImmType10Operand;
    let PrintMethod = "printSIMDType10Operand";
@@ -764,15 +781,17 @@ def simdimmtype10 : Operand<i32>,
  
  // Base encoding for system instruction operands.
  let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : I<oops, iops, asm, operands, "", []> {
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
+                  list<dag> pattern = []>
+    : I<oops, iops, asm, operands, "", pattern> {
    let Inst{31-22} = 0b1101010100;
    let Inst{21}    = L;
  }
  
  // System instructions which do not have an Rt register.
-class SimpleSystemI<bit L, dag iops, string asm, string operands>
-    : BaseSystemI<L, (outs), iops, asm, operands> {
+class SimpleSystemI<bit L, dag iops, string asm, string operands,
+                    list<dag> pattern = []>
+    : BaseSystemI<L, (outs), iops, asm, operands, pattern> {
    let Inst{4-0} = 0b11111;
  }
  
@@ -785,13 +804,17 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
  }
  
  // Hint instructions that take both a CRm and a 3-bit immediate.
-class HintI<string mnemonic>
-    : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
-      Sched<[WriteHint]> {
-  bits <7> imm;
-  let Inst{20-12} = 0b000110010;
-  let Inst{11-5} = imm;
-}
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity
+let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
+  class HintI<string mnemonic>
+      : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "",
+                      [(int_aarch64_hint imm0_127:$imm)]>,
+        Sched<[WriteHint]> {
+    bits <7> imm;
+    let Inst{20-12} = 0b000110010;
+    let Inst{11-5} = imm;
+  }
  
  // System instructions taking a single literal operand which encodes into
  // CRm. op2 differentiates the opcodes.
@@ -803,8 +826,9 @@ def barrier_op : Operand<i32> {
    let PrintMethod = "printBarrierOption";
    let ParserMatchClass = BarrierAsmOperand;
  }
-class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
-    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
+                 list<dag> pattern = []>
+    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
        Sched<[WriteBarrier]> {
    bits<4> CRm;
    let Inst{20-12} = 0b000110011;
@@ -819,7 +843,7 @@ def MRSSystemRegisterOperand : AsmOperandClass {
    let ParserMethod = "tryParseSysReg";
    let DiagnosticType = "MRS";
  }
-// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
  def mrs_sysreg_op : Operand<i32> {
    let ParserMatchClass = MRSSystemRegisterOperand;
    let DecoderMethod = "DecodeMRSSystemRegister";
@@ -839,9 +863,8 @@ def msr_sysreg_op : Operand<i32> {
  
  class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
                         "mrs", "\t$Rt, $systemreg"> {
-  bits<15> systemreg;
-  let Inst{20} = 1;
-  let Inst{19-5} = systemreg;
+  bits<16> systemreg;
+  let Inst{20-5} = systemreg;
  }
  
  // FIXME: Some of these def NZCV, others don't. Best way to model that?
@@ -849,9 +872,8 @@ class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
  // would do it, but feels like overkill at this point.
  class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
                         "msr", "\t$systemreg, $Rt"> {
-  bits<15> systemreg;
-  let Inst{20} = 1;
-  let Inst{19-5} = systemreg;
+  bits<16> systemreg;
+  let Inst{20-5} = systemreg;
  }
  
  def SystemPStateFieldOperand : AsmOperandClass {
@@ -963,8 +985,14 @@ def ccode : Operand<i32> {
    let ParserMatchClass = CondCode;
  }
  def inv_ccode : Operand<i32> {
+  // AL and NV are invalid in the aliases which use inv_ccode
    let PrintMethod = "printInverseCondCode";
    let ParserMatchClass = CondCode;
+  let MCOperandPredicate = [{
+    return MCOp.isImm() &&
+           MCOp.getImm() != AArch64CC::AL &&
+           MCOp.getImm() != AArch64CC::NV;
+  }];
  }
  
  // Conditional branch target. 19-bit immediate. The low two bits of the target
@@ -1321,14 +1349,15 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
  }
  
  multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+  // MADD/MSUB generation is decided by MachineCombiner.cpp
    def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
-      [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+      [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
        Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
      let Inst{31} = 0;
    }
  
    def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
-      [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+      [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
        Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
      let Inst{31} = 1;
    }
@@ -1606,12 +1635,18 @@ class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
  
  multiclass AddSub<bit isSub, string mnemonic,
                    SDPatternOperator OpNode = null_frag> {
-  let hasSideEffects = 0 in {
+  let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
    // Add/Subtract immediate
+  // Increase the weight of the immediate variant to try to match it before
+  // the extended register variant.
+  // We used to match the register variant before the immediate when the
+  // register argument could be implicitly zero-extended.
+  let AddedComplexity = 6 in
    def Wri  : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
                             mnemonic, OpNode> {
      let Inst{31} = 0;
    }
+  let AddedComplexity = 6 in
    def Xri  : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
                             mnemonic, OpNode> {
      let Inst{31} = 1;
@@ -1929,22 +1964,32 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
      : InstAlias<asm#" $dst, $src1, $src2",
                  (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
  
-let AddedComplexity = 6 in
-multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
+                      string Alias> {
+  let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
    def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
                             [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
                                                 logical_imm32:$imm))]> {
      let Inst{31} = 0;
      let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
    }
+  let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
    def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
                             [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
                                                 logical_imm64:$imm))]> {
      let Inst{31} = 1;
    }
+
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
+                      logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
+                       logical_imm64_not:$imm), 0>;
  }
  
-multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
+                       string Alias> {
    let isCompare = 1, Defs = [NZCV] in {
    def Wri  : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
        [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
@@ -1956,6 +2001,13 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
      let Inst{31} = 1;
    }
    } // end Defs = [NZCV]
+
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
+                      logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
+                       logical_imm64_not:$imm), 0>;
  }
  
  class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
@@ -1966,8 +2018,10 @@ class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
  // Split from LogicalImm as not all instructions have both.
  multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
                        SDPatternOperator OpNode> {
+  let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
    def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
    def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+  }
  
    def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
                              [(set GPR32:$Rd, (OpNode GPR32:$Rn,
@@ -2138,7 +2192,8 @@ class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
  
  def inv_cond_XFORM : SDNodeXForm<imm, [{
    AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
-  return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32);
+  return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N),
+                                   MVT::i32);
  }]>;
  
  multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
@@ -2948,7 +3003,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
      : BaseLoadStorePreIdx<sz, V, opc,
                       (outs GPR64sp:$wback, regtype:$Rt),
                       (ins GPR64sp:$Rn, simm9:$offset), asm,
-                     "$Rn = $wback", []>,
+                     "$Rn = $wback,@earlyclobber $wback", []>,
        Sched<[WriteLD, WriteAdr]>;
  
  let mayStore = 1, mayLoad = 0 in
@@ -2957,7 +3012,7 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
      : BaseLoadStorePreIdx<sz, V, opc,
                        (outs GPR64sp:$wback),
                        (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
-                      asm, "$Rn = $wback",
+                      asm, "$Rn = $wback,@earlyclobber $wback",
        [(set GPR64sp:$wback,
              (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
        Sched<[WriteAdr, WriteST]>;
@@ -2967,7 +3022,6 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
  // Load/store post-indexed
  //---
  
-// (pre-index) load/stores.
  class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
                            string asm, string cstr, list<dag> pat>
      : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
@@ -2995,7 +3049,7 @@ class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
      : BaseLoadStorePostIdx<sz, V, opc,
                        (outs GPR64sp:$wback, regtype:$Rt),
                        (ins GPR64sp:$Rn, simm9:$offset),
-                      asm, "$Rn = $wback", []>,
+                      asm, "$Rn = $wback,@earlyclobber $wback", []>,
        Sched<[WriteLD, WriteI]>;
  
  let mayStore = 1, mayLoad = 0 in
@@ -3004,7 +3058,7 @@ class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
      : BaseLoadStorePostIdx<sz, V, opc,
                        (outs GPR64sp:$wback),
                        (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
-                       asm, "$Rn = $wback",
+                       asm, "$Rn = $wback,@earlyclobber $wback",
        [(set GPR64sp:$wback,
              (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
      Sched<[WriteAdr, WriteST, ReadAdrBase]>;
@@ -3068,7 +3122,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
  // (pre-indexed)
  class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
                                string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
    bits<5> Rt;
    bits<5> Rt2;
    bits<5> Rn;
@@ -3109,7 +3163,7 @@ class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
  
  class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
                                string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
    bits<5> Rt;
    bits<5> Rt2;
    bits<5> Rn;
@@ -3235,6 +3289,10 @@ class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
      : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
    bits<5> Rt;
    bits<5> Rn;
+  let Inst{20-16} = 0b11111;
+  let Unpredictable{20-16} = 0b11111;
+  let Inst{14-10} = 0b11111;
+  let Unpredictable{14-10} = 0b11111;
    let Inst{9-5} = Rn;
    let Inst{4-0} = Rt;
  
@@ -4336,7 +4394,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
  }
  
  multiclass SIMDVectorLShiftLongBySizeBHS {
-  let neverHasSideEffects = 1 in {
+  let hasSideEffects = 0 in {
    def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
                                               "shll", ".8h",  ".8b", "8">;
    def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
@@ -5213,6 +5271,10 @@ multiclass SIMDZipVector<bits<3>opc, string asm,
    def v2i64  : BaseSIMDZipVector<0b111, opc, V128,
        asm, ".2d", OpNode, v2i64>;
  
+  def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
    def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
          (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
    def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
@@ -5247,6 +5309,27 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
    let Inst{4-0}   = Rd;
  }
  
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
+                        dag oops, dag iops, string asm,
+            list<dag> pattern>
+  : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = R;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
  multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
                              SDPatternOperator OpNode> {
    def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
@@ -5274,6 +5357,16 @@ multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
    def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
  }
  
+multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
+                                     (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm), 
+                                     asm, []>;
+  def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
+                                     (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm), 
+                                     asm, []>;
+}
+
  multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
                               SDPatternOperator OpNode = null_frag> {
    let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@@ -5834,7 +5927,7 @@ multiclass SIMDIns {
      let Inst{20-18} = idx;
      let Inst{17-16} = 0b10;
      let Inst{14-12} = idx2;
-    let Inst{11} = 0;
+    let Inst{11} = {?};
    }
    def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
      bits<2> idx;
@@ -5842,7 +5935,7 @@ multiclass SIMDIns {
      let Inst{20-19} = idx;
      let Inst{18-16} = 0b100;
      let Inst{14-13} = idx2;
-    let Inst{12-11} = 0;
+    let Inst{12-11} = {?,?};
    }
    def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
      bits<1> idx;
@@ -5850,7 +5943,7 @@ multiclass SIMDIns {
      let Inst{20} = idx;
      let Inst{19-16} = 0b1000;
      let Inst{14} = idx2;
-    let Inst{13-11} = 0;
+    let Inst{13-11} = {?,?,?};
    }
  
    // For all forms of the INS instruction, the "mov" mnemonic is the
@@ -8465,6 +8558,174 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
  }
  } // end of 'let Predicates = [HasNEON]'
  
+//----------------------------------------------------------------------------
+// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON, HasV8_1a] in {
+
+class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
+                                    RegisterOperand regtype, string asm, 
+                                    string kind, list<dag> pattern>
+  : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind, 
+                                pattern> {
+  let Inst{21}=0;
+}
+multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
+                                             SDPatternOperator Accum> {
+  def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
+                                                   (v4i16 V64:$Rm)))))]>;         
+  def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm)))))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
+    [(set (v2i32 V64:$dst),
+          (Accum (v2i32 V64:$Rd),
+                 (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
+                                                   (v2i32 V64:$Rm)))))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
+                                                   (v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
+                                     SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                          V64, V64, V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh
+                          (v4i16 V64:$Rn),
+                          (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                          V128, V128, V128_lo, VectorIndexH,
+                                          asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh
+                          (v8i16 V128:$Rn),
+                          (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                   VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                          V64, V64, V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$dst),
+        (Accum (v2i32 V64:$Rd),
+               (v2i32 (int_aarch64_neon_sqrdmulh
+                        (v2i32 V64:$Rn),
+                        (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but 
+  // an intermediate EXTRACT_SUBREG would be untyped.
+  // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we 
+  // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                       (i32 (vector_extract 
+                               (v4i32 (insert_subvector
+                                       (undef), 
+                                        (v2i32 (int_aarch64_neon_sqrdmulh 
+                                                 (v2i32 V64:$Rn),
+                                                 (v2i32 (AArch64duplane32 
+                                                          (v4i32 V128:$Rm),
+                                                          VectorIndexS:$idx)))),
+                                      (i32 0))),
+                               (i64 0))))),
+            (EXTRACT_SUBREG
+                (v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
+                          (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), 
+                                                FPR32Op:$Rd, 
+                                                ssub)), 
+                          V64:$Rn,
+                          V128:$Rm, 
+                          VectorIndexS:$idx)),
+                ssub)>;
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                          V128, V128, V128, VectorIndexS,
+                                          asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh
+                          (v4i32 V128:$Rn),
+                          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                   VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+  // an intermediate EXTRACT_SUBREG would be untyped.
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                        (i32 (vector_extract 
+                               (v4i32 (int_aarch64_neon_sqrdmulh 
+                                        (v4i32 V128:$Rn),
+                                        (v4i32 (AArch64duplane32 
+                                                 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx)))),
+                               (i64 0))))),
+            (EXTRACT_SUBREG
+                (v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
+                         (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), 
+                                               FPR32Op:$Rd, 
+                                               ssub)), 
+                         V128:$Rn,
+                         V128:$Rm, 
+                         VectorIndexS:$idx)),
+                ssub)>;
+
+  def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+                                        FPR16Op, FPR16Op, V128_lo,
+                                        VectorIndexH, asm, ".h", "", "", ".h", 
+                                        []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                        FPR32Op, FPR32Op, V128, VectorIndexS,
+                                        asm, ".s", "", "", ".s",
+    [(set (i32 FPR32Op:$dst),
+          (Accum (i32 FPR32Op:$Rd),
+                 (i32 (int_aarch64_neon_sqrdmulh
+                        (i32 FPR32Op:$Rn),
+                        (i32 (vector_extract (v4i32 V128:$Rm),
+                                             VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+} // let Predicates = [HasNeon, HasV8_1a]
+
  //----------------------------------------------------------------------------
  // Crypto extensions
  //----------------------------------------------------------------------------
@@ -8561,6 +8822,178 @@ class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
                 [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
  } // end of 'let Predicates = [HasCrypto]'
  
+//----------------------------------------------------------------------------
+// v8.1 atomic instructions extension:
+// * CAS
+// * CASP
+// * SWP
+// * LDOPregister<OP>, and aliases STOPregister<OP>
+
+// Instruction encodings:
+//
+//      31 30|29  24|23|22|21|20 16|15|14  10|9 5|4 0
+// CAS  SZ   |001000|1 |A |1 |Rs   |R |11111 |Rn |Rt
+// CASP  0|SZ|001000|0 |A |1 |Rs   |R |11111 |Rn |Rt
+// SWP  SZ   |111000|A |R |1 |Rs   |1 |OPC|00|Rn |Rt
+// LD   SZ   |111000|A |R |1 |Rs   |0 |OPC|00|Rn |Rt
+// ST   SZ   |111000|A |R |1 |Rs   |0 |OPC|00|Rn |11111
+
+// Instruction syntax:
+//
+// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>]
+// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>]
+// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>]
+// ST<OP>{<order>} <Xs>, [<Xn|SP>]
+
+let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
+                      string cstr, list<dag> pattern>
+      : I<oops, iops, asm, operands, cstr, pattern> {
+  bits<2> Sz;
+  bit NP;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b001000;
+  let Inst{23} = NP;
+  let Inst{22} = Acq;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = Rel;
+  let Inst{14-10} = 0b11111;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+class BaseCAS<string order, string size, RegisterClass RC>
+      : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+                        "cas" # order # size, "\t$Rs, $Rt, [$Rn]",
+                        "$out = $Rs",[]> {
+  let NP = 1;
+}
+
+multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseCAS<order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseCAS<order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseCAS<order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseCAS<order, "", GPR64>;
+}
+
+class BaseCASP<string order, string size, RegisterOperand RC>
+      : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+                        "casp" # order # size, "\t$Rs, $Rt, [$Rn]",
+                        "$out = $Rs",[]> {
+  let NP = 0;
+}
+
+multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in 
+    def s : BaseCASP<order, "", WSeqPairClassOperand>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in 
+    def d : BaseCASP<order, "", XSeqPairClassOperand>;
+}
+
+let Predicates = [HasV8_1a] in
+class BaseSWP<string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]> {
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc = 0b000;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b1;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseSWP<order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseSWP<order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseSWP<order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseSWP<order, "", GPR64>;
+}
+
+let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]> {
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel, 
+                        string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in 
+    def b : BaseLDOPregister<op, order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in 
+    def h : BaseLDOPregister<op, order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in 
+    def s : BaseLDOPregister<op, order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in 
+    def d : BaseLDOPregister<op, order, "", GPR64>;
+}
+
+let Predicates = [HasV8_1a] in
+class BaseSTOPregister<string asm, RegisterClass OP, Register Reg,
+                        Instruction inst> :
+      InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
+
+multiclass STOPregister<string asm, string instr> {
+  def : BaseSTOPregister<asm # "lb", GPR32, WZR, 
+                    !cast<Instruction>(instr # "Lb")>;
+  def : BaseSTOPregister<asm # "lh", GPR32, WZR, 
+                    !cast<Instruction>(instr # "Lh")>;
+  def : BaseSTOPregister<asm # "l",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "Ls")>;
+  def : BaseSTOPregister<asm # "l",  GPR64, XZR, 
+                    !cast<Instruction>(instr # "Ld")>;
+  def : BaseSTOPregister<asm # "b",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "b")>;
+  def : BaseSTOPregister<asm # "h",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "h")>;
+  def : BaseSTOPregister<asm,        GPR32, WZR, 
+                    !cast<Instruction>(instr # "s")>;
+  def : BaseSTOPregister<asm,        GPR64, XZR, 
+                    !cast<Instruction>(instr # "d")>;
+}
+
+//----------------------------------------------------------------------------
  // Allow the size specifier tokens to be upper case, not just lower.
  def : TokenAlias<".8B", ".8b">;
  def : TokenAlias<".4H", ".4h">;