Fixes ARM LNT bot from SLP change in O3

[oota-llvm.git] / lib / Target / ARM / ARMInstrNEON.td
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td

index 31dd843fe898096c7b1bc8d739fc77b241de1f6a..af4f4d1914a73ec6f9058a14df5c3c2a1a164540 100644 (file)
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -398,6 +398,41 @@ def VecListFourQWordIndexed : Operand<i32> {
    let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
  }
  
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
+def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 2;
+}]>;
+def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 2;
+}]>;
+def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 1;
+}]>;
+def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 1;
+}]>;
+def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
  
  //===----------------------------------------------------------------------===//
  // NEON-specific DAG Nodes.
@@ -591,7 +626,7 @@ class VLD1D<bits<4> op7_4, string Dt>
            "vld1", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  class VLD1Q<bits<4> op7_4, string Dt>
    : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
@@ -599,7 +634,7 @@ class VLD1Q<bits<4> op7_4, string Dt>
            "vld1", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  
  def  VLD1d8   : VLD1D<{0,0,0,?}, "8">;
@@ -620,16 +655,14 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u,
                          "vld1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  multiclass VLD1QWB<bits<4> op7_4, string Dt> {
@@ -639,16 +672,14 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                          "vld1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  
@@ -668,7 +699,7 @@ class VLD1D3<bits<4> op7_4, string Dt>
            "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
    def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
@@ -677,16 +708,14 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                          "vld1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  
@@ -709,7 +738,7 @@ class VLD1D4<bits<4> op7_4, string Dt>
            "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
    def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
@@ -718,16 +747,14 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                          "vld1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  
@@ -751,7 +778,7 @@ class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
            "vld2", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST2Instruction";
  }
  
  def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>;
@@ -775,16 +802,14 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST2Instruction";
    }
    def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm), itin,
                          "vld2", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST2Instruction";
    }
  }
  
@@ -818,7 +843,7 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
  }
  
  def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
@@ -837,7 +862,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
            "$Rn.addr = $wb", []> {
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
  }
  
  def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
@@ -877,7 +902,7 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
  }
  
  def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
@@ -896,7 +921,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
            "$Rn.addr = $wb", []> {
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
  }
  
  def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
@@ -1313,7 +1338,6 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{4} = Rn{4};
      let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
    }
    def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                          (outs VecListOneDAllLanes:$Vd, GPR:$wb),
@@ -1322,7 +1346,6 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
                          "$Rn.addr = $wb", []> {
      let Inst{4} = Rn{4};
      let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
    }
  }
  multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
@@ -1334,7 +1357,6 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{4} = Rn{4};
      let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
    }
    def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                          (outs VecListDPairAllLanes:$Vd, GPR:$wb),
@@ -1343,7 +1365,6 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
                          "$Rn.addr = $wb", []> {
      let Inst{4} = Rn{4};
      let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
    }
  }
  
@@ -1384,7 +1405,6 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{4} = Rn{4};
      let DecoderMethod = "DecodeVLD2DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
    }
    def _register : NLdSt<1, 0b10, 0b1101, op7_4,
                          (outs VdTy:$Vd, GPR:$wb),
@@ -1393,7 +1413,6 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
                          "$Rn.addr = $wb", []> {
      let Inst{4} = Rn{4};
      let DecoderMethod = "DecodeVLD2DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
    }
  }
  
@@ -1545,14 +1564,14 @@ class VST1D<bits<4> op7_4, string Dt>
            IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  class VST1Q<bits<4> op7_4, string Dt>
    : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd),
            IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  
  def  VST1d8   : VST1D<{0,0,0,?}, "8">;
@@ -1573,8 +1592,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd),
@@ -1582,8 +1600,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
                          "vst1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  multiclass VST1QWB<bits<4> op7_4, string Dt> {
@@ -1593,8 +1610,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd),
@@ -1602,8 +1618,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
                          "vst1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  
@@ -1624,7 +1639,7 @@ class VST1D3<bits<4> op7_4, string Dt>
            IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  multiclass VST1D3WB<bits<4> op7_4, string Dt> {
    def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
@@ -1633,8 +1648,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
@@ -1642,8 +1656,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
                          "vst1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  
@@ -1669,7 +1682,7 @@ class VST1D4<bits<4> op7_4, string Dt>
            []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
  }
  multiclass VST1D4WB<bits<4> op7_4, string Dt> {
    def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
@@ -1678,8 +1691,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
    def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
@@ -1687,8 +1699,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
                          "vst1", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
    }
  }
  
@@ -1713,7 +1724,7 @@ class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
            itin, "vst2", Dt, "$Vd, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST2Instruction";
  }
  
  def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2>;
@@ -1737,16 +1748,14 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST2Instruction";
    }
    def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
                          "vst2", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST2Instruction";
    }
  }
  multiclass VST2QWB<bits<4> op7_4, string Dt> {
@@ -1756,8 +1765,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
                       "$Rn.addr = $wb", []> {
      let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST2Instruction";
    }
    def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
                          (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
@@ -1765,8 +1773,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
                          "vst2", Dt, "$Vd, $Rn, $Rm",
                          "$Rn.addr = $wb", []> {
      let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST2Instruction";
    }
  }
  
@@ -1800,7 +1807,7 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
    let Rm = 0b1111;
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
  }
  
  def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
@@ -1819,7 +1826,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
            "$Rn.addr = $wb", []> {
    let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
  }
  
  def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
@@ -1859,7 +1866,7 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
            "", []> {
    let Rm = 0b1111;
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
  }
  
  def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
@@ -1878,7 +1885,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
             "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
            "$Rn.addr = $wb", []> {
    let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
  }
  
  def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
@@ -1959,7 +1966,7 @@ def VST1LNd8  : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
  def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
                         NEONvgetlaneu, addrmode6> {
    let Inst{7-6} = lane{1-0};
-  let Inst{4}   = Rn{5};
+  let Inst{4}   = Rn{4};
  }
  
  def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
@@ -2002,7 +2009,7 @@ def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
  def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
                               NEONvgetlaneu, addrmode6> {
    let Inst{7-6} = lane{1-0};
-  let Inst{4}   = Rn{5};
+  let Inst{4}   = Rn{4};
  }
  def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
                               extractelt, addrmode6oneL32> {
@@ -2238,6 +2245,38 @@ def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
  
  } // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
  
+// Use vld1/vst1 for unaligned f64 load / store
+def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
+          (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+          (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+          (VLD1q32 addrmode6:$addr)>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q32 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
  
  //===----------------------------------------------------------------------===//
  // NEON pattern fragments
@@ -2300,18 +2339,33 @@ class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
  class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                bits<2> op17_16, bits<5> op11_7, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
          (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
  class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                bits<2> op17_16, bits<5> op11_7, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
          (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
  
+// Same as above, but not predicated.
+class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
  // Narrow 2-register operations.
  class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
             bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
@@ -2325,7 +2379,7 @@ class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
  class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType TyD, ValueType TyQ, Intrinsic IntOp>
+              ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
          (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
          [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
@@ -2343,7 +2397,7 @@ class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
  class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
          (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
          [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
@@ -2465,7 +2519,7 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
  // Basic 3-register intrinsics, both double- and quad-register.
  class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                Format f, InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable>
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -2474,8 +2528,18 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
    let TwoOperandAliasConstraint = "$Vn = $Vd";
    let isCommutable = Commutable;
  }
+
+class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
  class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
-                string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
+                string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
    : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
          (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
          NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2485,8 +2549,9 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                                             imm:$lane)))))]> {
    let isCommutable = 0;
  }
+
  class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
-                  string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
+                  string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
    : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
          (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
          NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2497,7 +2562,7 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
  }
  class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                Format f, InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin,
          OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
@@ -2508,7 +2573,7 @@ class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  
  class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                Format f, InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable>
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
    : N3V<op24, op23, op21_20, op11_8, 1, op4,
          (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -2517,9 +2582,19 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
    let TwoOperandAliasConstraint = "$Vn = $Vd";
    let isCommutable = Commutable;
  }
+
+class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
  class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                  string OpcodeStr, string Dt,
-                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
          (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
          NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2531,7 +2606,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
  }
  class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                    string OpcodeStr, string Dt,
-                  ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
          (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
          NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2543,7 +2618,7 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
  }
  class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                Format f, InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3V<op24, op23, op21_20, op11_8, 1, op4,
          (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin,
          OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
@@ -2628,7 +2703,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
  // Neon Intrinsic-Op instructions (VABA): double- and quad-register.
  class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                  InstrItinClass itin, string OpcodeStr, string Dt,
-                ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2636,7 +2711,7 @@ class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                               (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
  class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                  InstrItinClass itin, string OpcodeStr, string Dt,
-                ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
    : N3V<op24, op23, op21_20, op11_8, 1, op4,
          (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2647,7 +2722,7 @@ class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  // The destination register is also used as the first source operand register.
  class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
-               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2655,7 +2730,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                                        (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
  class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
-               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3V<op24, op23, op21_20, op11_8, 1, op4,
          (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2700,7 +2775,7 @@ class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
  // Long Intrinsic-Op vector operations with explicit extend (VABAL).
  class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                     InstrItinClass itin, string OpcodeStr, string Dt,
-                   ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+                   ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
                     SDNode OpNode>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
@@ -2713,7 +2788,7 @@ class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  // a quad-register and is also used as the first source operand register.
  class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
-               ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+               ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2721,7 +2796,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
  class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt,
-                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
          (outs QPR:$Vd),
          (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
@@ -2734,7 +2809,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                                                  imm:$lane)))))]>;
  class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                     InstrItinClass itin, string OpcodeStr, string Dt,
-                   ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                   ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
          (outs QPR:$Vd),
          (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
@@ -2749,7 +2824,7 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
  // Narrowing 3-register intrinsics.
  class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
-              Intrinsic IntOp, bit Commutable>
+              SDPatternOperator IntOp, bit Commutable>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -2802,7 +2877,7 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  // Long 3-register intrinsics with explicit extend (VABDL).
  class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                   InstrItinClass itin, string OpcodeStr, string Dt,
-                 ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+                 ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
                   bit Commutable>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
@@ -2815,7 +2890,7 @@ class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  // Long 3-register intrinsics.
  class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable>
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable>
    : N3V<op24, op23, op21_20, op11_8, 0, op4,
          (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
          OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -2824,7 +2899,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  }
  class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                  string OpcodeStr, string Dt,
-                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
          (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
          NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2834,7 +2909,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                                                  imm:$lane)))))]>;
  class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                    InstrItinClass itin, string OpcodeStr, string Dt,
-                  ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
          (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
          NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2861,14 +2936,14 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
  class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                  bits<2> op17_16, bits<5> op11_7, bit op4,
                  string OpcodeStr, string Dt,
-                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
          (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
  class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                  bits<2> op17_16, bits<5> op11_7, bit op4,
                  string OpcodeStr, string Dt,
-                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
          (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
@@ -2879,7 +2954,7 @@ class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
  class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                   bits<2> op17_16, bits<5> op11_7, bit op4,
                   string OpcodeStr, string Dt,
-                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
          (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
          OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
@@ -2887,7 +2962,7 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
  class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                   bits<2> op17_16, bits<5> op11_7, bit op4,
                   string OpcodeStr, string Dt,
-                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
    : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
          (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
          OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
@@ -2976,14 +3051,14 @@ class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
  // both double- and quad-register.
  class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
                string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
-              Intrinsic IntOp>
+              SDPatternOperator IntOp>
    : N2VImm<op24, op23, op11_8, op7, 0, op4,
             (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
             IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
             [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
  class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
                string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
-              Intrinsic IntOp>
+              SDPatternOperator IntOp>
    : N2VImm<op24, op23, op11_8, op7, 1, op4,
             (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
             IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
@@ -3053,7 +3128,7 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
  multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
                        bits<5> op11_7, bit op4,
                        InstrItinClass itinD, InstrItinClass itinQ,
-                      string OpcodeStr, string Dt, Intrinsic IntOp> {
+                      string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    // 64-bit vector types.
    def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
                        itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
@@ -3094,7 +3169,7 @@ multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
  multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
                         bits<5> op11_7, bit op6, bit op4,
                         InstrItinClass itin, string OpcodeStr, string Dt,
-                       Intrinsic IntOp> {
+                       SDPatternOperator IntOp> {
    def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
                        itin, OpcodeStr, !strconcat(Dt, "16"),
                        v8i8, v8i16, IntOp>;
@@ -3182,7 +3257,7 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                       InstrItinClass itinD16, InstrItinClass itinD32,
                       InstrItinClass itinQ16, InstrItinClass itinQ32,
                       string OpcodeStr, string Dt,
-                     Intrinsic IntOp, bit Commutable = 0> {
+                     SDPatternOperator IntOp, bit Commutable = 0> {
    // 64-bit vector types.
    def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16,
                        OpcodeStr, !strconcat(Dt, "16"),
@@ -3203,7 +3278,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                       InstrItinClass itinD16, InstrItinClass itinD32,
                       InstrItinClass itinQ16, InstrItinClass itinQ32,
                       string OpcodeStr, string Dt,
-                     Intrinsic IntOp> {
+                     SDPatternOperator IntOp> {
    // 64-bit vector types.
    def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
                        OpcodeStr, !strconcat(Dt, "16"),
@@ -3224,7 +3299,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
  multiclass N3VIntSL_HS<bits<4> op11_8,
                         InstrItinClass itinD16, InstrItinClass itinD32,
                         InstrItinClass itinQ16, InstrItinClass itinQ32,
-                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
                            OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
    def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
@@ -3240,7 +3315,7 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                        InstrItinClass itinD16, InstrItinClass itinD32,
                        InstrItinClass itinQ16, InstrItinClass itinQ32,
                        string OpcodeStr, string Dt,
-                      Intrinsic IntOp, bit Commutable = 0>
+                      SDPatternOperator IntOp, bit Commutable = 0>
    : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
                OpcodeStr, Dt, IntOp, Commutable> {
    def v8i8  : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16,
@@ -3254,7 +3329,7 @@ multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                        InstrItinClass itinD16, InstrItinClass itinD32,
                        InstrItinClass itinQ16, InstrItinClass itinQ32,
                        string OpcodeStr, string Dt,
-                      Intrinsic IntOp>
+                      SDPatternOperator IntOp>
    : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
                OpcodeStr, Dt, IntOp> {
    def v8i8  : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16,
@@ -3271,7 +3346,7 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                         InstrItinClass itinD16, InstrItinClass itinD32,
                         InstrItinClass itinQ16, InstrItinClass itinQ32,
                         string OpcodeStr, string Dt,
-                       Intrinsic IntOp, bit Commutable = 0>
+                       SDPatternOperator IntOp, bit Commutable = 0>
    : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
                 OpcodeStr, Dt, IntOp, Commutable> {
    def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32,
@@ -3285,7 +3360,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                         InstrItinClass itinD16, InstrItinClass itinD32,
                         InstrItinClass itinQ16, InstrItinClass itinQ32,
                         string OpcodeStr, string Dt,
-                       Intrinsic IntOp>
+                       SDPatternOperator IntOp>
    : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
                 OpcodeStr, Dt, IntOp> {
    def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32,
@@ -3300,7 +3375,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
  //   source operand element sizes of 16, 32 and 64 bits:
  multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                         string OpcodeStr, string Dt,
-                       Intrinsic IntOp, bit Commutable = 0> {
+                       SDPatternOperator IntOp, bit Commutable = 0> {
    def v8i8  : N3VNInt<op24, op23, 0b00, op11_8, op4,
                        OpcodeStr, !strconcat(Dt, "16"),
                        v8i8, v8i16, IntOp, Commutable>;
@@ -3360,7 +3435,7 @@ multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
  multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
                        InstrItinClass itin16, InstrItinClass itin32,
                        string OpcodeStr, string Dt,
-                      Intrinsic IntOp, bit Commutable = 0> {
+                      SDPatternOperator IntOp, bit Commutable = 0> {
    def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
                        OpcodeStr, !strconcat(Dt, "16"),
                        v4i32, v4i16, IntOp, Commutable>;
@@ -3371,7 +3446,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
  
  multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
                          InstrItinClass itin, string OpcodeStr, string Dt,
-                        Intrinsic IntOp> {
+                        SDPatternOperator IntOp> {
    def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
                            OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
    def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
@@ -3382,7 +3457,7 @@ multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
  multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         InstrItinClass itin16, InstrItinClass itin32,
                         string OpcodeStr, string Dt,
-                       Intrinsic IntOp, bit Commutable = 0>
+                       SDPatternOperator IntOp, bit Commutable = 0>
    : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt,
                 IntOp, Commutable> {
    def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16,
@@ -3393,7 +3468,7 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
  // ....with explicit extend (VABDL).
  multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         InstrItinClass itin, string OpcodeStr, string Dt,
-                       Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> {
+                       SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> {
    def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
                           OpcodeStr, !strconcat(Dt, "8"),
                           v8i16, v8i8, IntOp, ExtOp, Commutable>;
@@ -3466,7 +3541,7 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8,
  //   element sizes of 8, 16 and 32 bits:
  multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                          InstrItinClass itinD, InstrItinClass itinQ,
-                        string OpcodeStr, string Dt, Intrinsic IntOp,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp,
                          SDNode OpNode> {
    // 64-bit vector types.
    def v8i8  : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
@@ -3489,7 +3564,7 @@ multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
  //   element sizes of 8, 16 and 32 bits:
  multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         InstrItinClass itinD, InstrItinClass itinQ,
-                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    // 64-bit vector types.
    def v8i8  : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD,
                         OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
@@ -3536,7 +3611,7 @@ multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
  // First with only element sizes of 16 and 32 bits:
  multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         InstrItinClass itin16, InstrItinClass itin32,
-                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16,
                         OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
    def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32,
@@ -3544,7 +3619,7 @@ multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
  }
  
  multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
-                         string OpcodeStr, string Dt, Intrinsic IntOp> {
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
                             OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
    def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
@@ -3554,7 +3629,7 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
  // ....then also with element size of 8 bits:
  multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                          InstrItinClass itin16, InstrItinClass itin32,
-                        string OpcodeStr, string Dt, Intrinsic IntOp>
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp>
    : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> {
    def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16,
                         OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
@@ -3563,7 +3638,7 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
  // ....with explicit extend (VABAL).
  multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                              InstrItinClass itin, string OpcodeStr, string Dt,
-                            Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> {
+                            SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> {
    def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
                             OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
                             IntOp, ExtOp, OpNode>;
@@ -3580,7 +3655,7 @@ multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
  //   element sizes of 8, 16 and 32 bits:
  multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
                          bits<5> op11_7, bit op4,
-                        string OpcodeStr, string Dt, Intrinsic IntOp> {
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    // 64-bit vector types.
    def v8i8  : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
                          OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
@@ -3603,7 +3678,7 @@ multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
  //   element sizes of 8, 16 and 32 bits:
  multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
                           bits<5> op11_7, bit op4,
-                         string OpcodeStr, string Dt, Intrinsic IntOp> {
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
    // 64-bit vector types.
    def v8i8  : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
                           OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
@@ -4136,16 +4211,16 @@ def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
                  Requires<[HasVFP4,UseFusedMAC]>;
  
  // Match @llvm.fma.* intrinsics
-def : Pat<(v2f32 (fma DPR:$src1, DPR:$Vn, DPR:$Vm)),
+def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
            (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
            Requires<[HasVFP4]>;
-def : Pat<(v4f32 (fma QPR:$src1, QPR:$Vn, QPR:$Vm)),
+def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
            (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
            Requires<[HasVFP4]>;
-def : Pat<(v2f32 (fma (fneg DPR:$src1), DPR:$Vn, DPR:$Vm)),
+def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)),
            (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
        Requires<[HasVFP4]>;
-def : Pat<(v4f32 (fma (fneg QPR:$src1), QPR:$Vn, QPR:$Vm)),
+def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
            (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
        Requires<[HasVFP4]>;
  
@@ -4197,6 +4272,7 @@ def  VCEQfd   : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
  def  VCEQfq   : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
                       NEONvceq, 1>;
  
+let TwoOperandAliasConstraint = "$Vm = $Vd" in
  defm VCEQz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
                              "$Vd, $Vm, #0", NEONvceqz>;
  
@@ -4210,10 +4286,12 @@ def  VCGEfd   : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
  def  VCGEfq   : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
                       NEONvcge, 0>;
  
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
  defm VCGEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
                              "$Vd, $Vm, #0", NEONvcgez>;
  defm VCLEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
                              "$Vd, $Vm, #0", NEONvclez>;
+}
  
  //   VCGT     : Vector Compare Greater Than
  defm VCGTs    : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
@@ -4225,10 +4303,12 @@ def  VCGTfd   : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
  def  VCGTfq   : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
                       NEONvcgt, 0>;
  
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
  defm VCGTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
                              "$Vd, $Vm, #0", NEONvcgtz>;
  defm VCLTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
                              "$Vd, $Vm, #0", NEONvcltz>;
+}
  
  //   VACGE    : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
  def  VACGEd   : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
@@ -4244,6 +4324,24 @@ def  VACGTq   : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
  defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
                          IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
  
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+
  // Vector Bitwise Operations.
  
  def vnotd : PatFrag<(ops node:$in),
@@ -4308,6 +4406,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
  
  
  //   VBIC     : Vector Bitwise Bit Clear (AND NOT)
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
  def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
                       (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
                       "vbic", "$Vd, $Vn, $Vm", "",
@@ -4318,6 +4417,7 @@ def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
                       "vbic", "$Vd, $Vn, $Vm", "",
                       [(set QPR:$Vd, (v4i32 (and QPR:$Vn,
                                                   (vnotq QPR:$Vm))))]>;
+}
  
  def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
                            (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
@@ -4419,10 +4519,36 @@ def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
                       "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
                       [(set DPR:$Vd,
                             (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+                                   (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+                                    (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+                                    (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
+                                    (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
+                                    (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
  
  def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
                       (and DPR:$Vm, (vnotd DPR:$Vd)))),
-          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
  
  def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
                       (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
@@ -4431,9 +4557,35 @@ def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
                       [(set QPR:$Vd,
                             (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
  
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+                                   (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+                                    (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+                                    (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
+                                    (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
+                                    (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
  def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
                       (and QPR:$Vm, (vnotq QPR:$Vd)))),
-          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
  
  //   VBIF     : Vector Bitwise Insert if False
  //              like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
@@ -4515,6 +4667,18 @@ def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                          "vmax", "f32",
                          v4f32, v4f32, int_arm_neon_vmaxs, 1>;
  
+// VMAXNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMAXNMND  : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMAXNMNQ  : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
  //   VMIN     : Vector Minimum
  defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
                             IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
@@ -4529,6 +4693,18 @@ def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                          "vmin", "f32",
                          v4f32, v4f32, int_arm_neon_vmins, 1>;
  
+// VMINNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMINNMND  : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMINNMNQ  : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
  // Vector Pairwise Operations.
  
  //   VPADD    : Vector Pairwise Add
@@ -4756,12 +4932,38 @@ defm VSRI     : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">;
  defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
                             IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
                             int_arm_neon_vabs>;
-def  VABSfd   : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
-                        IIC_VUNAD, "vabs", "f32",
-                        v2f32, v2f32, int_arm_neon_vabs>;
-def  VABSfq   : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
-                        IIC_VUNAQ, "vabs", "f32",
-                        v4f32, v4f32, int_arm_neon_vabs>;
+def  VABSfd   : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                     v2f32, v2f32, fabs>;
+def  VABSfq   : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                      v4f32, v4f32, fabs>;
+
+def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))),
+               (v2i32 (bitconvert (v8i8 (add DPR:$src,
+                                             (NEONvshrs DPR:$src, (i32 7))))))),
+          (VABSv8i8 DPR:$src)>;
+def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))),
+               (v2i32 (bitconvert (v4i16 (add DPR:$src,
+                                            (NEONvshrs DPR:$src, (i32 15))))))),
+          (VABSv4i16 DPR:$src)>;
+def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))),
+               (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))),
+          (VABSv2i32 DPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))),
+               (v4i32 (bitconvert (v16i8 (add QPR:$src,
+                                             (NEONvshrs QPR:$src, (i32 7))))))),
+          (VABSv16i8 QPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))),
+               (v4i32 (bitconvert (v8i16 (add QPR:$src,
+                                            (NEONvshrs QPR:$src, (i32 15))))))),
+          (VABSv8i16 QPR:$src)>;
+def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))),
+               (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))),
+          (VABSv4i32 QPR:$src)>;
+
+def : Pat<(v2f32 (int_arm_neon_vabs (v2f32 DPR:$src))), (VABSfd DPR:$src)>;
+def : Pat<(v4f32 (int_arm_neon_vabs (v4f32 QPR:$src))), (VABSfq QPR:$src)>;
  
  //   VQABS    : Vector Saturating Absolute Value
  defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
@@ -4823,14 +5025,14 @@ defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
  //   VCLZ     : Vector Count Leading Zeros
  defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
                             IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
-                           int_arm_neon_vclz>;
+                           ctlz>;
  //   VCNT     : Vector Count One Bits
  def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
                          IIC_VCNTiD, "vcnt", "8",
-                        v8i8, v8i8, int_arm_neon_vcnt>;
+                        v8i8, v8i8, ctpop>;
  def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
                          IIC_VCNTiQ, "vcnt", "8",
-                        v16i8, v16i8, int_arm_neon_vcnt>;
+                        v16i8, v16i8, ctpop>;
  
  // Vector Swap
  def  VSWPd    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
@@ -4947,7 +5149,8 @@ def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
                            (outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
                            IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
                            [(set GPR:$R, (extractelt (v2i32 DPR:$V),
-                                           imm:$lane))]> {
+                                           imm:$lane))]>,
+                Requires<[HasNEON, HasFastVGETLNi32]> {
    let Inst{21} = lane{0};
  }
  // def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
@@ -4970,7 +5173,16 @@ def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
  def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
            (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
                               (DSubReg_i32_reg imm:$lane))),
-                     (SubReg_i32_lane imm:$lane))>;
+                     (SubReg_i32_lane imm:$lane))>,
+      Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
  def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
            (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
                            (SSubReg_f32_reg imm:$src2))>;
@@ -5081,14 +5293,23 @@ class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
  
  def  VDUP8d   : VDUPD<0b11101100, 0b00, "8", v8i8>;
  def  VDUP16d  : VDUPD<0b11101000, 0b01, "16", v4i16>;
-def  VDUP32d  : VDUPD<0b11101000, 0b00, "32", v2i32>;
+def  VDUP32d  : VDUPD<0b11101000, 0b00, "32", v2i32>,
+                Requires<[HasNEON, HasFastVDUP32]>;
  def  VDUP8q   : VDUPQ<0b11101110, 0b00, "8", v16i8>;
  def  VDUP16q  : VDUPQ<0b11101010, 0b01, "16", v8i16>;
  def  VDUP32q  : VDUPQ<0b11101010, 0b00, "32", v4i32>;
  
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>;
+// NEONvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+      Requires<[HasNEON,HasFastVDUP32]>;
  def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
  
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+
  //   VDUP     : Vector Duplicate Lane (from scalar to all elements)
  
  class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
@@ -5197,6 +5418,26 @@ def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
  def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
                       v4f32, v4i32, uint_to_fp>;
  
+// VCVT{A, N, P, M}
+multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
+                    SDPatternOperator IntU> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
+    def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+  }
+}
+
+defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>;
+defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>;
+defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>;
+defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>;
+
  //   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
  let DecoderMethod = "DecodeVCVTD" in {
  def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
@@ -5320,8 +5561,9 @@ class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
          IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
          [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
                                       (Ty DPR:$Vm), imm:$index)))]> {
-  bits<4> index;
-  let Inst{11-8} = index{3-0};
+  bits<3> index;
+  let Inst{11} = 0b0;
+  let Inst{10-8} = index{2-0};
  }
  
  class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
@@ -5336,14 +5578,14 @@ class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
  }
  
  def VEXTd8  : VEXTd<"vext", "8",  v8i8, imm0_7> {
-  let Inst{11-8} = index{3-0};
+  let Inst{10-8} = index{2-0};
  }
  def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
-  let Inst{11-9} = index{2-0};
+  let Inst{10-9} = index{1-0};
    let Inst{8}    = 0b0;
  }
  def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
-  let Inst{11-10} = index{1-0};
+  let Inst{10}     = index{0};
    let Inst{9-8}    = 0b00;
  }
  def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
@@ -5468,6 +5710,34 @@ def  VTBX4Pseudo
                  IIC_VTBX4, "$orig = $dst", []>;
  } // DecoderMethod = "DecodeTBLInstruction"
  
+// VRINT      : Vector Rounding
+multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+  }
+
+  def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+  def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+                  (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
+}
+
+defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
+defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>;
+defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>;
+defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
+defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
+defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+
  //===----------------------------------------------------------------------===//
  // NEON instructions for single-precision FP math
  //===----------------------------------------------------------------------===//
@@ -5525,6 +5795,11 @@ def : N2VSPat<arm_ftoui, VCVTf2ud>;
  def : N2VSPat<arm_sitof, VCVTs2fd>;
  def : N2VSPat<arm_uitof, VCVTu2fd>;
  
+// Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers.
+def : Pat<(f32 (bitconvert GPR:$a)),
+          (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+        Requires<[HasNEON, DontUseVMOVSR]>;
+
  //===----------------------------------------------------------------------===//
  // Non-Instruction Patterns
  //===----------------------------------------------------------------------===//
@@ -5592,29 +5867,34 @@ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
  
+// Fold extracting an element out of a v2i32 into a vfp register.
+def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
+          (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+
  // Vector lengthening move with load, matching extending loads.
  
  // extload, zextload and sextload for a standard lengthening load. Example:
-// Lengthen_Single<"8", "i16", "i8"> = 
-//     Pat<(v8i16 (extloadvi8 addrmode6oneL32:$addr))
-//         (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
-//                                 (f64 (IMPLICIT_DEF)), (i32 0)))>;
+// Lengthen_Single<"8", "i16", "8"> = 
+//     Pat<(v8i16 (extloadvi8 addrmode6:$addr))
+//         (VMOVLuv8i16 (VLD1d8 addrmode6:$addr,
+//                              (f64 (IMPLICIT_DEF)), (i32 0)))>;
  multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
+  let AddedComplexity = 10 in {
    def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
-                    (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+                    (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
                    (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
-                    (VLD1LNd32 addrmode6oneL32:$addr, 
-                               (f64 (IMPLICIT_DEF)), (i32 0)))>;
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
    def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
-                  (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+                  (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
                  (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
-                    (VLD1LNd32 addrmode6oneL32:$addr, 
-                               (f64 (IMPLICIT_DEF)), (i32 0)))>;
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
    def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
-                  (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+                  (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
                  (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
-                    (VLD1LNd32 addrmode6oneL32:$addr, 
-                               (f64 (IMPLICIT_DEF)), (i32 0)))>;
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+  }
  }
  
  // extload, zextload and sextload for a lengthening load which only uses
@@ -5646,8 +5926,8 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
  // extload, zextload and sextload for a lengthening load followed by another
  // lengthening load, to quadruple the initial length.
  //
-// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0> =
-//     Pat<(v4i32 (extloadvi8 addrmode5:$addr))
+// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> =
+//     Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr))
  //         (EXTRACT_SUBREG (VMOVLuv4i32
  //           (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
  //                                                   (f64 (IMPLICIT_DEF)),
@@ -5661,19 +5941,19 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
                     (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
           (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
             (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
-             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), 
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
               dsub_0))>;
    def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
                     (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
           (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
             (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
-             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), 
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
               dsub_0))>;
    def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
                     (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
           (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
             (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
-             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), 
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
               dsub_0))>;
  }
  
@@ -5682,9 +5962,9 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
  // requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
  //
  // Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
-// Pat<(v4i32 (extloadvi8 addrmode5:$addr))
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr))
  //     (EXTRACT_SUBREG (VMOVLuv4i32
-//       (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+//       (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr,
  //                                               (f64 (IMPLICIT_DEF)), (i32 0))),
  //                       dsub_0)),
  //       dsub_0)>;
@@ -5692,34 +5972,33 @@ multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
                             string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
                             string Insn2Ty> {
    def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
-                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
             (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
-             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), 
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
               dsub_0)),
            dsub_0)>;
    def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
-                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
             (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
-             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), 
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
               dsub_0)),
            dsub_0)>;
    def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
-                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
             (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
-             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), 
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
               dsub_0)),
            dsub_0)>;
  }
  
-defm : Lengthen_Single<"8", "i16", "i8">; // v8i8 -> v8i16
-defm : Lengthen_Single<"4", "i32", "i16">; // v4i16 -> v4i32
-defm : Lengthen_Single<"2", "i64", "i32">; // v2i32 -> v2i64
+defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
+defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
+defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
  
  defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
-defm : Lengthen_HalfSingle<"2", "i16", "i8", "8", "i16">; // v2i8 -> v2i16
  defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
  
  // Double lengthening - v4i8 -> v4i16 -> v4i32
@@ -5730,17 +6009,17 @@ defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
  defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
  
  // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
-def : Pat<(v2i64 (extloadvi8 addrmode6oneL32:$addr)),
+def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
-         (VLD1LNd32 addrmode6oneL32:$addr, 
+         (VLD1LNd16 addrmode6:$addr, 
                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
-def : Pat<(v2i64 (zextloadvi8 addrmode6oneL32:$addr)),
+def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
-         (VLD1LNd32 addrmode6oneL32:$addr,
+         (VLD1LNd16 addrmode6:$addr,
                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
-def : Pat<(v2i64 (sextloadvi8 addrmode6oneL32:$addr)),
+def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
        (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
-         (VLD1LNd32 addrmode6oneL32:$addr,
+         (VLD1LNd16 addrmode6:$addr,
                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
  
  //===----------------------------------------------------------------------===//
@@ -6499,12 +6778,17 @@ def VST4qWB_register_Asm_32 :
                    (ins VecListFourQ:$list, addrmode6:$addr,
                         rGPR:$Rm, pred:$p)>;
  
-// VMOV takes an optional datatype suffix
+// VMOV/VMVN takes an optional datatype suffix
  defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
                           (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
  defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
                           (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
  
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
  // VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
  // D-register versions.
  def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",