From ea28aafa83fc2b6dd632041278c9a18e5a2b2b41 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel.sanders@imgtec.com>
Date: Fri, 15 Nov 2013 12:56:49 +0000
Subject: [PATCH] Fix illegal DAG produced by SelectionDAG::getConstant() for
 v2i64 type

Summary:
When getConstant() is called for an expanded vector type, the constant is split
into multiple scalar constants which are then combined using appropriate
build_vector and bitcast operations.

In addition to the usual big/little endian differences, the case where the
element-order of the vector does not have the same endianness as the elements
themselves is also accounted for.  For example, for v4i32 on big-endian MIPS,
the byte-order of the vector is <3210,7654,BA98,FEDC>. For little-endian, it is
<0123,4567,89AB,CDEF>.
Handling this case turns out to be a nop since getConstant() returns a splatted
vector (so reversing the element order doesn't change the value).

This fixes a number of cases in MIPS MSA where calling getConstant() during
operation legalization introduces illegal types (e.g. to legalize v2i64 UNDEF
into a v2i64 BUILD_VECTOR of illegal i64 zeros). It should also handle bigger
differences between illegal and legal types such as legalizing v2i64 into v8i16.

lowerMSASplatImm() in the MIPS backend no longer needs to avoid calling
getConstant() so this function has been updated in the same patch.

For the sake of transparency, the steps I've taken since the review are:
* Added 'virtual' to isVectorEltOrderLittleEndian() as requested. This revealed
  that the MIPS tests were falsely passing because a polymorphic function was
  not actually polymorphic in the reviewed patch.
* Fixed the tests that were now failing. This involved deleting the code to
  handle the MIPS MSA element-order (which was previously doing a byte-order
  swap instead of an element-order swap). This left
  isVectorEltOrderLittleEndian() unused and it was deleted.
* Fixed build failures caused by rebasing beyond r194467-r194472. These build
  failures involved the bset, bneg, and bclr instructions added in these commits
  using lowerMSASplatImm() in a way that was no longer valid after this patch.
  Some of these were fixed by calling SelectionDAG::getConstant() instead,
  others were fixed by a new function getBuildVectorSplat() that provided the
  removed functionality of lowerMSASplatImm() in a more sensible way.

Reviewers: bkramer

Reviewed By: bkramer

CC: llvm-commits

Differential Revision: http://llvm-reviews.chandlerc.com/D1973

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194811 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/SelectionDAG.h           |   7 +
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  51 +++++-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |   2 +
 lib/Target/Mips/MipsSEISelLowering.cpp        | 150 +++++++++++-------
 test/CodeGen/Mips/msa/3r-b.ll                 |   1 -
 test/CodeGen/Mips/msa/bit.ll                  |   4 -
 test/CodeGen/Mips/msa/elm_shift_slide.ll      |   4 -
 test/CodeGen/Mips/msa/i5-a.ll                 |   4 -
 test/CodeGen/Mips/msa/i5-b.ll                 |   1 -
 test/CodeGen/Mips/msa/i5-c.ll                 |   4 -
 test/CodeGen/Mips/msa/i5-m.ll                 |   4 -
 test/CodeGen/Mips/msa/i5-s.ll                 |   4 -
 .../Mips/msa/llvm-stress-s1704963983.ll       | 134 ++++++++++++++++
 13 files changed, 284 insertions(+), 86 deletions(-)
 create mode 100644 test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll

diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 5f18c4eb251..9a9bb991bb8 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -236,6 +236,13 @@ public:
     virtual void NodeUpdated(SDNode *N);
   };
 
+  /// NewNodesMustHaveLegalTypes - When true, additional steps are taken to
+  /// ensure that getConstant() and similar functions return DAG nodes that
+  /// have legal types. This is important after type legalization since
+  /// any illegally typed nodes generated after this point will not experience
+  /// type legalization.
+  bool NewNodesMustHaveLegalTypes;
+
 private:
   /// DAGUpdateListener is a friend so it can manipulate the listener stack.
   friend struct DAGUpdateListener;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index fb0e63e9e5d..bac06ccaf57 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -871,7 +871,8 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
 SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
   : TM(tm), TSI(*tm.getSelectionDAGInfo()), TTI(0), OptLevel(OL),
     EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
-    Root(getEntryNode()), UpdateListeners(0) {
+    Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
+    UpdateListeners(0) {
   AllNodes.push_back(&EntryNode);
   DbgInfo = new SDDbgInfo();
 }
@@ -983,6 +984,54 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
    APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
    Elt = ConstantInt::get(*getContext(), NewVal);
   }
+  // In other cases the element type is illegal and needs to be expanded, for
+  // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+  // the value into n parts and use a vector type with n-times the elements.
+  // Then bitcast to the type requested.
+  // Legalizing constants too early makes the DAGCombiner's job harder so we
+  // only legalize if the DAG tells us we must produce legal types.
+  else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+           TLI->getTypeAction(*getContext(), EltVT) ==
+           TargetLowering::TypeExpandInteger) {
+    APInt NewVal = Elt->getValue();
+    EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+    unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+    unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+    EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+
+    // Check the temporary vector is the correct size. If this fails then
+    // getTypeToTransformTo() probably returned a type whose size (in bits)
+    // isn't a power-of-2 factor of the requested type size.
+    assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SmallVector<SDValue, 2> EltParts;
+    for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
+      EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
+                                           .trunc(ViaEltSizeInBits),
+                                     ViaEltVT, isT));
+    }
+
+    // EltParts is currently in little endian order. If we actually want
+    // big-endian order then reverse it now.
+    if (TLI->isBigEndian())
+      std::reverse(EltParts.begin(), EltParts.end());
+
+    // The elements must be reversed when the element order is different
+    // to the endianness of the elements (because the BITCAST is itself a
+    // vector shuffle in this situation). However, we do not need any code to
+    // perform this reversal because getConstant() is producing a vector
+    // splat.
+    // This situation occurs in MIPS MSA.
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+      Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
+
+    SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT,
+                             getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT,
+                                     &Ops[0], Ops.size()));
+    return Result;
+  }
 
   assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
          "APInt size does not match type size!");
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index c937996efdd..8bf875642b9 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -666,6 +666,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
           << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump());
   }
 
+  CurDAG->NewNodesMustHaveLegalTypes = true;
+
   if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
 
   {
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 9585efb272b..e2ea629d954 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1254,38 +1254,73 @@ static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
   return Result;
 }
 
-static SDValue
-lowerMSASplatImm(SDLoc DL, EVT ResTy, SDValue ImmOp, SelectionDAG &DAG) {
-  EVT ViaVecTy = ResTy;
-  SmallVector<SDValue, 16> Ops;
-  SDValue ImmHiOp;
-
-  if (ViaVecTy == MVT::v2i64) {
-    ImmHiOp = DAG.getNode(ISD::SRA, DL, MVT::i32, ImmOp,
-                          DAG.getConstant(31, MVT::i32));
-    for (unsigned i = 0; i < ViaVecTy.getVectorNumElements(); ++i) {
-      Ops.push_back(ImmHiOp);
-      Ops.push_back(ImmOp);
-    }
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+  EVT ResVecTy = Op->getValueType(0);
+  EVT ViaVecTy = ResVecTy;
+  SDLoc DL(Op);
+
+  // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+  // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
+  // lanes.
+  SDValue LaneA;
+  SDValue LaneB = Op->getOperand(2);
+
+  if (ResVecTy == MVT::v2i64) {
+    LaneA = DAG.getConstant(0, MVT::i32);
     ViaVecTy = MVT::v4i32;
-  } else {
-    for (unsigned i = 0; i < ResTy.getVectorNumElements(); ++i)
-      Ops.push_back(ImmOp);
-  }
+  } else
+    LaneA = LaneB;
 
-  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, &Ops[0],
-                               Ops.size());
+  SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+                      LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
 
-  if (ResTy != ViaVecTy)
-    Result = DAG.getNode(ISD::BITCAST, DL, ResTy, Result);
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+                               ViaVecTy.getVectorNumElements());
+
+  if (ViaVecTy != ResVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
 
   return Result;
 }
 
-static SDValue
-lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
-  return lowerMSASplatImm(SDLoc(Op), Op->getValueType(0),
-                          Op->getOperand(ImmOp), DAG);
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+  return DAG.getConstant(Op->getConstantOperandVal(ImmOp), Op->getValueType(0));
+}
+
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+                                   bool BigEndian, SelectionDAG &DAG) {
+  EVT ViaVecTy = VecTy;
+  SDValue SplatValueA = SplatValue;
+  SDValue SplatValueB = SplatValue;
+  SDLoc DL(SplatValue);
+
+  if (VecTy == MVT::v2i64) {
+    // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+    ViaVecTy = MVT::v4i32;
+
+    SplatValueA = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+    SplatValueB = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+                              DAG.getConstant(32, MVT::i32));
+    SplatValueB = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValueB);
+  }
+
+  // We currently hold the parts in little endian order. Swap them if
+  // necessary.
+  if (BigEndian)
+    std::swap(SplatValueA, SplatValueB);
+
+  SDValue Ops[16] = { SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+                               ViaVecTy.getVectorNumElements());
+
+  if (VecTy != ViaVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
+
+  return Result;
 }
 
 static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
@@ -1295,27 +1330,37 @@ static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
   SDValue Exp2Imm;
   SDLoc DL(Op);
 
-  // The DAG Combiner can't constant fold bitcasted vectors so we must do it
-  // here.
+  // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+  // here for now.
   if (VecTy == MVT::v2i64) {
     if (ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(Imm)) {
       APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
 
       SDValue BitImmHiOp = DAG.getConstant(BitImm.lshr(32).trunc(32), MVT::i32);
-      SDValue BitImmOp = DAG.getConstant(BitImm.trunc(32), MVT::i32);
-      Exp2Imm = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
-                            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32,
-                                        BitImmHiOp, BitImmOp,
-                                        BitImmHiOp, BitImmOp));
+      SDValue BitImmLoOp = DAG.getConstant(BitImm.trunc(32), MVT::i32);
+
+      if (BigEndian)
+        std::swap(BitImmLoOp, BitImmHiOp);
+
+      Exp2Imm =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, BitImmLoOp,
+                                  BitImmHiOp, BitImmLoOp, BitImmHiOp));
     }
   }
 
   if (Exp2Imm.getNode() == NULL) {
     // We couldnt constant fold, do a vector shift instead
-    SDValue One = lowerMSASplatImm(DL, VecTy, DAG.getConstant(1, MVT::i32),
-                                   DAG);
-    Exp2Imm = lowerMSASplatImm(DL, VecTy, Imm, DAG);
-    Exp2Imm = DAG.getNode(ISD::SHL, DL, VecTy, One, Exp2Imm);
+
+    // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+    // only values 0-63 are valid.
+    if (VecTy == MVT::v2i64)
+      Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+    Exp2Imm = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+
+    Exp2Imm =
+        DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, VecTy), Exp2Imm);
   }
 
   return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
@@ -1325,7 +1370,7 @@ static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
   EVT ResTy = Op->getValueType(0);
   EVT ViaVecTy = ResTy == MVT::v2i64 ? MVT::v4i32 : ResTy;
   SDLoc DL(Op);
-  SDValue One = lowerMSASplatImm(DL, ResTy, DAG.getConstant(1, MVT::i32), DAG);
+  SDValue One = DAG.getConstant(1, ResTy);
   SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
 
   SDValue AllOnes = DAG.getConstant(-1, MVT::i32);
@@ -1346,15 +1391,9 @@ static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
 static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   EVT ResTy = Op->getValueType(0);
-  SDValue SHAmount = Op->getOperand(2);
-  EVT ImmTy = SHAmount->getValueType(0);
-  SDValue Bit =
-      DAG.getNode(ISD::SHL, DL, ImmTy, DAG.getConstant(1, ImmTy), SHAmount);
-  SDValue BitMask = DAG.getNOT(DL, Bit, ImmTy);
-
-  assert(ResTy.getVectorNumElements() <= 16);
-
-  BitMask = lowerMSASplatImm(DL, ResTy, BitMask, DAG);
+  APInt BitImm = APInt(ResTy.getVectorElementType().getSizeInBits(), 1)
+                 << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+  SDValue BitMask = DAG.getConstant(~BitImm, ResTy);
 
   return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
 }
@@ -1469,8 +1508,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::mips_bneg_w:
   case Intrinsic::mips_bneg_d: {
     EVT VecTy = Op->getValueType(0);
-    SDValue One = lowerMSASplatImm(DL, VecTy, DAG.getConstant(1, MVT::i32),
-                                   DAG);
+    SDValue One = DAG.getConstant(1, VecTy);
 
     return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
                        DAG.getNode(ISD::SHL, DL, VecTy, One,
@@ -1504,8 +1542,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::mips_bset_w:
   case Intrinsic::mips_bset_d: {
     EVT VecTy = Op->getValueType(0);
-    SDValue One = lowerMSASplatImm(DL, VecTy, DAG.getConstant(1, MVT::i32),
-                                   DAG);
+    SDValue One = DAG.getConstant(1, VecTy);
 
     return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
                        DAG.getNode(ISD::SHL, DL, VecTy, One,
@@ -1926,7 +1963,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     // EXTRACT_VECTOR_ELT can't extract i64's on MIPS32.
     // Instead we lower to MipsISD::VSHF and match from there.
     return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
-                       lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+                       lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
                        Op->getOperand(1));
   case Intrinsic::mips_splati_b:
   case Intrinsic::mips_splati_h:
@@ -2200,15 +2237,10 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
       return SDValue();
     }
 
-    SmallVector<SDValue, 16> Ops;
-    SDValue Constant = DAG.getConstant(SplatValue.sextOrSelf(32), MVT::i32);
-
-    for (unsigned i = 0; i < ViaVecTy.getVectorNumElements(); ++i)
-      Ops.push_back(Constant);
-
-    SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Node), ViaVecTy,
-                                 &Ops[0], Ops.size());
+    // SelectionDAG::getConstant will promote SplatValue appropriately.
+    SDValue Result = DAG.getConstant(SplatValue, ViaVecTy);
 
+    // Bitcast to the type we originally wanted
     if (ViaVecTy != ResTy)
       Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
 
diff --git a/test/CodeGen/Mips/msa/3r-b.ll b/test/CodeGen/Mips/msa/3r-b.ll
index 7c45b2bf943..a05d19b4d49 100644
--- a/test/CodeGen/Mips/msa/3r-b.ll
+++ b/test/CodeGen/Mips/msa/3r-b.ll
@@ -3,7 +3,6 @@
 
 ; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
 ; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
-; XFAIL: *
 
 @llvm_mips_bclr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
 @llvm_mips_bclr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
diff --git a/test/CodeGen/Mips/msa/bit.ll b/test/CodeGen/Mips/msa/bit.ll
index dc8bb8f2b6c..59ddbe17a33 100644
--- a/test/CodeGen/Mips/msa/bit.ll
+++ b/test/CodeGen/Mips/msa/bit.ll
@@ -1,7 +1,3 @@
-; Both endians should emit the same output for immediate instructions.
-; This is not currently true.
-; XFAIL: *
-
 ; Test the MSA intrinsics that are encoded with the BIT instruction format.
 
 ; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
diff --git a/test/CodeGen/Mips/msa/elm_shift_slide.ll b/test/CodeGen/Mips/msa/elm_shift_slide.ll
index 3b3504b398c..39d670dac84 100644
--- a/test/CodeGen/Mips/msa/elm_shift_slide.ll
+++ b/test/CodeGen/Mips/msa/elm_shift_slide.ll
@@ -1,7 +1,3 @@
-; Both endians should emit the same output for immediate instructions.
-; This is not currently true.
-; XFAIL: *
-
 ; Test the MSA intrinsics that are encoded with the ELM instruction format and
 ; are either shifts or slides.
 
diff --git a/test/CodeGen/Mips/msa/i5-a.ll b/test/CodeGen/Mips/msa/i5-a.ll
index f45df07f76b..0b507208f42 100644
--- a/test/CodeGen/Mips/msa/i5-a.ll
+++ b/test/CodeGen/Mips/msa/i5-a.ll
@@ -1,7 +1,3 @@
-; Both endians should emit the same output for immediate instructions.
-; This is not currently true.
-; XFAIL: *
-
 ; Test the MSA intrinsics that are encoded with the I5 instruction format.
 ; There are lots of these so this covers those beginning with 'a'
 
diff --git a/test/CodeGen/Mips/msa/i5-b.ll b/test/CodeGen/Mips/msa/i5-b.ll
index f4477a0671b..da6be669f0d 100644
--- a/test/CodeGen/Mips/msa/i5-b.ll
+++ b/test/CodeGen/Mips/msa/i5-b.ll
@@ -3,7 +3,6 @@
 
 ; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
 ; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
-; XFAIL: *
 
 @llvm_mips_bclri_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
 @llvm_mips_bclri_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
diff --git a/test/CodeGen/Mips/msa/i5-c.ll b/test/CodeGen/Mips/msa/i5-c.ll
index d63b3ccf2c0..bf1578f30f3 100644
--- a/test/CodeGen/Mips/msa/i5-c.ll
+++ b/test/CodeGen/Mips/msa/i5-c.ll
@@ -1,7 +1,3 @@
-; Both endians should emit the same output for immediate instructions.
-; This is not currently true.
-; XFAIL: *
-
 ; Test the MSA intrinsics that are encoded with the I5 instruction format.
 ; There are lots of these so this covers those beginning with 'c'
 
diff --git a/test/CodeGen/Mips/msa/i5-m.ll b/test/CodeGen/Mips/msa/i5-m.ll
index 74e698b4752..27663494324 100644
--- a/test/CodeGen/Mips/msa/i5-m.ll
+++ b/test/CodeGen/Mips/msa/i5-m.ll
@@ -1,7 +1,3 @@
-; Both endians should emit the same output for immediate instructions.
-; This is not currently true.
-; XFAIL: *
-
 ; Test the MSA intrinsics that are encoded with the I5 instruction format.
 ; There are lots of these so this covers those beginning with 'm'
 
diff --git a/test/CodeGen/Mips/msa/i5-s.ll b/test/CodeGen/Mips/msa/i5-s.ll
index 60ba8e1e34b..184172f63b8 100644
--- a/test/CodeGen/Mips/msa/i5-s.ll
+++ b/test/CodeGen/Mips/msa/i5-s.ll
@@ -1,7 +1,3 @@
-; Both endians should emit the same output for immediate instructions.
-; This is not currently true.
-; XFAIL: *
-
 ; Test the MSA intrinsics that are encoded with the I5 instruction format.
 ; There are lots of these so this covers those beginning with 's'
 
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll b/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll
new file mode 100644
index 00000000000..4beaaa9c184
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll
@@ -0,0 +1,134 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Unexpected illegal type!" assertion.
+; It should at least successfully build.
+
+define void @autogen_SD1704963983(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <4 x double>
+  %A3 = alloca <8 x i64>
+  %A2 = alloca <1 x double>
+  %A1 = alloca double
+  %A = alloca i32
+  %L = load i8* %0
+  store i8 77, i8* %0
+  %E = extractelement <8 x i64> zeroinitializer, i32 2
+  %Shuff = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15, i32 1, i32 3>
+  %I = insertelement <8 x i64> zeroinitializer, i64 %E, i32 7
+  %Sl = select i1 false, i8* %0, i8* %0
+  %Cmp = icmp eq i32 434069, 272505
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF78, %BB
+  %L5 = load i8* %Sl
+  store i8 %L, i8* %Sl
+  %E6 = extractelement <8 x i32> zeroinitializer, i32 2
+  %Shuff7 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 undef>
+  %I8 = insertelement <8 x i64> zeroinitializer, i64 %4, i32 7
+  %B = shl <1 x i16> zeroinitializer, zeroinitializer
+  %FC = sitofp <8 x i64> zeroinitializer to <8 x float>
+  %Sl9 = select i1 %Cmp, i8 77, i8 77
+  %Cmp10 = icmp uge <8 x i64> %Shuff, zeroinitializer
+  %L11 = load i8* %0
+  store i8 %Sl9, i8* %0
+  %E12 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 undef, i32 3, i32 5, i32 7>
+  %I14 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 3
+  %B15 = udiv <1 x i16> %B, zeroinitializer
+  %Tr = trunc <8 x i64> %Shuff to <8 x i32>
+  %Sl16 = select i1 %Cmp, i8 77, i8 %5
+  %Cmp17 = icmp ult <8 x i1> %Cmp10, %Cmp10
+  %L18 = load i8* %Sl
+  store i8 -1, i8* %Sl
+  %E19 = extractelement <8 x i32> zeroinitializer, i32 3
+  %Shuff20 = shufflevector <8 x float> %FC, <8 x float> %FC, <8 x i32> <i32 6, i32 8, i32 undef, i32 12, i32 14, i32 0, i32 2, i32 undef>
+  %I21 = insertelement <8 x i64> %Shuff13, i64 %E, i32 0
+  %B22 = urem <8 x i64> %Shuff7, %I21
+  %FC23 = sitofp i32 50347 to float
+  %Sl24 = select i1 %Cmp, double 0.000000e+00, double 0.000000e+00
+  %Cmp25 = icmp ugt i32 465489, 47533
+  br i1 %Cmp25, label %CF, label %CF78
+
+CF78:                                             ; preds = %CF
+  %L26 = load i8* %Sl
+  store i32 50347, i32* %A
+  %E27 = extractelement <8 x i1> %Cmp10, i32 2
+  br i1 %E27, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF81, %CF78
+  %Shuff28 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 undef>
+  %I29 = insertelement <1 x i16> zeroinitializer, i16 -1, i32 0
+  %B30 = urem <8 x i32> %Tr, zeroinitializer
+  %Tr31 = trunc i32 0 to i16
+  %Sl32 = select i1 %Cmp, <2 x i1> zeroinitializer, <2 x i1> zeroinitializer
+  %L33 = load i8* %Sl
+  store i8 %L26, i8* %Sl
+  %E34 = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff35 = shufflevector <1 x i16> zeroinitializer, <1 x i16> %B, <1 x i32> undef
+  %I36 = insertelement <8 x i64> %Shuff28, i64 %E, i32 7
+  %B37 = srem <1 x i16> %I29, zeroinitializer
+  %FC38 = sitofp <8 x i32> %B30 to <8 x double>
+  %Sl39 = select i1 %Cmp, double 0.000000e+00, double %Sl24
+  %L40 = load i8* %Sl
+  store i8 %Sl16, i8* %Sl
+  %E41 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff42 = shufflevector <8 x i1> %Cmp17, <8 x i1> %Cmp10, <8 x i32> <i32 14, i32 undef, i32 2, i32 4, i32 undef, i32 8, i32 10, i32 12>
+  %I43 = insertelement <4 x i32> zeroinitializer, i32 272505, i32 0
+  %B44 = urem <8 x i32> %B30, %Tr
+  %PC = bitcast i8* %0 to i64*
+  %Sl45 = select i1 %Cmp, <8 x i1> %Cmp10, <8 x i1> %Shuff42
+  %Cmp46 = fcmp ugt float 0xB856238A00000000, 0x47DA795E40000000
+  br i1 %Cmp46, label %CF77, label %CF80
+
+CF80:                                             ; preds = %CF80, %CF77
+  %L47 = load i64* %PC
+  store i8 77, i8* %Sl
+  %E48 = extractelement <8 x i64> zeroinitializer, i32 2
+  %Shuff49 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff7, <8 x i32> <i32 5, i32 7, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 3>
+  %I50 = insertelement <8 x i64> zeroinitializer, i64 %L47, i32 7
+  %B51 = fdiv float 0x46CC2D8000000000, %FC23
+  %PC52 = bitcast <8 x i64>* %A3 to i64*
+  %Sl53 = select i1 %Cmp, <8 x i64> %Shuff, <8 x i64> %Shuff
+  %Cmp54 = fcmp ole float 0x47DA795E40000000, 0xB856238A00000000
+  br i1 %Cmp54, label %CF80, label %CF81
+
+CF81:                                             ; preds = %CF80
+  %L55 = load i8* %Sl
+  store i8 %Sl16, i8* %Sl
+  %E56 = extractelement <1 x i16> %B, i32 0
+  %Shuff57 = shufflevector <1 x i16> zeroinitializer, <1 x i16> zeroinitializer, <1 x i32> <i32 1>
+  %I58 = insertelement <8 x i64> zeroinitializer, i64 %L47, i32 7
+  %B59 = srem i32 %E19, %E19
+  %Sl60 = select i1 %Cmp, i8 77, i8 77
+  %Cmp61 = icmp ult <1 x i16> zeroinitializer, %B
+  %L62 = load i8* %Sl
+  store i64 %L47, i64* %PC52
+  %E63 = extractelement <4 x i32> %I43, i32 2
+  %Shuff64 = shufflevector <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
+  %I65 = insertelement <8 x i64> %B22, i64 %L47, i32 7
+  %B66 = add <8 x i64> %I50, %I65
+  %FC67 = uitofp i16 %E12 to float
+  %Sl68 = select i1 %Cmp, <8 x i32> %B30, <8 x i32> zeroinitializer
+  %Cmp69 = fcmp ord double 0.000000e+00, 0.000000e+00
+  br i1 %Cmp69, label %CF77, label %CF79
+
+CF79:                                             ; preds = %CF81
+  %L70 = load i32* %A
+  store i64 %4, i64* %PC
+  %E71 = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff72 = shufflevector <8 x i32> zeroinitializer, <8 x i32> %B44, <8 x i32> <i32 11, i32 undef, i32 15, i32 1, i32 3, i32 undef, i32 7, i32 9>
+  %I73 = insertelement <8 x i16> zeroinitializer, i16 %E12, i32 5
+  %B74 = fsub double 0.000000e+00, 0.000000e+00
+  %Sl75 = select i1 %Cmp46, i32 %E6, i32 %E19
+  %Cmp76 = icmp ugt <4 x i32> %I43, zeroinitializer
+  store i8 %L, i8* %Sl
+  store i64 %L47, i64* %PC
+  store i64 %L47, i64* %PC
+  store i8 %L5, i8* %Sl
+  store i8 %L5, i8* %0
+  ret void
+}
-- 
2.34.1