From 7a5e55509b99d579d56d126a7b503ec6fe153a8f Mon Sep 17 00:00:00 2001
From: Eli Friedman
Date: Sun, 7 Jun 2009 06:52:44 +0000
Subject: [PATCH] Slightly generalize the code that handles shuffles of
 consecutive loads on x86 to handle more cases.

Fix a bug in said code that would cause it to read past the end of an
object.

Rewrite the code in SelectionDAGLegalize::ExpandBUILD_VECTOR to be a bit
more general.  Remove PerformBuildVectorCombine, which is no longer
necessary with these changes.  In addition to simplifying the code, with
this change, we can now catch a few more cases of consecutive loads.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73012 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 115 ++++++------
 lib/Target/X86/X86ISelLowering.cpp       | 143 +++++++-----
 test/CodeGen/X86/vec_loadsingles.ll      |  12 ++
 test/CodeGen/X86/vec_set-5.ll            |   3 +-
 test/CodeGen/X86/vec_set-6.ll            |   2 +-
 5 files changed, 90 insertions(+), 185 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_loadsingles.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5ae183e2fa0..f3c2833e0fe 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1785,48 +1785,41 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
 /// support the operation, but do support the resultant vector type.
 SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
   unsigned NumElems = Node->getNumOperands();
-  SDValue SplatValue = Node->getOperand(0);
+  SDValue Value1, Value2;
   DebugLoc dl = Node->getDebugLoc();
   MVT VT = Node->getValueType(0);
-  MVT OpVT = SplatValue.getValueType();
+  MVT OpVT = Node->getOperand(0).getValueType();
   MVT EltVT = VT.getVectorElementType();
 
   // If the only non-undef value is the low element, turn this into a
   // SCALAR_TO_VECTOR node.  If this is { X, X, X, X }, determine X.
   bool isOnlyLowElement = true;
-
-  // FIXME: it would be far nicer to change this into map<SDValue,uint64_t>
-  // and use a bitmask instead of a list of elements.
-  // FIXME: this doesn't treat <0, u, 0, u> for example, as a splat.
-  std::map<SDValue, std::vector<unsigned> > Values;
-  Values[SplatValue].push_back(0);
+  bool MoreThanTwoValues = false;
   bool isConstant = true;
-  if (!isa<ConstantFPSDNode>(SplatValue) && !isa<ConstantSDNode>(SplatValue) &&
-      SplatValue.getOpcode() != ISD::UNDEF)
-    isConstant = false;
-
-  for (unsigned i = 1; i < NumElems; ++i) {
+  for (unsigned i = 0; i < NumElems; ++i) {
     SDValue V = Node->getOperand(i);
-    Values[V].push_back(i);
-    if (V.getOpcode() != ISD::UNDEF)
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
       isOnlyLowElement = false;
-    if (SplatValue != V)
-      SplatValue = SDValue(0, 0);
-
-    // If this isn't a constant element or an undef, we can't use a constant
-    // pool load.
-    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V) &&
-        V.getOpcode() != ISD::UNDEF)
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
       isConstant = false;
+
+    if (!Value1.getNode()) {
+      Value1 = V;
+    } else if (!Value2.getNode()) {
+      if (V != Value1)
+        Value2 = V;
+    } else if (V != Value1 && V != Value2) {
+      MoreThanTwoValues = true;
+    }
   }
 
-  if (isOnlyLowElement) {
-    // If the low element is an undef too, then this whole things is an undef.
-    if (Node->getOperand(0).getOpcode() == ISD::UNDEF)
-      return DAG.getUNDEF(VT);
-    // Otherwise, turn this into a scalar_to_vector node.
+  if (!Value1.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
-  }
 
   // If all elements are constants, create a load from the constant pool.
   if (isConstant) {
@@ -1852,59 +1845,25 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
                        false, Alignment);
   }
 
-  if (SplatValue.getNode()) {   // Splat of one value?
-    // Build the shuffle constant vector: <0, 0, 0, 0>
-    SmallVector<int, 8> ZeroVec(NumElems, 0);
-
-    // If the target supports VECTOR_SHUFFLE and this shuffle mask, use it.
-    if (TLI.isShuffleMaskLegal(ZeroVec, Node->getValueType(0))) {
+  if (!MoreThanTwoValues) {
+    SmallVector<int, 8> ShuffleVec(NumElems, -1);
+    for (unsigned i = 0; i < NumElems; ++i) {
+      SDValue V = Node->getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      ShuffleVec[i] = V == Value1 ? 0 : NumElems;
+    }
+    if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
       // Get the splatted value into the low element of a vector register.
-      SDValue LowValVec =
-        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, SplatValue);
+      SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
+      SDValue Vec2;
+      if (Value2.getNode())
+        Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
+      else
+        Vec2 = DAG.getUNDEF(VT);
 
       // Return shuffle(LowValVec, undef, <0,0,0,0>)
-      return DAG.getVectorShuffle(VT, dl, LowValVec, DAG.getUNDEF(VT),
-                                  &ZeroVec[0]);
-    }
-  }
-
-  // If there are only two unique elements, we may be able to turn this into a
-  // vector shuffle.
-  if (Values.size() == 2) {
-    // Get the two values in deterministic order.
-    SDValue Val1 = Node->getOperand(1);
-    SDValue Val2;
-    std::map<SDValue, std::vector<unsigned> >::iterator MI = Values.begin();
-    if (MI->first != Val1)
-      Val2 = MI->first;
-    else
-      Val2 = (++MI)->first;
-
-    // If Val1 is an undef, make sure it ends up as Val2, to ensure that our
-    // vector shuffle has the undef vector on the RHS.
-    if (Val1.getOpcode() == ISD::UNDEF)
-      std::swap(Val1, Val2);
-
-    // Build the shuffle constant vector: e.g. <0, 4, 0, 4>
-    SmallVector<int, 8> ShuffleMask(NumElems, -1);
-
-    // Set elements of the shuffle mask for Val1.
-    std::vector<unsigned> &Val1Elts = Values[Val1];
-    for (unsigned i = 0, e = Val1Elts.size(); i != e; ++i)
-      ShuffleMask[Val1Elts[i]] = 0;
-
-    // Set elements of the shuffle mask for Val2.
-    std::vector<unsigned> &Val2Elts = Values[Val2];
-    for (unsigned i = 0, e = Val2Elts.size(); i != e; ++i)
-      if (Val2.getOpcode() != ISD::UNDEF)
-        ShuffleMask[Val2Elts[i]] = NumElems;
-
-    // If the target supports SCALAR_TO_VECTOR and this shuffle mask, use it.
-    if (TLI.isOperationLegalOrCustom(ISD::SCALAR_TO_VECTOR, VT) &&
-        TLI.isShuffleMaskLegal(ShuffleMask, VT)) {
-      Val1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val1);
-      Val2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val2);
-      return DAG.getVectorShuffle(VT, dl, Val1, Val2, &ShuffleMask[0]);
+      return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
     }
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ef60ff5e092..7d18b968f78 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7691,13 +7691,15 @@ static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
 }
 
 static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     MVT EVT, SDNode *&Base,
+                                     MVT EVT, LoadSDNode *&LDBase,
+                                     unsigned &LastLoadedElt,
                                      SelectionDAG &DAG, MachineFrameInfo *MFI,
                                      const TargetLowering &TLI) {
-  Base = NULL;
+  LDBase = NULL;
+  LastLoadedElt = -1;
   for (unsigned i = 0; i < NumElems; ++i) {
     if (N->getMaskElt(i) < 0) {
-      if (!Base)
+      if (!LDBase)
         return false;
       continue;
     }
@@ -7706,19 +7708,20 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return false;
-    if (!Base) {
-      Base = Elt.getNode();
-      if (Base->getOpcode() == ISD::UNDEF)
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
         return false;
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
       continue;
     }
     if (Elt.getOpcode() == ISD::UNDEF)
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    LoadSDNode *LDBase = cast<LoadSDNode>(Base);
     if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
       return false;
+    LastLoadedElt = i;
   }
   return true;
 }
@@ -7737,6 +7740,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
   unsigned NumElems = VT.getVectorNumElements();
 
+  if (VT.getSizeInBits() != 128)
+    return SDValue();
+
   // For x86-32 machines, if we see an insert and then a shuffle in a v2i64
   // where the upper half is 0, it is advantageous to rewrite it as a build
   // vector of (0, val) so it can use movq.
@@ -7764,107 +7770,24 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 
   // Try to combine a vector_shuffle into a 128-bit load.
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  SDNode *Base = NULL;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, Base, DAG, MFI, TLI))
+  LoadSDNode *LD = NULL;
+  unsigned LastLoadedElt;
+  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
+                                MFI, TLI))
     return SDValue();
 
-  LoadSDNode *LD = cast<LoadSDNode>(Base);
-  if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
+  if (LastLoadedElt == NumElems - 1) {
+    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
+      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+                         LD->getSrcValue(), LD->getSrcValueOffset(),
+                         LD->isVolatile());
     return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                        LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile());
-  return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                     LD->getSrcValue(), LD->getSrcValueOffset(),
-                     LD->isVolatile(), LD->getAlignment());
-}
-
-/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
-static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
-                                         TargetLowering::DAGCombinerInfo &DCI,
-                                         const X86Subtarget *Subtarget,
-                                         const TargetLowering &TLI) {
-  unsigned NumOps = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
-
-  // Ignore single operand BUILD_VECTOR.
-  if (NumOps == 1)
-    return SDValue();
-
-  MVT VT = N->getValueType(0);
-  MVT EVT = VT.getVectorElementType();
-
-  // Before or during type legalization, we want to try and convert a
-  // build_vector of an i64 load and a zero value into vzext_movl before the
-  // legalizer can break it up.
-  // FIXME: does the case below remove the need to do this?
-  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) {
-    if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
-      return SDValue();
-
-    // This must be an insertion into a zero vector.
-    SDValue HighElt = N->getOperand(1);
-    if (!isZeroNode(HighElt))
-      return SDValue();
-
-    // Value must be a load.
-    SDNode *Base = N->getOperand(0).getNode();
-    if (!isa<LoadSDNode>(Base)) {
-      if (Base->getOpcode() != ISD::BIT_CONVERT)
-        return SDValue();
-      Base = Base->getOperand(0).getNode();
-      if (!isa<LoadSDNode>(Base))
-        return SDValue();
-    }
-
-    // Transform it into VZEXT_LOAD addr.
-    LoadSDNode *LD = cast<LoadSDNode>(Base);
-
-    // Load must not be an extload.
-    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
-      return SDValue();
-
-    // Load type should legal type so we don't have to legalize it.
-    if (!TLI.isTypeLegal(VT))
-      return SDValue();
-
-    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    TargetLowering::TargetLoweringOpt TLO(DAG);
-    TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
-    DCI.CommitTargetLoweringOpt(TLO);
-    return ResNode;
-  }
-
-  // The type legalizer will have broken apart v2i64 build_vector created during
-  // widening before the code which handles that case is run.  Look for build
-  // vector (load, load + 4, 0/undef, 0/undef)
-  if (VT == MVT::v4i32 || VT == MVT::v4f32) {
-    LoadSDNode *LD0 = dyn_cast<LoadSDNode>(N->getOperand(0));
-    LoadSDNode *LD1 = dyn_cast<LoadSDNode>(N->getOperand(1));
-    if (!LD0 || !LD1)
-      return SDValue();
-    if (LD0->getExtensionType() != ISD::NON_EXTLOAD ||
-        LD1->getExtensionType() != ISD::NON_EXTLOAD)
-      return SDValue();
-    // Make sure the second elt is a consecutive load.
-    if (!TLI.isConsecutiveLoad(LD1, LD0, EVT.getSizeInBits()/8, 1,
-                               DAG.getMachineFunction().getFrameInfo()))
-      return SDValue();
-
-    SDValue N2 = N->getOperand(2);
-    SDValue N3 = N->getOperand(3);
-    if (!isZeroNode(N2) && N2.getOpcode() != ISD::UNDEF)
-      return SDValue();
-    if (!isZeroNode(N3) && N3.getOpcode() != ISD::UNDEF)
-      return SDValue();
-
+                       LD->isVolatile(), LD->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-    SDValue Ops[] = { LD0->getChain(), LD0->getBasePtr() };
+    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
     SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    TargetLowering::TargetLoweringOpt TLO(DAG);
-    TLO.CombineTo(SDValue(LD0, 1), ResNode.getValue(1));
-    DCI.CommitTargetLoweringOpt(TLO);
     return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
   }
   return SDValue();
@@ -8466,14 +8389,25 @@ static SDValue PerformBTCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() == ISD::BIT_CONVERT)
+    Op = Op.getOperand(0);
+  MVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() ==
+      OpVT.getVectorElementType().getSizeInBits()) {
+    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+  }
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
-  case ISD::BUILD_VECTOR:
-    return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
@@ -8485,6 +8419,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
+  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   }
 
   return SDValue();
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
new file mode 100644
index 00000000000..67122763ec9
--- /dev/null
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+
+define <4 x float> @a(<4 x float> %a, float* nocapture %p) nounwind readonly {
+entry:
+  %tmp1 = load float* %p
+  %vecins = insertelement <4 x float> undef, float %tmp1, i32 0
+  %add.ptr = getelementptr float* %p, i32 1
+  %tmp5 = load float* %add.ptr
+  %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
+  ret <4 x float> %vecins7
+}
+
diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll
index 4fc652c022a..d3329701119 100644
--- a/test/CodeGen/X86/vec_set-5.ll
+++ b/test/CodeGen/X86/vec_set-5.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
 ; RUN: grep movlhps %t | count 1
-; RUN: grep movq %t | count 1
-; RUN: grep movsd %t | count 1
+; RUN: grep movq %t | count 2
 
 define <4 x float> @test1(float %a, float %b) nounwind {
 	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0		; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 02df526cee8..c7b6747a86f 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
 ; RUN: grep movss %t | count 1
-; RUN: grep movups %t | count 1
+; RUN: grep movq %t | count 1
 ; RUN: grep shufps %t | count 1
 
 define <4 x float> @test(float %a, float %b, float %c) nounwind {
-- 
2.34.1
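
Note on the ExpandBUILD_VECTOR rewrite: the new code makes a single pass
that buckets the BUILD_VECTOR operands into at most two distinct defined
values, which is what lets it shuffle-lower vectors like <X, undef, X, Y>
that the old splat-only and two-unique-element paths missed.  The sketch
below is a condensed, self-contained model of that classification loop;
the names Value1, Value2, MoreThanTwoValues, and isOnlyLowElement mirror
the patch, but plain nonzero ints stand in for SDValue operands and 0
stands in for ISD::UNDEF, so none of this is actual LLVM API.

#include <cassert>
#include <vector>

// Model of the classification loop the patch adds to ExpandBUILD_VECTOR.
// Elements are ints here instead of SDValues; 0 plays the role of UNDEF.
// The real code tests V.getOpcode() == ISD::UNDEF and SDValue identity.
struct BuildVectorInfo {
  int Value1 = 0, Value2 = 0;      // up to two distinct defined values
  bool IsOnlyLowElement = true;    // only element 0 is defined
  bool MoreThanTwoValues = false;  // shuffle lowering is impossible
};

BuildVectorInfo classify(const std::vector<int> &Elts) {
  BuildVectorInfo R;
  for (unsigned i = 0; i < Elts.size(); ++i) {
    int V = Elts[i];
    if (V == 0)                    // "undef" elements constrain nothing
      continue;
    if (i > 0)
      R.IsOnlyLowElement = false;
    if (!R.Value1) {
      R.Value1 = V;                // first defined value seen
    } else if (!R.Value2) {
      if (V != R.Value1)
        R.Value2 = V;              // second distinct value
    } else if (V != R.Value1 && V != R.Value2) {
      R.MoreThanTwoValues = true;  // three values: give up on shuffles
    }
  }
  return R;
}

int main() {
  // <X, undef, X, Y> has two values, so it can become
  // shuffle(scalar_to_vector(X), scalar_to_vector(Y), <0, u, 0, 4>).
  BuildVectorInfo R = classify({7, 0, 7, 9});
  assert(R.Value1 == 7 && R.Value2 == 9 && !R.MoreThanTwoValues);
  return 0;
}

With this classification in hand, the patch builds one shuffle mask that
covers the splat case (no Value2) and the two-value case uniformly,
subsuming the two separate code paths it deletes.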
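Note on the x86 combine: once EltsFromConsecutiveLoads succeeds, the
rewritten PerformShuffleCombine picks the replacement load based on
LastLoadedElt.  This is also where the read-past-the-end bug from the log
message is fixed: the old code emitted a full 16-byte load even when only
a prefix of the elements was actually loaded.  Below is a minimal model of
that decision under invented names (the enum and pickCombine are
illustrative only, not LLVM API).

#include <cassert>

// Which instruction-sized load may replace a shuffle of consecutive loads?
enum CombineKind {
  NoCombine,
  Full128BitLoad,  // movaps/movups: loads cover every vector element
  ZextHalfLoad     // movq: only the low two of four elements are loaded
};

CombineKind pickCombine(unsigned NumElems, int LastLoadedElt) {
  if (LastLoadedElt == int(NumElems) - 1)
    return Full128BitLoad;  // safe: loads reach the end of the vector
  if (NumElems == 4 && LastLoadedElt == 1)
    return ZextHalfLoad;    // low 64 bits loaded, upper half undef/zero
  return NoCombine;         // e.g. elements <0,1,2> of 4: a 16-byte load
                            // could read past the end of the object
}

int main() {
  assert(pickCombine(4, 3) == Full128BitLoad);
  assert(pickCombine(4, 1) == ZextHalfLoad);  // the vec_loadsingles.ll case
  assert(pickCombine(4, 2) == NoCombine);     // the old out-of-bounds case
  return 0;
}

The (4, 1) case corresponds to the new vec_loadsingles.ll test, where two
adjacent f32 loads become a single movq, and the updated grep counts in
vec_set-5.ll and vec_set-6.ll reflect the same lowering.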