From 9b4cc76745cff2f823e726981febd83a8b6e05b3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 3 Feb 2015 18:54:00 +0000
Subject: [PATCH] Merge consecutive 16-byte loads into one 32-byte load
 (PR22329)

This patch detects consecutive vector loads using the existing
EltsFromConsecutiveLoads() logic. This fixes:
http://llvm.org/bugs/show_bug.cgi?id=22329

This patch effectively reverts the tablegen additions of D6492 /
http://reviews.llvm.org/rL224344 ...which in hindsight were a horrible hack.

The test cases that were added with that patch are simply modified to load
from varying offsets of a base pointer. These loads did not match the
existing tablegen patterns.

A happy side effect of doing this optimization earlier is that we can now
fold the load into a math op where possible; this is shown in some of the
updated checks in the test file.

Differential Revision: http://reviews.llvm.org/D7303

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@228006 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 62 +++++++++++++-----
 lib/Target/X86/X86InstrSSE.td                | 43 -------------
 test/CodeGen/X86/unaligned-32-byte-memops.ll | 67 +++++++++++---------
 3 files changed, 85 insertions(+), 87 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index df7984578ab..bf216c767d2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6011,9 +6011,9 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
   return SDValue();
 }
 
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
 ///
 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
 ///
@@ -6023,7 +6023,6 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
-  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
   LoadSDNode *LDBase = nullptr;
@@ -6034,7 +6033,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   // non-consecutive, bail out.
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue Elt = Elts[i];
-
+    // Look through a bitcast.
+    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+      Elt = Elt.getOperand(0);
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return SDValue();
@@ -6049,7 +6050,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+    EVT LdVT = Elt.getValueType();
+    // Each loaded element must be the correct fractional portion of the
+    // requested vector load.
+    if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+      return SDValue();
+    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
       return SDValue();
     LastLoadedElt = i;
   }
@@ -6058,6 +6064,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   // load of the entire vector width starting at the base pointer. If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+    assert(LDBase && "Did not find base load for merging consecutive loads");
+    EVT EltVT = LDBase->getValueType(0);
+    // Ensure that the input vector size for the merged loads matches the
+    // cumulative size of the input elements.
+    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+      return SDValue();
 
     if (isAfterLegalize &&
         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
@@ -6084,6 +6096,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   //TODO: The code below fires only for for loading the low v2i32 / v2f32
   //of a v4i32 / v4f32. It's probably worth generalizing.
+  EVT EltVT = VT.getVectorElementType();
   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -13164,25 +13177,44 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
   if (!Subtarget->hasAVX())
     return SDValue();
-
+
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
   SDValue SubVec = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
+
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
-
+
+  // Fold two 16-byte subvector loads into one 32-byte load:
+  // (insert_subvector (insert_subvector undef, (load addr), 0),
+  //                   (load addr + 16), Elts/2)
+  // --> load32 addr
+  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+      !Subtarget->isUnalignedMem32Slow()) {
+    SDValue SubVec2 = Vec.getOperand(1);
+    if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+      if (Idx2->getZExtValue() == 0) {
+        SDValue Ops[] = { SubVec2, SubVec };
+        SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+        if (LD.getNode())
+          return LD;
+      }
+    }
+  }
+
   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-      SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      SubVecVT.is128BitVector())
     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-  }
 
-  if (OpVT.is512BitVector() &&
-      SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-  }
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 0ab53f3183e..45656631830 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8141,49 +8141,6 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                    (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
-// Combine two consecutive 16-byte loads with a common destination register into
-// one 32-byte load to that register.
-let Predicates = [HasAVX, HasFastMem32] in {
-  def : Pat<(insert_subvector
-              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
-              (loadv4f32 (add addr:$src, (iPTR 16))),
-              (iPTR 4)),
-            (VMOVUPSYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
-              (loadv2f64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVUPDYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v32i8 (insert_subvector
-                undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 16)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v16i16 (insert_subvector
-                undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 8)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v8i32 (insert_subvector
-                undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 4)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
-              (loadv2i64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVDQUYrm addr:$src)>;
-}
-
 let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index 347f330d67a..9cec17d9432 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -65,8 +65,9 @@ define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
@@ -88,8 +89,9 @@ define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
@@ -111,8 +113,9 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
@@ -133,8 +136,9 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
@@ -160,12 +164,13 @@ define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddq
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddq
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
-  %v1 = load <2 x i64>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>* %ptr1, align 1
   %v2 = load <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
@@ -187,12 +192,13 @@ define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddd
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
-  %v1 = load <4 x i32>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>* %ptr1, align 1
   %v2 = load <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
@@ -214,12 +220,13 @@ define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddw
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddw
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
-  %v1 = load <8 x i16>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>* %ptr1, align 1
   %v2 = load <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
@@ -241,12 +248,13 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddb
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddb
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
-  %v1 = load <16 x i8>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>* %ptr1, align 1
   %v2 = load <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
@@ -261,16 +269,17 @@ define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x doubl
   ; SANDYB-NEXT: vaddpd
   ; SANDYB-NEXT: retq
 
-  ; BTVER2: vmovupd
-  ; BTVER2-NEXT: vaddpd
+  ; BTVER2-NOT: vinsertf128
+  ; BTVER2: vaddpd
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovupd
+  ; HASWELL-NOT: vinsertf128
   ; HASWELL: vaddpd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
-  %v1 = load <2 x double>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>* %ptr1, align 1
   %v2 = load <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x
-- 
2.34.1
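
A rough sketch of the transformation, not taken from the patch itself: IR in the
same style as the test cases above, with the instruction sequences one would
roughly expect before and after this change on a CPU where unaligned 32-byte
accesses are fast (the BTVER2/HASWELL configurations the test targets). The
function name and the exact assembly in the comments are assumptions for
illustration only.

  ; Two adjacent unaligned 16-byte loads concatenated into one 256-bit value.
  define <8 x float> @concat_adjacent_loads(<4 x float>* %ptr) {
    %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
    %v1 = load <4 x float>* %ptr, align 1
    %v2 = load <4 x float>* %ptr2, align 1
    %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ret <8 x float> %v3
  }

  ; Before this patch (two 16-byte loads plus an insert), roughly:
  ;   vmovups     (%rdi), %xmm0
  ;   vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
  ; After this patch (one unaligned 32-byte load), roughly:
  ;   vmovups     (%rdi), %ymm0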