From 546611e3987a8684f2c260d3793964494bf97950 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Sat, 21 Nov 2015 13:25:07 +0000
Subject: [PATCH] [DAGCombiner] Bugfix for lost chain depenedency.

When MergeConsecutiveStores() combines two loads and two stores into
wider loads and stores, the chain users of both of the original loads
must be transfered to the new load, because it may be that a chain
user only depends on one of the loads.

New test case: test/CodeGen/SystemZ/dag-combine-01.ll

Reviewed by James Y Knight.

Bugzilla: https://llvm.org/bugs/show_bug.cgi?id=25310#c6

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253779 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 ++---
 test/CodeGen/SystemZ/dag-combine-01.ll   | 97 ++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 13 deletions(-)
 create mode 100644 test/CodeGen/SystemZ/dag-combine-01.ll
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index af73b62546e..1cf5c4c913c 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11529,7 +11529,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
   StartAddress = LoadNodes[0].OffsetFromBase;
   SDValue FirstChain = FirstLoad->getChain();
   for (unsigned i = 1; i < LoadNodes.size(); ++i) {
-    // All loads much share the same chain.
+    // All loads must share the same chain.
     if (LoadNodes[i].MemNode->getChain() != FirstChain)
       break;
 
@@ -11621,8 +11621,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
   SDLoc LoadDL(LoadNodes[0].MemNode);
   SDLoc StoreDL(StoreNodes[0].MemNode);
 
-  // The merged loads are required to have the same chain, so using the first's
-  // chain is acceptable.
+  // The merged loads are required to have the same incoming chain, so
+  // using the first's chain is acceptable.
   SDValue NewLoad = DAG.getLoad(
       JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
       FirstLoad->getPointerInfo(), false, false, false, FirstLoadAlign);
@@ -11634,17 +11634,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
     NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
       FirstInChain->getPointerInfo(), false, false, FirstStoreAlign);
 
-  // Replace one of the loads with the new load.
-  LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[0].MemNode);
-  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
-                                SDValue(NewLoad.getNode(), 1));
-
-  // Remove the rest of the load chains.
-  for (unsigned i = 1; i < NumElem ; ++i) {
-    // Replace all chain users of the old load nodes with the chain of the new
-    // load node.
+  // Transfer chain users from old loads to the new load.
+  for (unsigned i = 0; i < NumElem; ++i) {
     LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Ld->getChain());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+                                  SDValue(NewLoad.getNode(), 1));
   }
 
   // Replace the last store with the new store.
diff --git a/test/CodeGen/SystemZ/dag-combine-01.ll b/test/CodeGen/SystemZ/dag-combine-01.ll
new file mode 100644
index 00000000000..a56a118dada
--- /dev/null
+++ b/test/CodeGen/SystemZ/dag-combine-01.ll
@@ -0,0 +1,97 @@
+; Test that MergeConsecutiveStores() does not during DAG combining
+; incorrectly drop a chain dependency to a store previously chained to
+; one of two combined loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+@A = common global [2048 x float] zeroinitializer, align 4
+
+; Function Attrs: nounwind
+define signext i32 @main(i32 signext %argc, i8** nocapture readnone %argv) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv24 = phi i64 [ 0, %entry ], [ %indvars.iv.next25, %for.body ]
+  %sum.018 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %0 = trunc i64 %indvars.iv24 to i32
+  %conv = sitofp i32 %0 to float
+  %arrayidx = getelementptr inbounds [2048 x float], [2048 x float]* @A, i64 0, i64 %indvars.iv24
+  store float %conv, float* %arrayidx, align 4
+  %add = fadd float %sum.018, %conv
+  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
+  %exitcond26 = icmp eq i64 %indvars.iv.next25, 2048
+  br i1 %exitcond26, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  br label %for.body.3.lr.ph.i.preheader
+
+for.body.3.lr.ph.i.preheader:                     ; preds = %complex_transpose.exit, %for.end
+  %i.116 = phi i32 [ 0, %for.end ], [ %inc9, %complex_transpose.exit ]
+  br label %for.body.3.lr.ph.i
+
+for.body.3.lr.ph.i:                               ; preds = %for.body.3.lr.ph.i.preheader, %for.inc.40.i
+  %indvars.iv19 = phi i32 [ 1, %for.body.3.lr.ph.i.preheader ], [ %indvars.iv.next20, %for.inc.40.i ]
+  %indvars.iv57.i = phi i64 [ 1, %for.body.3.lr.ph.i.preheader ], [ %indvars.iv.next58.i, %for.inc.40.i ]
+  %1 = shl nsw i64 %indvars.iv57.i, 1
+  %2 = shl nsw i64 %indvars.iv57.i, 6
+  br label %for.body.3.i
+
+for.body.3.i:                                     ; preds = %for.body.3.i, %for.body.3.lr.ph.i
+; CHECK-LABEL: .LBB0_5:
+; CHECK-NOT:    stfh    %r{{.*}}, 0(%r{{.*}})
+; CHECK:        lg      %r{{.*}}, -4(%r{{.*}})
+; Overlapping load should go before the store
+  %indvars.iv.i = phi i64 [ 0, %for.body.3.lr.ph.i ], [ %indvars.iv.next.i, %for.body.3.i ]
+  %3 = shl nsw i64 %indvars.iv.i, 6
+  %4 = add nuw nsw i64 %3, %1
+  %arrayidx.i = getelementptr inbounds [2048 x float], [2048 x float]* @A, i64 0, i64 %4
+  %5 = bitcast float* %arrayidx.i to i32*
+  %6 = load i32, i32* %5, align 4
+  %arrayidx9.i = getelementptr inbounds float, float* getelementptr inbounds ([2048 x float], [2048 x float]* @A, i64 0, i64 1), i64 %4
+  %7 = bitcast float* %arrayidx9.i to i32*
+  %8 = load i32, i32* %7, align 4
+  %9 = shl nsw i64 %indvars.iv.i, 1
+  %10 = add nuw nsw i64 %9, %2
+  %arrayidx14.i = getelementptr inbounds [2048 x float], [2048 x float]* @A, i64 0, i64 %10
+  %11 = bitcast float* %arrayidx14.i to i32*
+  %12 = load i32, i32* %11, align 4
+  %arrayidx19.i = getelementptr inbounds float, float* getelementptr inbounds ([2048 x float], [2048 x float]* @A, i64 0, i64 1), i64 %10
+  %13 = bitcast float* %arrayidx19.i to i32*
+  %14 = load i32, i32* %13, align 4
+  store i32 %6, i32* %11, align 4
+  store i32 %8, i32* %13, align 4
+  store i32 %12, i32* %5, align 4
+  store i32 %14, i32* %7, align 4
+  %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
+  %exitcond21 = icmp eq i32 %lftr.wideiv, %indvars.iv19
+  br i1 %exitcond21, label %for.inc.40.i, label %for.body.3.i
+
+for.inc.40.i:                                     ; preds = %for.body.3.i
+  %indvars.iv.next58.i = add nuw nsw i64 %indvars.iv57.i, 1
+  %indvars.iv.next20 = add nuw nsw i32 %indvars.iv19, 1
+  %exitcond22 = icmp eq i64 %indvars.iv.next58.i, 32
+  br i1 %exitcond22, label %complex_transpose.exit, label %for.body.3.lr.ph.i
+
+complex_transpose.exit:                           ; preds = %for.inc.40.i
+  %inc9 = add nuw nsw i32 %i.116, 1
+  %exitcond23 = icmp eq i32 %inc9, 10
+  br i1 %exitcond23, label %for.body.14.preheader, label %for.body.3.lr.ph.i.preheader
+
+for.body.14.preheader:                            ; preds = %complex_transpose.exit
+  br label %for.body.14
+
+for.body.14:                                      ; preds = %for.body.14.preheader, %for.body.14
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.14 ], [ 0, %for.body.14.preheader ]
+  %sum.115 = phi float [ %add17, %for.body.14 ], [ 0.000000e+00, %for.body.14.preheader ]
+  %arrayidx16 = getelementptr inbounds [2048 x float], [2048 x float]* @A, i64 0, i64 %indvars.iv
+  %15 = load float, float* %arrayidx16, align 4
+  %add17 = fadd float %sum.115, %15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 2048
+  br i1 %exitcond, label %for.end.20, label %for.body.14
+
+for.end.20:                                       ; preds = %for.body.14
+  ret i32 0
+}
-- 
2.34.1