merge consecutive loads that are offset from a base address

author Sanjay Patel <spatel@rotateright.com>

Tue, 16 Dec 2014 21:57:18 +0000 (21:57 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 16 Dec 2014 21:57:18 +0000 (21:57 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 16 Dec 2014 21:57:18 +0000 (21:57 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 16 Dec 2014 21:57:18 +0000 (21:57 +0000)
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

index f9131e7f2b203fe8e959232f4bcc61a800be03c6..20eaa2965e2736d7d39b80a3d972068430454bc8 100644 (file)
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6553,11 +6553,25 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
      return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
    }
  
-  // Handle X+C
-  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
-      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
-    return true;
-
+  // Handle X + C.
+  if (isBaseWithConstantOffset(Loc)) {
+    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+    if (Loc.getOperand(0) == BaseLoc) {
+      // If the base location is a simple address with no offset itself, then
+      // the second load's first add operand should be the base address.
+      if (LocOffset == Dist * (int)Bytes)
+        return true;
+    } else if (isBaseWithConstantOffset(BaseLoc)) {
+      // The base location itself has an offset, so subtract that value from the
+      // second load's offset before comparing to distance * size.
+      int64_t BOffset =
+        cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+        if ((LocOffset - BOffset) == Dist * (int)Bytes)
+          return true;
+      }
+    }
+  }
    const GlobalValue *GV1 = nullptr;
    const GlobalValue *GV2 = nullptr;
    int64_t Offset1 = 0;
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll

index c88726e75a812623c82b97750cf4fa85de76aed4..72e6f78bdef7e3c3171fd87f356e806dbe94ee94 100644 (file)
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -1,13 +1,13 @@
  ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s
  
-;CHECK-LABEL: cftx020:
-;CHECK: vmovsd  (%rdi), %xmm{{.*}}
-;CHECK: vmovsd  16(%rdi), %xmm{{.*}}
-;CHECK: vmovsd  24(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd  8(%rdi), %xmm{{.*}}
-;CHECK: vmovupd %xmm{{.*}}, (%rdi)
-;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
-;CHECK: ret
+; CHECK-LABEL: cftx020:
+; CHECK: vmovsd  (%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovsd  16(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd  24(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd  8(%rdi), %xmm{{.*}}
+; CHECK: vmovupd %xmm{{.*}}, (%rdi)
+; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi)
+; CHECK: ret
  
  ; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
  define void @cftx020(double* nocapture %a) {
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll

index af4d6fa61fd878c23390cb10b340e65577d2fb65..fd132a52b8f1e19492b7ab29f7e4c63fd88c3daa 100644 (file)
--- a/test/CodeGen/X86/vec_loadsingles.ll
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -89,7 +89,7 @@ define <8 x float> @merge_8_floats(float* %ptr) {
  ; FAST32-NEXT: retq
  
  ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
  ; SLOW32-NEXT: retq
  }
  
@@ -112,7 +112,34 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
  ; FAST32-NEXT: retq
  
  ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 ) 
+; Recognize and combine consecutive loads even when the
+; first of the combined loads is offset from the base address.
+define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+  %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
+  %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
+  %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
+  %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
+  %e = load double* %arrayidx4, align 8
+  %f = load double* %arrayidx5, align 8
+  %g = load double* %arrayidx6, align 8
+  %h = load double* %arrayidx7, align 8
+  %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
+  %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
+  %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
+  %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
+  ret <4 x double> %vecinit7
+
+; ALL-LABEL: merge_4_doubles_offset
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
  ; SLOW32-NEXT: retq
  }
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 16 Dec 2014 21:57:18 +0000 (21:57 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 16 Dec 2014 21:57:18 +0000 (21:57 +0000)
lib/CodeGen/SelectionDAG/SelectionDAG.cpp		patch | blob | history
test/CodeGen/X86/chain_order.ll		patch | blob | history
test/CodeGen/X86/vec_loadsingles.ll		patch | blob | history