From f365d3984e2934b182e866d545348988d3b681d5 Mon Sep 17 00:00:00 2001
From: Manman Ren <mren@apple.com>
Date: Tue, 27 Nov 2012 18:09:26 +0000
Subject: [PATCH] X86: do not fold load instructions such as [V]MOVS[S|D] to
 other instructions when the destination register is wider than the memory
 load.

These load instructions read only 32 or 64 bits from memory (m32 or m64) and
set the upper bits of the destination register to zero, while the instruction
they would be folded into may accept an m128 memory operand.

rdar://12721174


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168710 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp   | 15 ++++++++++++
 test/CodeGen/X86/fold-load-vec.ll | 39 +++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 test/CodeGen/X86/fold-load-vec.ll

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 5a99ff004d4..4e31af14eb6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3982,6 +3982,21 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     break;
   }
   default: {
+    if ((LoadMI->getOpcode() == X86::MOVSSrm ||
+         LoadMI->getOpcode() == X86::VMOVSSrm) &&
+        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+          > 4)
+      // [V]MOVSSrm loads only 32 bits (4 bytes) from memory; refuse to fold it
+      // when its destination register class is wider than that.
+      return NULL;
+    if ((LoadMI->getOpcode() == X86::MOVSDrm ||
+         LoadMI->getOpcode() == X86::VMOVSDrm) &&
+        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+          > 8)
+      // [V]MOVSDrm loads only 64 bits (8 bytes) from memory; refuse to fold it
+      // when its destination register class is wider than that.
+      return NULL;
+
     // Folding a normal load. Just copy the load's address operands.
     unsigned NumOps = LoadMI->getDesc().getNumOperands();
     for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
new file mode 100644
index 00000000000..c1756d5e2e1
--- /dev/null
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+
+; rdar://12721174
+; We should not fold movss into pshufd since pshufd expects m128 while movss
+; loads from m32.
+define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
+; CHECK: sample_test
+; CHECK: movss
+; CHECK: pshufd
+entry:
+  %source.addr = alloca <4 x float>*, align 8
+  %dest.addr = alloca <2 x float>*, align 8
+  %tmp = alloca <2 x float>, align 8
+  store <4 x float>* %source, <4 x float>** %source.addr, align 8
+  store <2 x float>* %dest, <2 x float>** %dest.addr, align 8
+  store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8
+  %0 = load <4 x float>** %source.addr, align 8
+  %arrayidx = getelementptr inbounds <4 x float>* %0, i64 0
+  %1 = load <4 x float>* %arrayidx, align 16
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = load <2 x float>* %tmp, align 8
+  %4 = insertelement <2 x float> %3, float %2, i32 1
+  store <2 x float> %4, <2 x float>* %tmp, align 8
+  %5 = load <2 x float>* %tmp, align 8
+  %6 = load <2 x float>** %dest.addr, align 8
+  %arrayidx1 = getelementptr inbounds <2 x float>* %6, i64 0
+  store <2 x float> %5, <2 x float>* %arrayidx1, align 8
+  %7 = load <2 x float>** %dest.addr, align 8
+  %arrayidx2 = getelementptr inbounds <2 x float>* %7, i64 0
+  %8 = load <2 x float>* %arrayidx2, align 8
+  %vecext = extractelement <2 x float> %8, i32 0
+  %9 = load <2 x float>** %dest.addr, align 8
+  %arrayidx3 = getelementptr inbounds <2 x float>* %9, i64 0
+  %10 = load <2 x float>* %arrayidx3, align 8
+  %vecext4 = extractelement <2 x float> %10, i32 1
+  call void @ext(float %vecext, float %vecext4)
+  ret void
+}
+declare void @ext(float, float)
-- 
2.34.1