From a5e1362f968568d66d76ddcdcff4ab98e203a48c Mon Sep 17 00:00:00 2001
From: Evan Cheng <evan.cheng@apple.com>
Date: Fri, 7 Jan 2011 19:35:30 +0000
Subject: [PATCH] Revert r122955. It seems using movups to lower memcpy can
 cause massive regressions (even on Nehalem) in edge cases. I also didn't see
 any real performance benefit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123015 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  6 +-
 test/CodeGen/X86/2010-04-08-CoalescerBug.ll   |  4 +-
 .../X86/2010-06-25-CoalescerSubRegDefDead.ll  |  2 +-
 .../X86/2010-09-17-SideEffectsInChain.ll      |  8 +--
 test/CodeGen/X86/memcpy-2.ll                  | 58 ++++++++++++++++---
 test/CodeGen/X86/memcpy.ll                    | 42 ++++++--------
 test/CodeGen/X86/memset-2.ll                  | 16 +----
 test/CodeGen/X86/memset64-on-x86-32.ll        |  3 +-
 test/CodeGen/X86/small-byval-memcpy.ll        |  8 +--
 test/CodeGen/X86/tlv-1.ll                     |  8 +--
 test/CodeGen/X86/unaligned-load.ll            | 25 +++++---
 11 files changed, 101 insertions(+), 79 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ddec78bfff3..f871b5a7701 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1063,8 +1063,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   // linux.  This is because the stack realignment code can't handle certain
   // cases like PR2962.  This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
-  if (NonScalarIntSafe && !F->hasFnAttr(Attribute::NoImplicitFloat)) {
+  if (NonScalarIntSafe &&
+      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
     if (Size >= 16 &&
+        (Subtarget->isUnalignedMemAccessFast() ||
+         ((DstAlign == 0 || DstAlign >= 16) &&
+          (SrcAlign == 0 || SrcAlign >= 16))) &&
         Subtarget->getStackAlignment() >= 16) {
       if (Subtarget->hasSSE2())
         return MVT::v4i32;
diff --git a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
index 09b7711db5d..1c7c28c68e9 100644
--- a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
+++ b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
 ; rdar://7842028
 
 ; Do not delete partially dead copy instructions.
@@ -9,7 +9,7 @@
 %struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 }
 %struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 }
 
-define void @t(%struct.F* %this) nounwind optsize {
+define void @t(%struct.F* %this) nounwind {
 entry:
 ; CHECK: t:
 ; CHECK: addq $12, %rsi
diff --git a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
index f31cdad8614..6db3ce1f42c 100644
--- a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
+++ b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
@@ -26,7 +26,7 @@ bb:
 ; CHECK: rep;stosl
 
   %tmp5 = bitcast i32* %tmp4 to i8*
-  call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 124, i32 4, i1 false)
+  call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false)
   %tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62
   store i32* null, i32** %tmp6, align 8
   br label %bb1
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
index 66dc0eabac3..8fe0309421e 100644
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -19,8 +19,8 @@ entry:
 }
 
 ; CHECK: movq	___stack_chk_guard@GOTPCREL(%rip), %rax
-; CHECK: movb	30(%rsp), %cl
-; CHECK: movb	(%rsp), %dl
-; CHECK: movb	%dl, (%rsp)
-; CHECK: movb	%cl, 30(%rsp)
+; CHECK: movb	30(%rsp), %dl
+; CHECK: movb	(%rsp), %sil
+; CHECK: movb	%sil, (%rsp)
+; CHECK: movb	%dl, 30(%rsp)
 ; CHECK: callq	___stack_chk_fail
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index 9078e4ba533..17cd8e868a2 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mattr=+sse2      -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
 ; RUN: llc < %s -mattr=-sse       -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
 ; RUN: llc < %s                 -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
 
@@ -14,6 +15,13 @@ entry:
 ; SSE2: movl $0
 ; SSE2: movl $0
 
+; SSE1: t1:
+; SSE1: movaps _.str, %xmm0
+; SSE1: movaps %xmm0
+; SSE1: movb $0
+; SSE1: movl $0
+; SSE1: movl $0
+
 ; NOSSE: t1:
 ; NOSSE: movb $0
 ; NOSSE: movl $0
@@ -43,6 +51,10 @@ entry:
 ; SSE2: movaps (%eax), %xmm0
 ; SSE2: movaps %xmm0, (%eax)
 
+; SSE1: t2:
+; SSE1: movaps (%eax), %xmm0
+; SSE1: movaps %xmm0, (%eax)
+
 ; NOSSE: t2:
 ; NOSSE: movl
 ; NOSSE: movl
@@ -67,8 +79,22 @@ entry:
 define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
 entry:
 ; SSE2: t3:
-; SSE2: movups (%eax), %xmm0
-; SSE2: movups %xmm0, (%eax)
+; SSE2: movsd (%eax), %xmm0
+; SSE2: movsd 8(%eax), %xmm1
+; SSE2: movsd %xmm1, 8(%eax)
+; SSE2: movsd %xmm0, (%eax)
+
+; SSE1: t3:
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
 
 ; NOSSE: t3:
 ; NOSSE: movl
@@ -83,8 +109,10 @@ entry:
 ; NOSSE: movl
 
 ; X86-64: t3:
-; X86-64: movups (%rsi), %xmm0
-; X86-64: movups %xmm0, (%rdi)
+; X86-64: movq (%rsi), %rax
+; X86-64: movq 8(%rsi), %rcx
+; X86-64: movq %rcx, 8(%rdi)
+; X86-64: movq %rax, (%rdi)
   %tmp2 = bitcast %struct.s0* %a to i8*           ; <i8*> [#uses=1]
   %tmp3 = bitcast %struct.s0* %b to i8*           ; <i8*> [#uses=1]
   tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
@@ -94,12 +122,24 @@ entry:
 define void @t4() nounwind {
 entry:
 ; SSE2: t4:
-; SSE2: movups _.str2, %xmm0
-; SSE2: movaps %xmm0, (%esp)
-; SSE2: movw $120, 28(%esp)
+; SSE2: movw $120
+; SSE2: movl $2021161080
+; SSE2: movl $2021161080
 ; SSE2: movl $2021161080
 ; SSE2: movl $2021161080
 ; SSE2: movl $2021161080
+; SSE2: movl $2021161080
+; SSE2: movl $2021161080
+
+; SSE1: t4:
+; SSE1: movw $120
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
 
 ; NOSSE: t4:
 ; NOSSE: movw $120
@@ -114,8 +154,8 @@ entry:
 ; X86-64: t4:
 ; X86-64: movabsq $8680820740569200760, %rax
 ; X86-64: movq %rax
-; X86-64: movups _.str2(%rip), %xmm0
-; X86-64: movaps %xmm0, -40(%rsp)
+; X86-64: movq %rax
+; X86-64: movq %rax
 ; X86-64: movw $120
 ; X86-64: movl $2021161080
   %tmp1 = alloca [30 x i8]
diff --git a/test/CodeGen/X86/memcpy.ll b/test/CodeGen/X86/memcpy.ll
index 4af93ad3682..72342cbacb4 100644
--- a/test/CodeGen/X86/memcpy.ll
+++ b/test/CodeGen/X86/memcpy.ll
@@ -37,34 +37,26 @@ entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false)
   ret void
 ; LINUX: test3:
-; LINUX-NOT: memcpy
-; LINUX: movups
-; LINUX: movups
-; LINUX: movups
-; LINUX: movups
-; LINUX: movups
-; LINUX: movups
-; LINUX: movups
-; LINUX: movups
+; LINUX: memcpy
 
 ; DARWIN: test3:
 ; DARWIN-NOT: memcpy
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
-; DARWIN: movups
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
+; DARWIN: movq
 }
 
 ; Large constant memcpy's should be inlined when not optimizing for size.
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index 128799d0f56..993583b4a49 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -5,21 +5,7 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
 define fastcc void @t1() nounwind {
 entry:
 ; CHECK: t1:
-; CHECK: pxor %xmm0, %xmm0
-; CHECK: movups %xmm0, 160
-; CHECK: movups %xmm0, 144
-; CHECK: movups %xmm0, 128
-; CHECK: movups %xmm0, 112
-; CHECK: movups %xmm0, 96
-; CHECK: movups %xmm0, 80
-; CHECK: movups %xmm0, 64
-; CHECK: movups %xmm0, 48
-; CHECK: movups %xmm0, 32
-; CHECK: movups %xmm0, 16
-; CHECK: movups %xmm0, 0
-; CHECK: movl $0, 184
-; CHECK: movl $0, 180
-; CHECK: movl $0, 176
+; CHECK: calll _memset
   call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
   unreachable
 }
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index 5a0e893e3b6..3f069b4a1aa 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin   -mcpu=nehalem | grep movups | count 5
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2   | grep movups   | count 5
+; RUN: llc < %s -mtriple=i386-apple-darwin   -mcpu=core2   | grep movl   | count 20
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2   | grep movq   | count 10
 
 define void @bork() nounwind {
 entry:
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index 8f69b111bc3..1b596b58989 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,12 +1,8 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2   | grep movsd  | count 8
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
 
 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
 entry:
-; CHECK: ccosl:
-; CHECK: movaps
-; CHECK: movaps
-; CHECK: movups
-; CHECK: movups
 	%iz = alloca { x86_fp80, x86_fp80 }		; <{ x86_fp80, x86_fp80 }*> [#uses=3]
 	%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1		; <x86_fp80*> [#uses=1]
 	%tmp2 = load x86_fp80* %tmp1, align 16		; <x86_fp80> [#uses=1]
diff --git a/test/CodeGen/X86/tlv-1.ll b/test/CodeGen/X86/tlv-1.ll
index f9be15d1a95..42940f147ed 100644
--- a/test/CodeGen/X86/tlv-1.ll
+++ b/test/CodeGen/X86/tlv-1.ll
@@ -10,12 +10,8 @@ entry:
   unreachable  
   ; CHECK: movq    _c@TLVP(%rip), %rdi
   ; CHECK-NEXT: callq   *(%rdi)
-  ; CHECK-NEXT: pxor	%xmm0, %xmm0
-  ; CHECK-NEXT: movups  %xmm0, 32(%rax)
-  ; CHECK-NEXT: movups  %xmm0, 16(%rax)
-  ; CHECK-NEXT: movups  %xmm0, (%rax)
-  ; CHECK-NEXT: movl $0, 56(%rax)
-  ; CHECK-NEXT: movq $0, 48(%rax)
+  ; CHECK-NEXT: movl    $0, 56(%rax)
+  ; CHECK-NEXT: movq    $0, 48(%rax)
 }
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index 040857786da..6a493c0594d 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0   | FileCheck -check-prefix=I386 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
 
 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
@@ -11,8 +13,13 @@ entry:
 bb:
   %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
   call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
-; CHECK: movabsq $2325069237881678925, %rax
-; CHECK: movups _.str3(%rip), %xmm0
+; I386: calll {{_?}}memcpy
+
+; CORE2: movabsq
+; CORE2: movabsq
+; CORE2: movabsq
+
+; COREI7: movups _.str3
   br label %bb
 
 return:
@@ -21,9 +28,9 @@ return:
 
 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 
-; CHECK: .section
-; CHECK: .align  4
-; CHECK-NEXT: _.str1:
-; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CHECK: .align 4
-; CHECK-NEXT: _.str3:
+; CORE2: .section
+; CORE2: .align  4
+; CORE2-NEXT: _.str1:
+; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
+; CORE2: .align 4
+; CORE2-NEXT: _.str3:
-- 
2.34.1