From a5e1362f968568d66d76ddcdcff4ab98e203a48c Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Fri, 7 Jan 2011 19:35:30 +0000 Subject: [PATCH] Revert r122955. It seems using movups to lower memcpy can cause massive regression (even on Nehalem) in edge cases. I also didn't see any real performance benefit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123015 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 6 +- test/CodeGen/X86/2010-04-08-CoalescerBug.ll | 4 +- .../X86/2010-06-25-CoalescerSubRegDefDead.ll | 2 +- .../X86/2010-09-17-SideEffectsInChain.ll | 8 +-- test/CodeGen/X86/memcpy-2.ll | 58 ++++++++++++++++--- test/CodeGen/X86/memcpy.ll | 42 ++++++-------- test/CodeGen/X86/memset-2.ll | 16 +---- test/CodeGen/X86/memset64-on-x86-32.ll | 3 +- test/CodeGen/X86/small-byval-memcpy.ll | 8 +-- test/CodeGen/X86/tlv-1.ll | 8 +-- test/CodeGen/X86/unaligned-load.ll | 25 +++++--- 11 files changed, 101 insertions(+), 79 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ddec78bfff3..f871b5a7701 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1063,8 +1063,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = MF.getFunction(); - if (NonScalarIntSafe && !F->hasFnAttr(Attribute::NoImplicitFloat)) { + if (NonScalarIntSafe && + !F->hasFnAttr(Attribute::NoImplicitFloat)) { if (Size >= 16 && + (Subtarget->isUnalignedMemAccessFast() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16))) && Subtarget->getStackAlignment() >= 16) { if (Subtarget->hasSSE2()) return MVT::v4i32; diff --git a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll index 09b7711db5d..1c7c28c68e9 100644 --- a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll +++ b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s ; rdar://7842028 ; Do not delete partially dead copy instructions. @@ -9,7 +9,7 @@ %struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 } %struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 } -define void @t(%struct.F* %this) nounwind optsize { +define void @t(%struct.F* %this) nounwind { entry: ; CHECK: t: ; CHECK: addq $12, %rsi diff --git a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll index f31cdad8614..6db3ce1f42c 100644 --- a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll +++ b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll @@ -26,7 +26,7 @@ bb: ; CHECK: rep;stosl %tmp5 = bitcast i32* %tmp4 to i8* - call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 124, i32 4, i1 false) + call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false) %tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62 store i32* null, i32** %tmp6, align 8 br label %bb1 diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll index 66dc0eabac3..8fe0309421e 100644 --- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll +++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll @@ -19,8 +19,8 @@ entry: } ; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax -; CHECK: movb 30(%rsp), %cl -; CHECK: movb (%rsp), %dl -; CHECK: movb %dl, (%rsp) -; CHECK: movb %cl, 30(%rsp) +; CHECK: movb 30(%rsp), %dl +; CHECK: movb (%rsp), %sil +; CHECK: movb %sil, (%rsp) +; CHECK: movb %dl, 30(%rsp) ; CHECK: callq ___stack_chk_fail diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index 9078e4ba533..17cd8e868a2 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64 @@ -14,6 +15,13 @@ entry: ; SSE2: movl $0 ; SSE2: movl $0 +; SSE1: t1: +; SSE1: movaps _.str, %xmm0 +; SSE1: movaps %xmm0 +; SSE1: movb $0 +; SSE1: movl $0 +; SSE1: movl $0 + ; NOSSE: t1: ; NOSSE: movb $0 ; NOSSE: movl $0 @@ -43,6 +51,10 @@ entry: ; SSE2: movaps (%eax), %xmm0 ; SSE2: movaps %xmm0, (%eax) +; SSE1: t2: +; SSE1: movaps (%eax), %xmm0 +; SSE1: movaps %xmm0, (%eax) + ; NOSSE: t2: ; NOSSE: movl ; NOSSE: movl @@ -67,8 +79,22 @@ entry: define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { entry: ; SSE2: t3: -; SSE2: movups (%eax), %xmm0 -; SSE2: movups %xmm0, (%eax) +; SSE2: movsd (%eax), %xmm0 +; SSE2: movsd 8(%eax), %xmm1 +; SSE2: movsd %xmm1, 8(%eax) +; SSE2: movsd %xmm0, (%eax) + +; SSE1: t3: +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl ; NOSSE: t3: ; NOSSE: movl @@ -83,8 +109,10 @@ entry: ; NOSSE: movl ; X86-64: t3: -; X86-64: movups (%rsi), %xmm0 -; X86-64: movups %xmm0, (%rdi) +; X86-64: movq (%rsi), %rax +; X86-64: movq 8(%rsi), %rcx +; X86-64: movq %rcx, 8(%rdi) +; X86-64: movq %rax, (%rdi) %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1] %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1] tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8) @@ -94,12 +122,24 @@ entry: define void @t4() nounwind { entry: ; SSE2: t4: -; SSE2: movups _.str2, %xmm0 -; SSE2: movaps %xmm0, (%esp) -; SSE2: movw $120, 28(%esp) +; SSE2: movw $120 +; SSE2: movl $2021161080 +; SSE2: movl $2021161080 ; SSE2: movl $2021161080 ; SSE2: movl $2021161080 ; SSE2: movl $2021161080 +; SSE2: movl $2021161080 +; SSE2: movl $2021161080 + +; SSE1: t4: +; SSE1: movw $120 +; SSE1: movl $2021161080 +; SSE1: movl $2021161080 +; SSE1: movl $2021161080 +; SSE1: movl $2021161080 +; SSE1: movl $2021161080 +; SSE1: movl $2021161080 +; SSE1: movl $2021161080 ; NOSSE: t4: ; NOSSE: movw $120 @@ -114,8 +154,8 @@ entry: ; X86-64: t4: ; X86-64: movabsq $8680820740569200760, %rax ; X86-64: movq %rax -; X86-64: movups _.str2(%rip), %xmm0 -; X86-64: movaps %xmm0, -40(%rsp) +; X86-64: movq %rax +; X86-64: movq %rax ; X86-64: movw $120 ; X86-64: movl $2021161080 %tmp1 = alloca [30 x i8] diff --git a/test/CodeGen/X86/memcpy.ll b/test/CodeGen/X86/memcpy.ll index 4af93ad3682..72342cbacb4 100644 --- a/test/CodeGen/X86/memcpy.ll +++ b/test/CodeGen/X86/memcpy.ll @@ -37,34 +37,26 @@ entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false) ret void ; LINUX: test3: -; LINUX-NOT: memcpy -; LINUX: movups -; LINUX: movups -; LINUX: movups -; LINUX: movups -; LINUX: movups -; LINUX: movups -; LINUX: movups -; LINUX: movups +; LINUX: memcpy ; DARWIN: test3: ; DARWIN-NOT: memcpy -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups -; DARWIN: movups +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq } ; Large constant memcpy's should be inlined when not optimizing for size. diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll index 128799d0f56..993583b4a49 100644 --- a/test/CodeGen/X86/memset-2.ll +++ b/test/CodeGen/X86/memset-2.ll @@ -5,21 +5,7 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind define fastcc void @t1() nounwind { entry: ; CHECK: t1: -; CHECK: pxor %xmm0, %xmm0 -; CHECK: movups %xmm0, 160 -; CHECK: movups %xmm0, 144 -; CHECK: movups %xmm0, 128 -; CHECK: movups %xmm0, 112 -; CHECK: movups %xmm0, 96 -; CHECK: movups %xmm0, 80 -; CHECK: movups %xmm0, 64 -; CHECK: movups %xmm0, 48 -; CHECK: movups %xmm0, 32 -; CHECK: movups %xmm0, 16 -; CHECK: movups %xmm0, 0 -; CHECK: movl $0, 184 -; CHECK: movl $0, 180 -; CHECK: movl $0, 176 +; CHECK: calll _memset call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind unreachable } diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll index 5a0e893e3b6..3f069b4a1aa 100644 --- a/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/test/CodeGen/X86/memset64-on-x86-32.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movups | count 5 +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10 define void @bork() nounwind { entry: diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll index 8f69b111bc3..1b596b58989 100644 --- a/test/CodeGen/X86/small-byval-memcpy.ll +++ b/test/CodeGen/X86/small-byval-memcpy.ll @@ -1,12 +1,8 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8 +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind { entry: -; CHECK: ccosl: -; CHECK: movaps -; CHECK: movaps -; CHECK: movups -; CHECK: movups %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3] %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; [#uses=1] %tmp2 = load x86_fp80* %tmp1, align 16 ; [#uses=1] diff --git a/test/CodeGen/X86/tlv-1.ll b/test/CodeGen/X86/tlv-1.ll index f9be15d1a95..42940f147ed 100644 --- a/test/CodeGen/X86/tlv-1.ll +++ b/test/CodeGen/X86/tlv-1.ll @@ -10,12 +10,8 @@ entry: unreachable ; CHECK: movq _c@TLVP(%rip), %rdi ; CHECK-NEXT: callq *(%rdi) - ; CHECK-NEXT: pxor %xmm0, %xmm0 - ; CHECK-NEXT: movups %xmm0, 32(%rax) - ; CHECK-NEXT: movups %xmm0, 16(%rax) - ; CHECK-NEXT: movups %xmm0, (%rax) - ; CHECK-NEXT: movl $0, 56(%rax) - ; CHECK-NEXT: movq $0, 48(%rax) + ; CHECK-NEXT: movl $0, 56(%rax) + ; CHECK-NEXT: movq $0, 48(%rax) } declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll index 040857786da..6a493c0594d 100644 --- a/test/CodeGen/X86/unaligned-load.ll +++ b/test/CodeGen/X86/unaligned-load.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=I386 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8 @@ -11,8 +13,13 @@ entry: bb: %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0 call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1) -; CHECK: movabsq $2325069237881678925, %rax -; CHECK: movups _.str3(%rip), %xmm0 +; I386: calll {{_?}}memcpy + +; CORE2: movabsq +; CORE2: movabsq +; CORE2: movabsq + +; COREI7: movups _.str3 br label %bb return: @@ -21,9 +28,9 @@ return: declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind -; CHECK: .section -; CHECK: .align 4 -; CHECK-NEXT: _.str1: -; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" -; CHECK: .align 4 -; CHECK-NEXT: _.str3: +; CORE2: .section +; CORE2: .align 4 +; CORE2-NEXT: _.str1: +; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" +; CORE2: .align 4 +; CORE2-NEXT: _.str3: -- 2.34.1