Use movups to lower memcpy and memset even on targets where unaligned memory access is not fast (previously this was done only where it is fast, as on corei7).

author Evan Cheng <evan.cheng@apple.com>

Thu, 6 Jan 2011 07:58:36 +0000 (07:58 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Thu, 6 Jan 2011 07:58:36 +0000 (07:58 +0000)
author Evan Cheng <evan.cheng@apple.com>
Thu, 6 Jan 2011 07:58:36 +0000 (07:58 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Thu, 6 Jan 2011 07:58:36 +0000 (07:58 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index f871b5a7701af86b189fdcc7d1caccd064755a42..ddec78bfff30ad92a706fbb7851ba7d2d3329192 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1063,12 +1063,8 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
    // linux.  This is because the stack realignment code can't handle certain
    // cases like PR2962.  This should be removed when PR2962 is fixed.
    const Function *F = MF.getFunction();
-  if (NonScalarIntSafe &&
-      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
+  if (NonScalarIntSafe && !F->hasFnAttr(Attribute::NoImplicitFloat)) {
      if (Size >= 16 &&
-        (Subtarget->isUnalignedMemAccessFast() ||
-         ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16))) &&
          Subtarget->getStackAlignment() >= 16) {
        if (Subtarget->hasSSE2())
          return MVT::v4i32;
diff --git a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll

index 1c7c28c68e9facb133392897d726fc94b846f040..09b7711db5dc4bd731c815b792a5180bab1ade96 100644 (file)
--- a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
+++ b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
  ; rdar://7842028
  
  ; Do not delete partially dead copy instructions.
@@ -9,7 +9,7 @@
  %struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 }
  %struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 }
  
-define void @t(%struct.F* %this) nounwind {
+define void @t(%struct.F* %this) nounwind optsize {
  entry:
  ; CHECK: t:
  ; CHECK: addq $12, %rsi
diff --git a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll

index 6db3ce1f42c0c9c7278257f8f21eefc26d17daf4..f31cdad8614d6d10b782327117eb1d6ef4485f10 100644 (file)
--- a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
+++ b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
@@ -26,7 +26,7 @@ bb:
  ; CHECK: rep;stosl
  
    %tmp5 = bitcast i32* %tmp4 to i8*
-  call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false)
+  call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 124, i32 4, i1 false)
    %tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62
    store i32* null, i32** %tmp6, align 8
    br label %bb1
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll

index 8fe0309421e509e03a60979f639a982592c441b9..66dc0eabac3aba3ac2f4ad99728a42ecc7a0155d 100644 (file)
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -19,8 +19,8 @@ entry:
  }
  
  ; CHECK: movq  ___stack_chk_guard@GOTPCREL(%rip), %rax
-; CHECK: movb  30(%rsp), %dl
-; CHECK: movb  (%rsp), %sil
-; CHECK: movb  %sil, (%rsp)
-; CHECK: movb  %dl, 30(%rsp)
+; CHECK: movb  30(%rsp), %cl
+; CHECK: movb  (%rsp), %dl
+; CHECK: movb  %dl, (%rsp)
+; CHECK: movb  %cl, 30(%rsp)
  ; CHECK: callq ___stack_chk_fail
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll

index 17cd8e868a254c0177d19cc1f856a013a81fbeb5..9078e4ba533f92e8b223b52b06b6aabcc00385f8 100644 (file)
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -1,5 +1,4 @@
  ; RUN: llc < %s -mattr=+sse2      -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
  ; RUN: llc < %s -mattr=-sse       -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
  ; RUN: llc < %s                 -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
  
@@ -15,13 +14,6 @@ entry:
  ; SSE2: movl $0
  ; SSE2: movl $0
  
-; SSE1: t1:
-; SSE1: movaps _.str, %xmm0
-; SSE1: movaps %xmm0
-; SSE1: movb $0
-; SSE1: movl $0
-; SSE1: movl $0
-
  ; NOSSE: t1:
  ; NOSSE: movb $0
  ; NOSSE: movl $0
@@ -51,10 +43,6 @@ entry:
  ; SSE2: movaps (%eax), %xmm0
  ; SSE2: movaps %xmm0, (%eax)
  
-; SSE1: t2:
-; SSE1: movaps (%eax), %xmm0
-; SSE1: movaps %xmm0, (%eax)
-
  ; NOSSE: t2:
  ; NOSSE: movl
  ; NOSSE: movl
@@ -79,22 +67,8 @@ entry:
  define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
  entry:
  ; SSE2: t3:
-; SSE2: movsd (%eax), %xmm0
-; SSE2: movsd 8(%eax), %xmm1
-; SSE2: movsd %xmm1, 8(%eax)
-; SSE2: movsd %xmm0, (%eax)
-
-; SSE1: t3:
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
-; SSE1: movl
+; SSE2: movups (%eax), %xmm0
+; SSE2: movups %xmm0, (%eax)
  
  ; NOSSE: t3:
  ; NOSSE: movl
@@ -109,10 +83,8 @@ entry:
  ; NOSSE: movl
  
  ; X86-64: t3:
-; X86-64: movq (%rsi), %rax
-; X86-64: movq 8(%rsi), %rcx
-; X86-64: movq %rcx, 8(%rdi)
-; X86-64: movq %rax, (%rdi)
+; X86-64: movups (%rsi), %xmm0
+; X86-64: movups %xmm0, (%rdi)
    %tmp2 = bitcast %struct.s0* %a to i8*           ; <i8*> [#uses=1]
    %tmp3 = bitcast %struct.s0* %b to i8*           ; <i8*> [#uses=1]
    tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
@@ -122,24 +94,12 @@ entry:
  define void @t4() nounwind {
  entry:
  ; SSE2: t4:
-; SSE2: movw $120
-; SSE2: movl $2021161080
-; SSE2: movl $2021161080
+; SSE2: movups _.str2, %xmm0
+; SSE2: movaps %xmm0, (%esp)
+; SSE2: movw $120, 28(%esp)
  ; SSE2: movl $2021161080
  ; SSE2: movl $2021161080
  ; SSE2: movl $2021161080
-; SSE2: movl $2021161080
-; SSE2: movl $2021161080
-
-; SSE1: t4:
-; SSE1: movw $120
-; SSE1: movl $2021161080
-; SSE1: movl $2021161080
-; SSE1: movl $2021161080
-; SSE1: movl $2021161080
-; SSE1: movl $2021161080
-; SSE1: movl $2021161080
-; SSE1: movl $2021161080
  
  ; NOSSE: t4:
  ; NOSSE: movw $120
@@ -154,8 +114,8 @@ entry:
  ; X86-64: t4:
  ; X86-64: movabsq $8680820740569200760, %rax
  ; X86-64: movq %rax
-; X86-64: movq %rax
-; X86-64: movq %rax
+; X86-64: movups _.str2(%rip), %xmm0
+; X86-64: movaps %xmm0, -40(%rsp)
  ; X86-64: movw $120
  ; X86-64: movl $2021161080
    %tmp1 = alloca [30 x i8]
diff --git a/test/CodeGen/X86/memcpy.ll b/test/CodeGen/X86/memcpy.ll

index 72342cbacb4f503371a0ce7e9c758f5454a3ac6f..4af93ad3682172156156e443c55bfa04158870eb 100644 (file)
--- a/test/CodeGen/X86/memcpy.ll
+++ b/test/CodeGen/X86/memcpy.ll
@@ -37,26 +37,34 @@ entry:
    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false)
    ret void
  ; LINUX: test3:
-; LINUX: memcpy
+; LINUX-NOT: memcpy
+; LINUX: movups
+; LINUX: movups
+; LINUX: movups
+; LINUX: movups
+; LINUX: movups
+; LINUX: movups
+; LINUX: movups
+; LINUX: movups
  
  ; DARWIN: test3:
  ; DARWIN-NOT: memcpy
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
+; DARWIN: movups
  }
  
  ; Large constant memcpy's should be inlined when not optimizing for size.
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll

index ae6b6e9772b927293d1ea6850186fe7cc1fa8d29..eb5597eb6ff8d21b2fe0244715e0c27c8bc211e1 100644 (file)
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -5,7 +5,21 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
  define fastcc void @t1() nounwind {
  entry:
  ; CHECK: t1:
-; CHECK: calll _memset
+; CHECK: pxor %xmm0, %xmm0
+; CHECK: movups %xmm0, 160
+; CHECK: movups %xmm0, 144
+; CHECK: movups %xmm0, 128
+; CHECK: movups %xmm0, 112
+; CHECK: movups %xmm0, 96
+; CHECK: movups %xmm0, 80
+; CHECK: movups %xmm0, 64
+; CHECK: movups %xmm0, 48
+; CHECK: movups %xmm0, 32
+; CHECK: movups %xmm0, 16
+; CHECK: movups %xmm0, 0
+; CHECK: movl $0, 184
+; CHECK: movl $0, 180
+; CHECK: movl $0, 176
    call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
    unreachable
  }
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll

index 3f069b4a1aa82354dea8eead4f7d68ebe5682d47..5a0e893e3b69242ba59b0c8a9c89cb756540c447 100644 (file)
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,6 +1,5 @@
  ; RUN: llc < %s -mtriple=i386-apple-darwin   -mcpu=nehalem | grep movups | count 5
-; RUN: llc < %s -mtriple=i386-apple-darwin   -mcpu=core2   | grep movl   | count 20
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2   | grep movq   | count 10
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2   | grep movups   | count 5
  
  define void @bork() nounwind {
  entry:
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll

index 1b596b589899917c47cb94e9f3b2a390bb59c96c..8f69b111bc330f745339a04ea4311dc9f4be2a53 100644 (file)
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2   | grep movsd  | count 8
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s
  
  define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
  entry:
+; CHECK: ccosl:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movups
+; CHECK: movups
         %iz = alloca { x86_fp80, x86_fp80 }             ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
         %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1          ; <x86_fp80*> [#uses=1]
         %tmp2 = load x86_fp80* %tmp1, align 16          ; <x86_fp80> [#uses=1]
diff --git a/test/CodeGen/X86/tlv-1.ll b/test/CodeGen/X86/tlv-1.ll

index 42940f147ed8361b897877182d2e61187bf1d280..f9be15d1a952f6d36cd129b9cde5869b78fab68a 100644 (file)
--- a/test/CodeGen/X86/tlv-1.ll
+++ b/test/CodeGen/X86/tlv-1.ll
@@ -10,8 +10,12 @@ entry:
    unreachable  
    ; CHECK: movq    _c@TLVP(%rip), %rdi
    ; CHECK-NEXT: callq   *(%rdi)
-  ; CHECK-NEXT: movl    $0, 56(%rax)
-  ; CHECK-NEXT: movq    $0, 48(%rax)
+  ; CHECK-NEXT: pxor   %xmm0, %xmm0
+  ; CHECK-NEXT: movups  %xmm0, 32(%rax)
+  ; CHECK-NEXT: movups  %xmm0, 16(%rax)
+  ; CHECK-NEXT: movups  %xmm0, (%rax)
+  ; CHECK-NEXT: movl $0, 56(%rax)
+  ; CHECK-NEXT: movq $0, 48(%rax)
  }
  
  declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll

index 6a493c0594de352fc819d6ce57be999a2a6704e6..040857786da7673cfb74fe92360ec5bbcb89e6c5 100644 (file)
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0   | FileCheck -check-prefix=I386 %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s
  
  @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
  @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
@@ -13,13 +11,8 @@ entry:
  bb:
    %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
    call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
-; I386: calll {{_?}}memcpy
-
-; CORE2: movabsq
-; CORE2: movabsq
-; CORE2: movabsq
-
-; COREI7: movups _.str3
+; CHECK: movabsq $2325069237881678925, %rax
+; CHECK: movups _.str3(%rip), %xmm0
    br label %bb
  
  return:
@@ -28,9 +21,9 @@ return:
  
  declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
  
-; CORE2: .section
-; CORE2: .align  4
-; CORE2-NEXT: _.str1:
-; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CORE2: .align 4
-; CORE2-NEXT: _.str3:
+; CHECK: .section
+; CHECK: .align  4
+; CHECK-NEXT: _.str1:
+; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
+; CHECK: .align 4
+; CHECK-NEXT: _.str3:
author	Evan Cheng <evan.cheng@apple.com>
	Thu, 6 Jan 2011 07:58:36 +0000 (07:58 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Thu, 6 Jan 2011 07:58:36 +0000 (07:58 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/2010-04-08-CoalescerBug.ll		patch \| blob \| history
test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll		patch \| blob \| history
test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll		patch \| blob \| history
test/CodeGen/X86/memcpy-2.ll		patch \| blob \| history
test/CodeGen/X86/memcpy.ll		patch \| blob \| history
test/CodeGen/X86/memset-2.ll		patch \| blob \| history
test/CodeGen/X86/memset64-on-x86-32.ll		patch \| blob \| history
test/CodeGen/X86/small-byval-memcpy.ll		patch \| blob \| history
test/CodeGen/X86/tlv-1.ll		patch \| blob \| history
test/CodeGen/X86/unaligned-load.ll		patch \| blob \| history