Use movups to lower memcpy and memset even on CPUs where unaligned access is not fast (previously only done where it is fast, as on corei7).

[oota-llvm.git] / test / CodeGen / X86 / small-byval-memcpy.ll
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll

index 1b596b589899917c47cb94e9f3b2a390bb59c96c..8f69b111bc330f745339a04ea4311dc9f4be2a53 100644 (file)
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2   | grep movsd  | count 8
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | FileCheck %s
  
  define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
  entry:
+; CHECK: ccosl:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movups
+; CHECK: movups
         %iz = alloca { x86_fp80, x86_fp80 }             ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
         %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1          ; <x86_fp80*> [#uses=1]
         %tmp2 = load x86_fp80* %tmp1, align 16          ; <x86_fp80> [#uses=1]