Enable FeatureFastUAMem for btver2

author Sanjay Patel <spatel@rotateright.com>

Fri, 28 Nov 2014 18:40:18 +0000 (18:40 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Fri, 28 Nov 2014 18:40:18 +0000 (18:40 +0000)
author Sanjay Patel <spatel@rotateright.com>
Fri, 28 Nov 2014 18:40:18 +0000 (18:40 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Fri, 28 Nov 2014 18:40:18 +0000 (18:40 +0000)
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td

index f553a58921abd297c49745f0311c3cffa2c495c2..8fcc85b4dd6877aa8ded76f7da5009ea80114b56 100644 (file)
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -79,6 +79,10 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                         "Bit testing of memory is slow">;
  def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                         "SHLD instruction is slow">;
+// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that
+// explicit. Also, it seems this would be the default state for most chips
+// going forward, so it would probably be better to negate the logic and
+// match the 32-byte "slow mem" feature below.
  def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                          "IsUAMemFast", "true",
                                          "Fast unaligned memory access">;
@@ -361,8 +365,10 @@ def : ProcessorModel<"btver2", BtVer2Model,
                       [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
                        FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                        FeatureBMI, FeatureF16C, FeatureMOVBE,
-                      FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
-                      FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                      FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
+                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+
+// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
  
  // Bulldozer
  def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll

index 1b596b589899917c47cb94e9f3b2a390bb59c96c..3c03750199cbe077a6e0b125c4c53138ce454f28 100644 (file)
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,20 +1,25 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2   | grep movsd  | count 8
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CHECK --check-prefix=CORE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=CHECK --check-prefix=NEHALEM
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
  
-define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
-entry:
-       %iz = alloca { x86_fp80, x86_fp80 }             ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
-       %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1          ; <x86_fp80*> [#uses=1]
-       %tmp2 = load x86_fp80* %tmp1, align 16          ; <x86_fp80> [#uses=1]
-       %tmp3 = fsub x86_fp80 0xK80000000000000000000, %tmp2            ; <x86_fp80> [#uses=1]
-       %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1         ; <x86_fp80*> [#uses=1]
-       %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0         ; <x86_fp80*> [#uses=1]
-       %tmp6 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0          ; <x86_fp80*> [#uses=1]
-       %tmp7 = load x86_fp80* %tmp6, align 16          ; <x86_fp80> [#uses=1]
-       store x86_fp80 %tmp3, x86_fp80* %real, align 16
-       store x86_fp80 %tmp7, x86_fp80* %tmp4, align 16
-       call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %iz ) nounwind 
-       ret void
-}
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) {
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 1, i1 false)
+  ret void
+
+  ; CHECK-LABEL: copy16bytes
+  ; CORE2: movq
+  ; CORE2-NEXT: movq
+  ; CORE2-NEXT: movq
+  ; CORE2-NEXT: movq
+  ; CORE2-NEXT: retq
  
-declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval align 4 ) nounwind 
+  ; NEHALEM: movups
+  ; NEHALEM-NEXT: movups
+  ; NEHALEM-NEXT: retq
+
+  ; BTVER2: movups
+  ; BTVER2-NEXT: movups
+  ; BTVER2-NEXT: retq
+}
author	Sanjay Patel <spatel@rotateright.com>
	Fri, 28 Nov 2014 18:40:18 +0000 (18:40 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Fri, 28 Nov 2014 18:40:18 +0000 (18:40 +0000)
lib/Target/X86/X86.td		patch \| blob \| history
test/CodeGen/X86/small-byval-memcpy.ll		patch \| blob \| history