--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads.
+; Other AVX-capable chips don't have that problem.
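+; As a rough sketch (register choices are illustrative; the pointer argument
+; arrives in %rdi under the SysV calling convention), the split form checked
+; below looks something like:
+;   vmovaps     (%rdi), %xmm0
+;   vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; rather than a single unaligned "vmovups (%rdi), %ymm0".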
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+ ; SANDYB-LABEL: load32bytes
+ ; BTVER2-LABEL: load32bytes
+ ; HASWELL-LABEL: load32bytes
+
+ ; SANDYB: vmovaps
+ ; SANDYB: vinsertf128
+ ; SANDYB: retq
+
+ ; BTVER2: vmovups
+ ; BTVER2: retq
+
+ ; HASWELL: vmovups
+ ; HASWELL: retq
+
+ %A = load <8 x float>* %Ap, align 16
+ ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slower than two 16-byte stores.
+; Other AVX-capable chips don't have that problem.
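+; Likewise for the store, a rough sketch of the split form (registers are
+; illustrative) is:
+;   vextractf128 $1, %ymm0, 16(%rdi)
+;   vmovaps      %xmm0, (%rdi)
+; rather than a single unaligned "vmovups %ymm0, (%rdi)".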
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+ ; SANDYB-LABEL: store32bytes
+ ; BTVER2-LABEL: store32bytes
+ ; HASWELL-LABEL: store32bytes
+
+ ; SANDYB: vextractf128
+ ; SANDYB: vmovaps
+ ; SANDYB: retq
+
+ ; BTVER2: vmovups
+ ; BTVER2: retq
+
+ ; HASWELL: vmovups
+ ; HASWELL: retq
+
+ store <8 x float> %A, <8 x float>* %P, align 16
+ ret void
+}