--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads.
+; Other AVX-capable chips don't have that problem.
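+; As a rough sketch (register choices are illustrative; the pointer argument
+; arrives in %rdi under the SysV calling convention), the split form checked
+; below looks something like:
+;   vmovaps     (%rdi), %xmm0
+;   vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; rather than a single unaligned "vmovups (%rdi), %ymm0".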
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+ ; SANDYB-LABEL: load32bytes
+ ; BTVER2-LABEL: load32bytes
+ ; HASWELL-LABEL: load32bytes
+
+ ; SANDYB: vmovaps
+ ; SANDYB: vinsertf128
+ ; SANDYB: retq
+
+ ; BTVER2: vmovups
+ ; BTVER2: retq
+
+ ; HASWELL: vmovups
+ ; HASWELL: retq
+
+ %A = load <8 x float>* %Ap, align 16
+ ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slower than two 16-byte stores.
+; Other AVX-capable chips don't have that problem.
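+; Likewise for the store, a rough sketch of the split form (registers are
+; illustrative) is:
+;   vextractf128 $1, %ymm0, 16(%rdi)
+;   vmovaps      %xmm0, (%rdi)
+; rather than a single unaligned "vmovups %ymm0, (%rdi)".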
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+ ; SANDYB-LABEL: store32bytes
+ ; BTVER2-LABEL: store32bytes
+ ; HASWELL-LABEL: store32bytes
+
+ ; SANDYB: vextractf128
+ ; SANDYB: vmovaps
+ ; SANDYB: retq
+
+ ; BTVER2: vmovups
+ ; BTVER2: retq
+
+ ; HASWELL: vmovups
+ ; HASWELL: retq
+
+ store <8 x float> %A, <8 x float>* %P, align 16
+ ret void
+}