; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL

; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
; because that is slower than two 16-byte loads.
; Other AVX-capable chips don't have that problem.

define <8 x float> @load32bytes(<8 x float>* %Ap) {
  ; CHECK-LABEL: load32bytes

  ; SANDYB: vmovaps
  ; SANDYB: vinsertf128
  ; SANDYB: retq

  ; BTVER2: vmovups
  ; BTVER2: retq

  ; HASWELL: vmovups
  ; HASWELL: retq

  %A = load <8 x float>* %Ap, align 16
  ret <8 x float> %A
}

; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
; because that is slower than two 16-byte stores.
; Other AVX-capable chips don't have that problem.

define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  ; CHECK-LABEL: store32bytes

  ; SANDYB: vextractf128
  ; SANDYB: vmovaps
  ; SANDYB: retq

  ; BTVER2: vmovups
  ; BTVER2: retq

  ; HASWELL: vmovups
  ; HASWELL: retq

  store <8 x float> %A, <8 x float>* %P, align 16
  ret void
}