test/CodeGen/X86/avx-vzeroupper.ll

   1 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
   2
   3 declare <4 x float> @do_sse(<4 x float>)  ; external callee taking and returning a 128-bit vector
   4 declare <8 x float> @do_avx(<8 x float>)  ; external callee taking and returning a 256-bit vector
   5 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone  ; AVX intrinsic: <8 x float> plus i8 selector in, <4 x float> out
   6 @x = common global <4 x float> zeroinitializer, align 16  ; 128-bit global, loaded and stored by @test01
   7 @g = common global <8 x float> zeroinitializer, align 32  ; 256-bit global, loaded by @test02
   8
   9 ;; Basic checking - don't emit any vzeroupper instruction
  10
  11 ; CHECK: _test00
  12 define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
       ; Baseline case: every value in this function is a 128-bit <4 x float>.
       ; The directives below expect no vzeroupper anywhere before the return.
  13 entry:
  14   ; CHECK-NOT: vzeroupper
  15   %add.i = fadd <4 x float> %a, %b
  16   %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  17   ; CHECK: ret
  18   ret <4 x float> %call3
  19 }
  20
  21 ;; Check 256-bit parameter passing
  22
  23 ; CHECK: _test01
  24 define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
       ; The third parameter %c is a 256-bit vector that is returned unchanged.
       ; In between, the function calls @do_sse twice, round-tripping the
       ; 128-bit global @x. The directives expect a vzeroupper directly before
       ; the first call and none between the first and second call.
       ; NOTE(review): presumably the incoming 256-bit argument is what makes
       ; the upper register state count as dirty on entry - confirm against the
       ; vzeroupper insertion pass.
  25 entry:
  26   %tmp = load <4 x float>* @x, align 16
  27   ; CHECK: vzeroupper
  28   ; CHECK-NEXT: callq _do_sse
  29   %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  30   store <4 x float> %call, <4 x float>* @x, align 16
  31   ; CHECK-NOT: vzeroupper
  32   ; CHECK: callq _do_sse
  33   %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  34   store <4 x float> %call2, <4 x float>* @x, align 16
  35   ; CHECK: ret
  36   ret <8 x float> %c
  37 }
  38
  39 ;; Test the pass convergence and also that vzeroupper is only issued when necessary;
  40 ;; for this function it should be issued only once.
  41
  42 ; CHECK: _test02
  43 define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
       ; Loop case: each iteration makes three calls to @do_sse. Only the third
       ; call follows 256-bit work (the load of @g and the vextractf128
       ; intrinsic). The directives expect a vzeroupper directly before that
       ; third call and none ahead of the first two calls.
  44 entry:
  45   %add.i = fadd <4 x float> %a, %b
  46   br label %for.body
  47
  48 for.body:                                         ; preds = %for.body, %entry
  49   ; CHECK: LBB
  50   ; CHECK-NOT: vzeroupper
       ; %i.018 is the loop counter (starts at 0); %c.017 is %add.i on the
       ; first pass and the previous iteration's %call14 afterwards.
  51   %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
  52   %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
  53   ; CHECK: callq _do_sse
  54   %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  55   ; CHECK-NEXT: callq _do_sse
  56   %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
       ; 256-bit load followed by the extract intrinsic with selector 1; its
       ; 128-bit result is the argument of the third call.
  57   %tmp11 = load <8 x float>* @g, align 32
  58   %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  59   ; CHECK: vzeroupper
  60   ; CHECK-NEXT: callq _do_sse
  61   %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
       ; Increment the counter and leave the loop once it reaches 4, so the
       ; body runs four times.
  62   %1 = add nsw i32 %i.018, 1
  63   %exitcond = icmp eq i32 %1, 4
  64   br i1 %exitcond, label %for.end, label %for.body
  65
  66 for.end:                                          ; preds = %for.body
  67   ret <4 x float> %call14
  68 }
  69
  70 ;; Check that we also perform vzeroupper when we return from a function.
  71
  72 ; CHECK: _test03
  73 define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
       ; Return case: the two 128-bit arguments are concatenated into one
       ; 256-bit vector and passed to @do_avx. The directives expect no
       ; vzeroupper before that call and one vzeroupper before the return.
  74 entry:
       ; Mask 0..7 selects all four elements of %a followed by all four of %b.
  75   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  76   ; CHECK-NOT: vzeroupper
  77   ; CHECK: call
  78   %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
       ; Mask 0..3 keeps only the first four elements of the 256-bit result.
  79   %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  80   ; CHECK: vzeroupper
  81   ; CHECK: ret
  82   ret <4 x float> %shuf2
  83 }