test/CodeGen/X86/avx-intel-ocl.ll

   1 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck -check-prefix=X32 %s
   2 ; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=X32 %s
   3 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=X64 %s
   5
   6 declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) ; external callee; invoked with intel_ocl_bicc by testf16_inp and testf16_regs
   7 declare <16 x float> @func_float16(<16 x float>, <16 x float>) ; external callee; invoked with the default calling convention by test_prolog_epilog
   8 declare i32 @func_int(i32, i32) ; external callee; invoked with intel_ocl_bicc by test_int
   9
  10 ; WIN64-LABEL: testf16_inp
  11 ; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
  12 ; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
  13 ; WIN64: leaq    {{.*}}(%rsp), %rcx
  14 ; WIN64: call
  15 ; WIN64: ret
  16
  17 ; X32-LABEL: testf16_inp
  18 ; X32: movl    %eax, (%esp)
  19 ; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
  20 ; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
  21 ; X32: call
  22 ; X32: ret
  23
  24 ; X64-LABEL: testf16_inp
  25 ; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
  26 ; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
  27 ; X64: leaq    {{.*}}(%rsp), %rdi
  28 ; X64: call
  29 ; X64: ret
  30
  31 ; Test calling conventions - input parameters (a <16 x float> value plus a pointer to a stack slot, passed to an intel_ocl_bicc callee).
  32 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
  33   %y = alloca <16 x float>, align 16 ; stack slot whose address is the callee's second argument
  34   %x = fadd <16 x float> %a, %b ; vector value passed as the callee's first argument
  35   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  36   %2 = load <16 x float>, <16 x float>* %y, align 16 ; read the stack slot back after the call
  37   %3 = fadd <16 x float> %2, %1 ; combine the slot contents with the call's return value
  38   ret <16 x float> %3
  39 }
  40
  41 ; test calling conventions - preserved registers
  42
  43 ; preserved ymm6-ymm15
  44 ; WIN64-LABEL: testf16_regs
  45 ; WIN64: call
  46 ; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
  47 ; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
  48 ; WIN64: ret
  49
  50 ; preserved ymm8-ymm15
  51 ; X64-LABEL: testf16_regs
  52 ; X64: call
  53 ; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
  54 ; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
  55 ; X64: ret
  56
  57 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; %b is read again after the call, so its value has to survive the call
  58   %y = alloca <16 x float>, align 16 ; stack slot whose address is the callee's second argument
  59   %x = fadd <16 x float> %a, %b
  60   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  61   %2 = load <16 x float>, <16 x float>* %y, align 16
  62   %3 = fadd <16 x float> %1, %b ; post-call use of %b; presumably the vaddps that the checks above expect to read ymm6-7 (Win64 run) or ymm8-9 (x86_64 Darwin run)
  63   %4 = fadd <16 x float> %2, %3
  64   ret <16 x float> %4
  65 }
  66
  67 ; test calling conventions - prolog and epilog
  68 ; WIN64-LABEL: test_prolog_epilog
  69 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  70 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  71 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  72 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  73 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  74 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  75 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  76 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  77 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  78 ; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  79 ; WIN64: call
  80 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  81 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  82 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  83 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  84 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  85 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  86 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  87 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  88 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  89 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  90
  91 ; X64-LABEL: test_prolog_epilog
  92 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  93 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  94 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  95 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  96 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  97 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  98 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  99 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
 100 ; X64: call
 101 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 102 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 103 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 104 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 105 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 106 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 107 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 108 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 109 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
       ; This function is itself intel_ocl_bicc and forwards both arguments to a
       ; callee that uses the default calling convention. The checks above expect
       ; 32-byte YMM spills before the call and matching reloads after it.
 110   %result = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
 111   ret <16 x float> %result
 112 }
 113
 114 ; test functions with integer parameters
 115 ; pass parameters on stack for 32-bit platform
 116 ; X32-LABEL: test_int
 117 ; X32: movl {{.*}}, 4(%esp)
 118 ; X32: movl {{.*}}, (%esp)
 119 ; X32: call
 120 ; X32: addl {{.*}}, %eax
 121
 122 ; pass parameters in registers for 64-bit platform
 123 ; X64-LABEL: test_int
 124 ; X64: leal {{.*}}, %edi
 125 ; X64: movl {{.*}}, %esi
 126 ; X64: call
 127 ; X64: addl {{.*}}, %eax
 128 define i32 @test_int(i32 %a, i32 %b) nounwind {
       ; Integer-argument case: pass (%a + %b, %a) to an intel_ocl_bicc callee,
       ; then add %b to whatever it returns.
 129   %sum = add i32 %a, %b
 130   %callee.ret = call intel_ocl_bicc i32 @func_int(i32 %sum, i32 %a)
 131   %total = add i32 %callee.ret, %b
 132   ret i32 %total
 133 }
 134
 135 ; WIN64-LABEL: test_float4
 136 ; WIN64-NOT: vzeroupper
 137 ; WIN64: call
 138 ; WIN64-NOT: vzeroupper
 139 ; WIN64: call
 140 ; WIN64: ret
 141
 142 ; X64-LABEL: test_float4
 143 ; X64-NOT: vzeroupper
 144 ; X64: call
 145 ; X64-NOT: vzeroupper
 146 ; X64: call
 147 ; X64: ret
 148
 149 ; X32-LABEL: test_float4
 150 ; X32: vzeroupper
 151 ; X32: call
 152 ; X32: vzeroupper
 153 ; X32: call
 154 ; X32: ret
 155
 156 declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>) ; external callee; tail-called twice with intel_ocl_bicc by test_float4
 157
 158 define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {
 159 entry:
       ; Splits each <8 x float> argument into two <4 x float> halves, calls the
       ; intel_ocl_bicc function func_float4 once per half, and stitches the two
       ; <4 x float> results back into one <8 x float>. The checks above expect no
       ; vzeroupper ahead of either call on the two 64-bit runs and a vzeroupper
       ; ahead of each call on the 32-bit runs.
       ; Low halves: lanes 0-3 of %a, %b and %c.
 160   %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 161   %1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 162   %2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 163   %call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind
       ; Widen the first result to 8 lanes; lanes 4-7 are left undef.
 164   %3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
       ; High halves: lanes 4-7 of %a, %b and %c.
 165   %4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 166   %5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 167   %6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 168   %call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind
       ; Widen the second result the same way.
 169   %7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
       ; Mask indices 0-3 select lanes 0-3 of %3; indices 8-11 select lanes 0-3 of %7.
 170   %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 171   ret <8 x float> %8
 172 }