1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
4 declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
5 declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
; f32 div_scale with both inputs loaded per-thread from %in (elements tid and
; tid+1) and the i1 flag false; the expected source operands are (B, B, A).
7 ; SI-LABEL: @test_div_scale_f32_1:
8 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
9 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
10 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
11 ; SI: buffer_store_dword [[RESULT0]]
13 define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
14 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
15 %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
16 %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
18 %a = load float addrspace(1)* %gep.0, align 4
19 %b = load float addrspace(1)* %gep.1, align 4
21 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
22 %result0 = extractvalue { float, i1 } %result, 0
23 store float %result0, float addrspace(1)* %out, align 4
; f32 div_scale with both inputs loaded per-thread from %in (elements tid and
; tid+1) and the i1 flag true; the expected source operands are (A, B, A).
27 ; SI-LABEL: @test_div_scale_f32_2:
28 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
29 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
30 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
31 ; SI: buffer_store_dword [[RESULT0]]
33 define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
34 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
35 %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
36 %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
38 %a = load float addrspace(1)* %gep.0, align 4
39 %b = load float addrspace(1)* %gep.1, align 4
41 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
42 %result0 = extractvalue { float, i1 } %result, 0
43 store float %result0, float addrspace(1)* %out, align 4
; f64 div_scale with both inputs loaded per-thread from %in (elements tid and
; tid+1) and the i1 flag false; the expected source operands are (B, B, A).
; The %aptr argument is not referenced by the visible body.
47 ; SI-LABEL: @test_div_scale_f64_1:
48 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
49 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
50 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
51 ; SI: buffer_store_dwordx2 [[RESULT0]]
53 define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
54 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
55 %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
56 %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
58 %a = load double addrspace(1)* %gep.0, align 8
59 %b = load double addrspace(1)* %gep.1, align 8
61 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
62 %result0 = extractvalue { double, i1 } %result, 0
63 store double %result0, double addrspace(1)* %out, align 8
; f64 div_scale with both inputs loaded per-thread from %in (elements tid and
; tid+1) and the i1 flag true; the expected source operands are (A, B, A).
; The %aptr argument is not referenced by the visible body.
67 ; SI-LABEL: @test_div_scale_f64_2:
68 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
69 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
70 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
71 ; SI: buffer_store_dwordx2 [[RESULT0]]
73 define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
74 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
75 %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
76 %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
78 %a = load double addrspace(1)* %gep.0, align 8
79 %b = load double addrspace(1)* %gep.1, align 8
81 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
82 %result0 = extractvalue { double, i1 } %result, 0
83 store double %result0, double addrspace(1)* %out, align 8
; f32 div_scale where %a is a kernel argument (expected in an SGPR) and %b is
; loaded per-thread from %in; i1 flag false, expected operands (B, B, A).
87 ; SI-LABEL: @test_div_scale_f32_scalar_num_1:
88 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
89 ; SI-DAG: s_load_dword [[A:s[0-9]+]]
90 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
91 ; SI: buffer_store_dword [[RESULT0]]
93 define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
94 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
95 %gep = getelementptr float addrspace(1)* %in, i32 %tid
97 %b = load float addrspace(1)* %gep, align 4
99 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
100 %result0 = extractvalue { float, i1 } %result, 0
101 store float %result0, float addrspace(1)* %out, align 4
; f32 div_scale where %a is a kernel argument (expected in an SGPR) and %b is
; loaded per-thread from %in; i1 flag true, expected operands (A, B, A).
105 ; SI-LABEL: @test_div_scale_f32_scalar_num_2:
106 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
107 ; SI-DAG: s_load_dword [[A:s[0-9]+]]
108 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
109 ; SI: buffer_store_dword [[RESULT0]]
111 define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
112 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
113 %gep = getelementptr float addrspace(1)* %in, i32 %tid
115 %b = load float addrspace(1)* %gep, align 4
117 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
118 %result0 = extractvalue { float, i1 } %result, 0
119 store float %result0, float addrspace(1)* %out, align 4
; f32 div_scale where %b is a kernel argument (expected in an SGPR) and %a is
; loaded per-thread from %in; i1 flag false, expected operands (B, B, A).
123 ; SI-LABEL: @test_div_scale_f32_scalar_den_1:
124 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
125 ; SI-DAG: s_load_dword [[B:s[0-9]+]]
126 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
127 ; SI: buffer_store_dword [[RESULT0]]
129 define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
130 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
131 %gep = getelementptr float addrspace(1)* %in, i32 %tid
133 %a = load float addrspace(1)* %gep, align 4
135 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
136 %result0 = extractvalue { float, i1 } %result, 0
137 store float %result0, float addrspace(1)* %out, align 4
; f32 div_scale where %b is a kernel argument (expected in an SGPR) and %a is
; loaded per-thread from %in; i1 flag true, expected operands (A, B, A).
141 ; SI-LABEL: @test_div_scale_f32_scalar_den_2:
142 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
143 ; SI-DAG: s_load_dword [[B:s[0-9]+]]
144 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
145 ; SI: buffer_store_dword [[RESULT0]]
147 define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
148 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
149 %gep = getelementptr float addrspace(1)* %in, i32 %tid
151 %a = load float addrspace(1)* %gep, align 4
153 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
154 %result0 = extractvalue { float, i1 } %result, 0
155 store float %result0, float addrspace(1)* %out, align 4
; f64 div_scale where %a is a kernel argument (expected in an SGPR pair) and
; %b is loaded per-thread from %in; i1 flag false, expected operands (B, B, A).
159 ; SI-LABEL: @test_div_scale_f64_scalar_num_1:
160 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
161 ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
162 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
163 ; SI: buffer_store_dwordx2 [[RESULT0]]
165 define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
166 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
167 %gep = getelementptr double addrspace(1)* %in, i32 %tid
169 %b = load double addrspace(1)* %gep, align 8
171 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
172 %result0 = extractvalue { double, i1 } %result, 0
173 store double %result0, double addrspace(1)* %out, align 8
; f64 div_scale where %a is a kernel argument (expected in an SGPR pair) and
; %b is loaded per-thread from %in; i1 flag true, expected operands (A, B, A).
177 ; SI-LABEL: @test_div_scale_f64_scalar_num_2:
178 ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
179 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
180 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
181 ; SI: buffer_store_dwordx2 [[RESULT0]]
183 define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
184 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
185 %gep = getelementptr double addrspace(1)* %in, i32 %tid
187 %b = load double addrspace(1)* %gep, align 8
189 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
190 %result0 = extractvalue { double, i1 } %result, 0
191 store double %result0, double addrspace(1)* %out, align 8
; f64 div_scale where %b is a kernel argument (expected in an SGPR pair) and
; %a is loaded per-thread from %in; i1 flag false, expected operands (B, B, A).
195 ; SI-LABEL: @test_div_scale_f64_scalar_den_1:
196 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
197 ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
198 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
199 ; SI: buffer_store_dwordx2 [[RESULT0]]
201 define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
202 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
203 %gep = getelementptr double addrspace(1)* %in, i32 %tid
205 %a = load double addrspace(1)* %gep, align 8
207 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
208 %result0 = extractvalue { double, i1 } %result, 0
209 store double %result0, double addrspace(1)* %out, align 8
; f64 div_scale where %b is a kernel argument (expected in an SGPR pair) and
; %a is loaded per-thread from %in; i1 flag true, expected operands (A, B, A).
213 ; SI-LABEL: @test_div_scale_f64_scalar_den_2:
214 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
215 ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
216 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
217 ; SI: buffer_store_dwordx2 [[RESULT0]]
219 define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
220 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
221 %gep = getelementptr double addrspace(1)* %in, i32 %tid
223 %a = load double addrspace(1)* %gep, align 8
225 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
226 %result0 = extractvalue { double, i1 } %result, 0
227 store double %result0, double addrspace(1)* %out, align 8
; f32 div_scale with both inputs passed as kernel arguments and the i1 flag
; false. The checks expect %a to be copied into a VGPR first, giving operands
; (B, B, VA).
231 ; SI-LABEL: @test_div_scale_f32_all_scalar_1:
232 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
233 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
234 ; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
235 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
236 ; SI: buffer_store_dword [[RESULT0]]
238 define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
239 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
240 %result0 = extractvalue { float, i1 } %result, 0
241 store float %result0, float addrspace(1)* %out, align 4
; f32 div_scale with both inputs passed as kernel arguments and the i1 flag
; true. The checks expect %b to be copied into a VGPR first, giving operands
; (A, VB, A).
245 ; SI-LABEL: @test_div_scale_f32_all_scalar_2:
246 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
247 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
248 ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
249 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
250 ; SI: buffer_store_dword [[RESULT0]]
252 define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
253 %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
254 %result0 = extractvalue { float, i1 } %result, 0
255 store float %result0, float addrspace(1)* %out, align 4
; f64 div_scale with both inputs passed as kernel arguments and the i1 flag
; false. The checks expect both halves of %a to be copied into a VGPR pair
; first, giving operands (B, B, v[VA_LO:VA_HI]).
259 ; SI-LABEL: @test_div_scale_f64_all_scalar_1:
260 ; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
261 ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
262 ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
263 ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
264 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
265 ; SI: buffer_store_dwordx2 [[RESULT0]]
267 define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
268 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
269 %result0 = extractvalue { double, i1 } %result, 0
270 store double %result0, double addrspace(1)* %out, align 8
; f64 div_scale with both inputs passed as kernel arguments and the i1 flag
; true. The checks expect both halves of %b to be copied into a VGPR pair
; first, giving operands (A, v[VB_LO:VB_HI], A).
274 ; SI-LABEL: @test_div_scale_f64_all_scalar_2:
275 ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
276 ; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
277 ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
278 ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
279 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
280 ; SI: buffer_store_dwordx2 [[RESULT0]]
282 define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
283 %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
284 %result0 = extractvalue { double, i1 } %result, 0
285 store double %result0, double addrspace(1)* %out, align 8