1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
4 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
5 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
7 ; Run with devices with different unaligned load restrictions.
9 ; TODO: Vector element tests
10 ; TODO: Non-zero base offset for load and store combinations
11 ; TODO: Same base addrspacecasted
; Two constant i8 stores to adjacent global bytes; the store to %out carries
; an explicit align 2. The checks expect two separate byte stores.
; NOTE(review): 456 does not fit in i8; presumably the IR parser truncates
; it - confirm this is intended.
14 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
15 ; GCN: buffer_store_byte
16 ; GCN: buffer_store_byte
18 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
19 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
21 store i8 123, i8 addrspace(1)* %out.gep.1
22 store i8 456, i8 addrspace(1)* %out, align 2
; Same pair of constant i8 stores as the align-2 variant, but neither store
; carries an explicit alignment. The checks expect two byte stores.
; NOTE(review): 456 does not fit in i8; presumably the IR parser truncates
; it - confirm this is intended.
26 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
27 ; GCN: buffer_store_byte
28 ; GCN: buffer_store_byte
30 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
31 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
33 store i8 123, i8 addrspace(1)* %out.gep.1
34 store i8 456, i8 addrspace(1)* %out
; Two constant i16 stores to adjacent global elements; the store to %out is
; marked align 4. The check expects a dword store.
38 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
39 ; GCN: buffer_store_dword v
40 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
41 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
43 store i16 123, i16 addrspace(1)* %out.gep.1
44 store i16 456, i16 addrspace(1)* %out, align 4
; Variant of the i16 pair where both stored constants are zero; the store to
; %out is marked align 4. The check expects a dword store.
48 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
49 ; GCN: buffer_store_dword v
50 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
51 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
53 store i16 0, i16 addrspace(1)* %out.gep.1
54 store i16 0, i16 addrspace(1)* %out, align 4
; Two constant i16 stores to adjacent global elements with no explicit
; alignment on either store. The checks expect two separate short stores.
58 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
59 ; GCN: buffer_store_short
60 ; GCN: buffer_store_short
62 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
63 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
65 store i16 123, i16 addrspace(1)* %out.gep.1
66 store i16 456, i16 addrspace(1)* %out
; Two constant i32 stores: 123 to element 1, then 456 to element 0. The checks
; expect 0x1c8 (456) in the low register and 0x7b (123) in the high register
; of a single dwordx2 store.
70 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
71 ; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
72 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
73 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
74 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
75 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
76 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
77 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
79 store i32 123, i32 addrspace(1)* %out.gep.1
80 store i32 456, i32 addrspace(1)* %out
; Mixed-type pair: float 1.0 is stored to element 1 through a bitcast pointer,
; then i32 456 to element 0. The check expects a dwordx2 store.
84 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
85 ; GCN: buffer_store_dwordx2
86 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
87 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
88 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
89 store float 1.0, float addrspace(1)* %out.gep.1.bc
90 store i32 456, i32 addrspace(1)* %out
; Mixed-type pair: i32 123 is stored to element 1 through a bitcast pointer,
; then float 4.0 to element 0. The checks expect 4.0 in the low register and
; 0x7b (123) in the high register of a single dwordx2 store.
94 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
95 ; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
96 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
97 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
98 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
99 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
100 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
101 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
102 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
103 store i32 123, i32 addrspace(1)* %out.gep.1.bc
104 store float 4.0, float addrspace(1)* %out
; Four constant i32 stores to elements 1, 2, 3 and finally 0. The checks
; expect a single dwordx4 store whose lowest register holds 0x4d2 (1234,
; element 0) and whose highest register holds 0x14d (333, element 3).
108 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
109 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
110 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
111 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
112 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
113 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
114 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
115 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
116 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
117 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
119 store i32 123, i32 addrspace(1)* %out.gep.1
120 store i32 456, i32 addrspace(1)* %out.gep.2
121 store i32 333, i32 addrspace(1)* %out.gep.3
122 store i32 1234, i32 addrspace(1)* %out
; Four constant float stores issued in ascending address order (elements 0
; through 3). The check expects a dwordx4 store.
126 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
127 ; GCN: buffer_store_dwordx4
128 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
129 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
130 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
131 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
133 store float 8.0, float addrspace(1)* %out
134 store float 1.0, float addrspace(1)* %out.gep.1
135 store float 2.0, float addrspace(1)* %out.gep.2
136 store float 4.0, float addrspace(1)* %out.gep.3
140 ; First store is out of order.
; Same four float constants as the in-order variant, but the store to
; element 0 is issued last, after elements 1 through 3. The check still
; expects a dwordx4 store.
141 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
142 ; GCN: buffer_store_dwordx4
143 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
144 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
145 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
146 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
148 store float 1.0, float addrspace(1)* %out.gep.1
149 store float 2.0, float addrspace(1)* %out.gep.2
150 store float 4.0, float addrspace(1)* %out.gep.3
151 store float 8.0, float addrspace(1)* %out
155 ; FIXME: Should be able to merge this
; Four adjacent 32-bit constant stores alternating between i32 (elements 1
; and 3, through bitcast pointers) and float (elements 2 and 0). The NOAA
; prefix expects four dword stores; the AA prefix expects one dwordx2 store
; followed by two dword stores.
156 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
157 ; GCN-NOAA: buffer_store_dword v
158 ; GCN-NOAA: buffer_store_dword v
159 ; GCN-NOAA: buffer_store_dword v
160 ; GCN-NOAA: buffer_store_dword v
162 ; GCN-AA: buffer_store_dwordx2
163 ; GCN-AA: buffer_store_dword v
164 ; GCN-AA: buffer_store_dword v
167 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
168 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
169 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
170 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
172 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
173 %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
175 store i32 11, i32 addrspace(1)* %out.gep.1.bc
176 store float 2.0, float addrspace(1)* %out.gep.2
177 store i32 17, i32 addrspace(1)* %out.gep.3.bc
178 store float 8.0, float addrspace(1)* %out
; Three constant i32 stores to elements 1, 2 and finally 0. The checks expect
; one dwordx2 store and one dword store (in either order) with no further
; buffer_store_dword match after them.
182 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
183 ; SI-DAG: buffer_store_dwordx2
184 ; SI-DAG: buffer_store_dword
185 ; SI-NOT: buffer_store_dword
187 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
188 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
189 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
191 store i32 123, i32 addrspace(1)* %out.gep.1
192 store i32 456, i32 addrspace(1)* %out.gep.2
193 store i32 1234, i32 addrspace(1)* %out
; Two constant i64 stores to adjacent global elements. The active checks
; expect two dwordx2 stores. The dwordx4 line uses the XGCN prefix, which no
; RUN line enables, so it is inactive; presumably it records the desired
; merged form - confirm.
197 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
198 ; XGCN: buffer_store_dwordx4
199 ; GCN: buffer_store_dwordx2
200 ; GCN: buffer_store_dwordx2
201 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
202 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
204 store i64 123, i64 addrspace(1)* %out.gep.1
205 store i64 456, i64 addrspace(1)* %out
; Four constant i64 stores to elements 1, 2, 3 and finally 0. The active
; checks expect four dwordx2 stores. The two dwordx4 lines use the XGCN
; prefix, which no RUN line enables, so they are inactive; presumably they
; record the desired merged form - confirm.
209 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
210 ; XGCN: buffer_store_dwordx4
211 ; XGCN: buffer_store_dwordx4
213 ; GCN: buffer_store_dwordx2
214 ; GCN: buffer_store_dwordx2
215 ; GCN: buffer_store_dwordx2
216 ; GCN: buffer_store_dwordx2
217 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
218 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
219 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
220 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
222 store i64 123, i64 addrspace(1)* %out.gep.1
223 store i64 456, i64 addrspace(1)* %out.gep.2
224 store i64 333, i64 addrspace(1)* %out.gep.3
225 store i64 1234, i64 addrspace(1)* %out
; Copies two adjacent i32 elements from %in to the same positions in %out.
; The checks expect one dwordx2 load whose result register pair is then
; written by one dwordx2 store.
229 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
230 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
231 ; GCN: buffer_store_dwordx2 [[LOAD]]
232 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
233 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
234 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
236 %lo = load i32, i32 addrspace(1)* %in
237 %hi = load i32, i32 addrspace(1)* %in.gep.1
239 store i32 %lo, i32 addrspace(1)* %out
240 store i32 %hi, i32 addrspace(1)* %out.gep.1
; Copies elements 2 and 3 of %in to elements 2 and 3 of %out. The checks
; expect a dwordx2 load and a dwordx2 store of the same register pair, both
; with an immediate offset of 8 bytes.
244 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
245 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
246 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
247 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
248 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
249 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
251 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
252 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
253 %lo = load i32, i32 addrspace(1)* %in.gep.0
254 %hi = load i32, i32 addrspace(1)* %in.gep.1
256 store i32 %lo, i32 addrspace(1)* %out.gep.0
257 store i32 %hi, i32 addrspace(1)* %out.gep.1
; Loads two adjacent i32 elements and stores them swapped (%hi to element 0,
; %lo to element 1). The checks expect two separate dword loads and two
; separate dword stores.
261 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
262 ; GCN: buffer_load_dword v
263 ; GCN: buffer_load_dword v
264 ; GCN: buffer_store_dword v
265 ; GCN: buffer_store_dword v
266 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
267 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
268 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
270 %lo = load i32, i32 addrspace(1)* %in
271 %hi = load i32, i32 addrspace(1)* %in.gep.1
273 store i32 %hi, i32 addrspace(1)* %out
274 store i32 %lo, i32 addrspace(1)* %out.gep.1
; Copies four adjacent i32 elements from %in to the same positions in %out.
; The checks expect one dwordx4 load whose result registers are written by
; one dwordx4 store.
278 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
279 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
280 ; GCN: buffer_store_dwordx4 [[LOAD]]
281 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
282 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
283 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
284 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
285 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
286 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
287 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
289 %x = load i32, i32 addrspace(1)* %in
290 %y = load i32, i32 addrspace(1)* %in.gep.1
291 %z = load i32, i32 addrspace(1)* %in.gep.2
292 %w = load i32, i32 addrspace(1)* %in.gep.3
294 store i32 %x, i32 addrspace(1)* %out
295 store i32 %y, i32 addrspace(1)* %out.gep.1
296 store i32 %z, i32 addrspace(1)* %out.gep.2
297 store i32 %w, i32 addrspace(1)* %out.gep.3
; Copies three adjacent i32 elements from %in to %out. The checks expect the
; loads as one dwordx2 plus one dword, and the stores as one dword plus one
; dwordx2, matched in any order.
301 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
302 ; SI-DAG: buffer_load_dwordx2
303 ; SI-DAG: buffer_load_dword v
305 ; SI-DAG: buffer_store_dword v
306 ; SI-DAG: buffer_store_dwordx2 v
308 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
309 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
310 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
311 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
312 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
314 %x = load i32, i32 addrspace(1)* %in
315 %y = load i32, i32 addrspace(1)* %in.gep.1
316 %z = load i32, i32 addrspace(1)* %in.gep.2
318 store i32 %x, i32 addrspace(1)* %out
319 store i32 %y, i32 addrspace(1)* %out.gep.1
320 store i32 %z, i32 addrspace(1)* %out.gep.2
; Float counterpart of the four-element copy: four adjacent floats are loaded
; from %in and stored to the same positions in %out. The checks expect one
; dwordx4 load whose result registers are written by one dwordx4 store.
324 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
325 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
326 ; GCN: buffer_store_dwordx4 [[LOAD]]
327 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
328 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
329 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
330 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
331 %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
332 %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
333 %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
335 %x = load float, float addrspace(1)* %in
336 %y = load float, float addrspace(1)* %in.gep.1
337 %z = load float, float addrspace(1)* %in.gep.2
338 %w = load float, float addrspace(1)* %in.gep.3
340 store float %x, float addrspace(1)* %out
341 store float %y, float addrspace(1)* %out.gep.1
342 store float %z, float addrspace(1)* %out.gep.2
343 store float %w, float addrspace(1)* %out.gep.3
; Copies elements 11 through 14 of %in to elements 7 through 10 of %out. The
; checks expect a dwordx4 load at byte offset 44 and a dwordx4 store of the
; same registers at byte offset 28.
347 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
348 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
349 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
350 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
351 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
352 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
353 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
354 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
355 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
356 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
357 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
358 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
360 %x = load i32, i32 addrspace(1)* %in.gep.0
361 %y = load i32, i32 addrspace(1)* %in.gep.1
362 %z = load i32, i32 addrspace(1)* %in.gep.2
363 %w = load i32, i32 addrspace(1)* %in.gep.3
365 store i32 %x, i32 addrspace(1)* %out.gep.0
366 store i32 %y, i32 addrspace(1)* %out.gep.1
367 store i32 %z, i32 addrspace(1)* %out.gep.2
368 store i32 %w, i32 addrspace(1)* %out.gep.3
; Loads four adjacent i32 elements, calls the local barrier intrinsic, then
; stores each value to its matching position in descending address order
; (element 3 first, element 0 last). The checks expect one dwordx4 load whose
; result registers are written by one dwordx4 store.
372 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
373 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
375 ; GCN: buffer_store_dwordx4 [[LOAD]]
376 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
377 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
378 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
379 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
380 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
381 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
382 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
384 %x = load i32, i32 addrspace(1)* %in
385 %y = load i32, i32 addrspace(1)* %in.gep.1
386 %z = load i32, i32 addrspace(1)* %in.gep.2
387 %w = load i32, i32 addrspace(1)* %in.gep.3
389 ; Make sure the barrier doesn't stop this
390 tail call void @llvm.AMDGPU.barrier.local() #1
392 store i32 %w, i32 addrspace(1)* %out.gep.3
393 store i32 %z, i32 addrspace(1)* %out.gep.2
394 store i32 %y, i32 addrspace(1)* %out.gep.1
395 store i32 %x, i32 addrspace(1)* %out
; Loads four adjacent i32 elements, calls the local barrier intrinsic, then
; stores them in reversed element order (%w to element 0 ... %x to element 3).
; The checks expect four separate dword loads and four separate dword stores.
400 ; TODO: Re-packing of loaded register required. Maybe an IR pass
403 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
404 ; GCN: buffer_load_dword v
405 ; GCN: buffer_load_dword v
406 ; GCN: buffer_load_dword v
407 ; GCN: buffer_load_dword v
409 ; GCN: buffer_store_dword v
410 ; GCN: buffer_store_dword v
411 ; GCN: buffer_store_dword v
412 ; GCN: buffer_store_dword v
413 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
414 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
415 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
416 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
417 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
418 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
419 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
421 %x = load i32, i32 addrspace(1)* %in
422 %y = load i32, i32 addrspace(1)* %in.gep.1
423 %z = load i32, i32 addrspace(1)* %in.gep.2
424 %w = load i32, i32 addrspace(1)* %in.gep.3
426 ; Make sure the barrier doesn't stop this
427 tail call void @llvm.AMDGPU.barrier.local() #1
429 store i32 %w, i32 addrspace(1)* %out
430 store i32 %z, i32 addrspace(1)* %out.gep.1
431 store i32 %y, i32 addrspace(1)* %out.gep.2
432 store i32 %x, i32 addrspace(1)* %out.gep.3
; Copies four adjacent i8 elements from %in to %out; the first load and the
; first store are marked align 4. The checks expect a single dword load whose
; result register is written by a single dword store.
437 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
438 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
439 ; GCN: buffer_store_dword [[LOAD]]
441 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
442 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
443 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
444 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
445 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
446 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
447 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
449 %x = load i8, i8 addrspace(1)* %in, align 4
450 %y = load i8, i8 addrspace(1)* %in.gep.1
451 %z = load i8, i8 addrspace(1)* %in.gep.2
452 %w = load i8, i8 addrspace(1)* %in.gep.3
454 store i8 %x, i8 addrspace(1)* %out, align 4
455 store i8 %y, i8 addrspace(1)* %out.gep.1
456 store i8 %z, i8 addrspace(1)* %out.gep.2
457 store i8 %w, i8 addrspace(1)* %out.gep.3
; Same four-byte copy as the align-4 variant, but no load or store carries an
; explicit alignment. The checks expect four unsigned byte loads followed by
; four byte stores.
461 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
462 ; GCN: buffer_load_ubyte
463 ; GCN: buffer_load_ubyte
464 ; GCN: buffer_load_ubyte
465 ; GCN: buffer_load_ubyte
466 ; GCN: buffer_store_byte
467 ; GCN: buffer_store_byte
468 ; GCN: buffer_store_byte
469 ; GCN: buffer_store_byte
471 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
472 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
473 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
474 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
475 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
476 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
477 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
479 %x = load i8, i8 addrspace(1)* %in
480 %y = load i8, i8 addrspace(1)* %in.gep.1
481 %z = load i8, i8 addrspace(1)* %in.gep.2
482 %w = load i8, i8 addrspace(1)* %in.gep.3
484 store i8 %x, i8 addrspace(1)* %out
485 store i8 %y, i8 addrspace(1)* %out.gep.1
486 store i8 %z, i8 addrspace(1)* %out.gep.2
487 store i8 %w, i8 addrspace(1)* %out.gep.3
491 ; This works once AA is enabled on the subtarget
; Loads one <4 x i32> vector, extracts its four elements and stores them to
; four adjacent i32 slots of %out. All runs expect a dwordx4 load; the NOAA
; prefix expects four dword stores, while the AA prefix expects one dwordx4
; store of the loaded registers.
492 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
493 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
495 ; GCN-NOAA: buffer_store_dword v
496 ; GCN-NOAA: buffer_store_dword v
497 ; GCN-NOAA: buffer_store_dword v
498 ; GCN-NOAA: buffer_store_dword v
500 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
503 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
504 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
505 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
506 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
507 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
509 %x = extractelement <4 x i32> %vec, i32 0
510 %y = extractelement <4 x i32> %vec, i32 1
511 %z = extractelement <4 x i32> %vec, i32 2
512 %w = extractelement <4 x i32> %vec, i32 3
514 store i32 %x, i32 addrspace(1)* %out
515 store i32 %y, i32 addrspace(1)* %out.gep.1
516 store i32 %z, i32 addrspace(1)* %out.gep.2
517 store i32 %w, i32 addrspace(1)* %out.gep.3
; Two constant i8 stores to adjacent bytes through an addrspace(3) pointer;
; the store to %out carries an explicit align 2.
; NOTE(review): 456 does not fit in i8; presumably the IR parser truncates
; it - confirm this is intended.
521 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
525 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
526 %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
528 store i8 123, i8 addrspace(3)* %out.gep.1
529 store i8 456, i8 addrspace(3)* %out, align 2
; Two constant i32 stores through an addrspace(3) pointer: 123 to element 1,
; then 456 to element 0. The checks expect a single ds_write2_b32 writing
; 0x1c8 (456) and 0x7b (123) with offset1:1.
533 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
534 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
535 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
536 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
537 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
538 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
540 store i32 123, i32 addrspace(3)* %out.gep.1
541 store i32 456, i32 addrspace(3)* %out
; Four constant i32 stores through an addrspace(3) pointer, to elements 1, 2,
; 3 and finally 0.
545 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
550 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
551 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
552 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
553 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
555 store i32 123, i32 addrspace(3)* %out.gep.1
556 store i32 456, i32 addrspace(3)* %out.gep.2
557 store i32 333, i32 addrspace(3)* %out.gep.3
558 store i32 1234, i32 addrspace(3)* %out
; Five constant i32 stores to consecutive elements 0 through 4, each marked
; align 4. The checks expect a dwordx4 store covering the first four values
; (9 in the lowest register, -12 in the highest) followed by a separate dword
; store of 11.
562 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
563 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
564 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
565 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
566 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
567 ; GCN: buffer_store_dword v[[HI]]
568 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
569 store i32 9, i32 addrspace(1)* %out, align 4
570 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
571 store i32 12, i32 addrspace(1)* %idx1, align 4
572 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
573 store i32 16, i32 addrspace(1)* %idx2, align 4
574 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
575 store i32 -12, i32 addrspace(1)* %idx3, align 4
576 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
577 store i32 11, i32 addrspace(1)* %idx4, align 4
; Six constant i32 stores to consecutive elements 0 through 5, each marked
; align 4. The checks expect a dwordx4 store followed by a dwordx2 store.
581 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
582 ; GCN: buffer_store_dwordx4
583 ; GCN: buffer_store_dwordx2
584 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
585 store i32 13, i32 addrspace(1)* %out, align 4
586 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
587 store i32 15, i32 addrspace(1)* %idx1, align 4
588 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
589 store i32 62, i32 addrspace(1)* %idx2, align 4
590 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
591 store i32 63, i32 addrspace(1)* %idx3, align 4
592 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
593 store i32 11, i32 addrspace(1)* %idx4, align 4
594 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
595 store i32 123, i32 addrspace(1)* %idx5, align 4
; Seven constant i32 stores to consecutive elements 0 through 6, each marked
; align 4. The checks expect a dwordx4 store, then a dwordx2 store, then a
; single dword store.
599 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
600 ; GCN: buffer_store_dwordx4
601 ; GCN: buffer_store_dwordx2
602 ; GCN: buffer_store_dword v
603 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
604 store i32 34, i32 addrspace(1)* %out, align 4
605 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
606 store i32 999, i32 addrspace(1)* %idx1, align 4
607 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
608 store i32 65, i32 addrspace(1)* %idx2, align 4
609 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
610 store i32 33, i32 addrspace(1)* %idx3, align 4
611 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
612 store i32 98, i32 addrspace(1)* %idx4, align 4
613 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
614 store i32 91, i32 addrspace(1)* %idx5, align 4
615 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
616 store i32 212, i32 addrspace(1)* %idx6, align 4
620 ; FIXME: This should do 2 dwordx4 stores
; Eight constant i32 stores to consecutive elements 0 through 7, each marked
; align 4. The NOAA prefix currently expects eight separate dword stores; the
; AA prefix expects one dwordx4 store followed by two dwordx2 stores.
621 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
623 ; GCN-NOAA: buffer_store_dword v
624 ; GCN-NOAA: buffer_store_dword v
625 ; GCN-NOAA: buffer_store_dword v
626 ; GCN-NOAA: buffer_store_dword v
627 ; GCN-NOAA: buffer_store_dword v
628 ; GCN-NOAA: buffer_store_dword v
629 ; GCN-NOAA: buffer_store_dword v
630 ; GCN-NOAA: buffer_store_dword v
632 ; GCN-AA: buffer_store_dwordx4
633 ; GCN-AA: buffer_store_dwordx2
634 ; GCN-AA: buffer_store_dwordx2
637 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
638 store i32 34, i32 addrspace(1)* %out, align 4
639 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
640 store i32 999, i32 addrspace(1)* %idx1, align 4
641 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
642 store i32 65, i32 addrspace(1)* %idx2, align 4
643 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
644 store i32 33, i32 addrspace(1)* %idx3, align 4
645 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
646 store i32 98, i32 addrspace(1)* %idx4, align 4
647 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
648 store i32 91, i32 addrspace(1)* %idx5, align 4
649 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
650 store i32 212, i32 addrspace(1)* %idx6, align 4
651 %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
652 store i32 999, i32 addrspace(1)* %idx7, align 4
; Barrier intrinsic called between the loads and stores of the inverse and
; four-element shuffle tests.
656 declare void @llvm.AMDGPU.barrier.local() #1
; #0 is the attribute group referenced by the test functions that carry an
; attribute; #1 is the group used on the barrier declaration and its calls.
658 attributes #0 = { nounwind }
659 attributes #1 = { noduplicate nounwind }