1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
4 ; Run with devices with different unaligned load restrictions.
6 ; TODO: Vector element tests
7 ; TODO: Non-zero base offset for load and store combinations
8 ; TODO: Same base addrspacecasted
; Two constant i8 stores to adjacent global addresses: byte 1 first, then
; byte 0 with an explicit `align 2`. The checks expect two separate
; buffer_store_byte instructions.
; The second constant is 200 (0xc8): the 456 (0x1c8) used by the wider tests
; in this file does not fit in an i8, and 200 is its low 8 bits.
11 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
12 ; GCN: buffer_store_byte
13 ; GCN: buffer_store_byte
15 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
16 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
18 store i8 123, i8 addrspace(1)* %out.gep.1
19 store i8 200, i8 addrspace(1)* %out, align 2
; Same as the two-constant i8 test, but neither store carries an explicit
; alignment. The checks expect two separate buffer_store_byte instructions.
; The second constant is 200 (0xc8): the 456 (0x1c8) used by the wider tests
; in this file does not fit in an i8, and 200 is its low 8 bits.
23 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
24 ; GCN: buffer_store_byte
25 ; GCN: buffer_store_byte
27 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
28 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
30 store i8 123, i8 addrspace(1)* %out.gep.1
31 store i8 200, i8 addrspace(1)* %out
; Two constant i16 stores to adjacent global elements: element 1 first, then
; element 0 with an explicit `align 4`. The check expects a single
; buffer_store_dword, i.e. both halves written by one 32-bit store.
35 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
36 ; GCN: buffer_store_dword v
37 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
38 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
40 store i16 123, i16 addrspace(1)* %out.gep.1
; The `align 4` is on the store to the base element.
41 store i16 456, i16 addrspace(1)* %out, align 4
; Variant of the two-constant i16 test where both stored values are zero.
; The check again expects a single buffer_store_dword.
45 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
46 ; GCN: buffer_store_dword v
47 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
48 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
50 store i16 0, i16 addrspace(1)* %out.gep.1
; The `align 4` is on the store to the base element.
51 store i16 0, i16 addrspace(1)* %out, align 4
; Two constant i16 stores to adjacent global elements with no explicit
; alignment on either store. The checks expect two separate
; buffer_store_short instructions.
55 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
56 ; GCN: buffer_store_short
57 ; GCN: buffer_store_short
59 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
60 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
62 store i16 123, i16 addrspace(1)* %out.gep.1
63 store i16 456, i16 addrspace(1)* %out
; Two constant i32 stores to adjacent global elements (element 1, then
; element 0). The checks expect 0x1c8 (456) and 0x7b (123) to be materialized
; in scalar registers, copied into vector registers LO and HI, and written by
; one buffer_store_dwordx2 of the register range v[LO:HI].
67 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
68 ; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
69 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
70 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
71 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
72 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
73 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
74 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
; 123 goes to element 1 (the HI half), 456 to element 0 (the LO half).
76 store i32 123, i32 addrspace(1)* %out.gep.1
77 store i32 456, i32 addrspace(1)* %out
; Mixed-type constants through an i32 base pointer: element 1 is written as
; float 1.0 via a bitcast pointer, element 0 as i32 456. The check expects
; one buffer_store_dwordx2.
81 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
82 ; GCN: buffer_store_dwordx2
83 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
84 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
; Reinterpret the address of element 1 as a float pointer.
85 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
86 store float 1.0, float addrspace(1)* %out.gep.1.bc
87 store i32 456, i32 addrspace(1)* %out
; Mixed-type constants through a float base pointer: element 1 is written as
; i32 123 via a bitcast pointer, element 0 as float 4.0. The check expects
; one buffer_store_dwordx2.
91 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
92 ; GCN: buffer_store_dwordx2
93 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
94 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
; Reinterpret the address of element 1 as an i32 pointer.
95 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
96 store i32 123, i32 addrspace(1)* %out.gep.1.bc
97 store float 4.0, float addrspace(1)* %out
; Four constant i32 stores covering elements 0..3 of %out, issued in the
; order 1, 2, 3, 0. The check expects one buffer_store_dwordx4.
101 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
102 ; GCN: buffer_store_dwordx4
103 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
104 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
105 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
106 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
108 store i32 123, i32 addrspace(1)* %out.gep.1
109 store i32 456, i32 addrspace(1)* %out.gep.2
110 store i32 333, i32 addrspace(1)* %out.gep.3
; The base element is stored last.
111 store i32 1234, i32 addrspace(1)* %out
; Four constant float stores covering elements 0..3 of %out, issued in
; ascending address order. The active checks expect two buffer_store_dword
; instructions followed by one buffer_store_dwordx2. The X-prefixed dwordx4
; line uses a prefix that is not enabled by the FileCheck invocations at the
; top of the file, so it is inactive.
115 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
116 ; XGCN: buffer_store_dwordx4
117 ; GCN: buffer_store_dword v
118 ; GCN: buffer_store_dword v
119 ; GCN: buffer_store_dwordx2 v
120 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
121 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
122 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
123 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
; Stores go to elements 0, 1, 2, 3 in that order.
125 store float 8.0, float addrspace(1)* %out
126 store float 1.0, float addrspace(1)* %out.gep.1
127 store float 2.0, float addrspace(1)* %out.gep.2
128 store float 4.0, float addrspace(1)* %out.gep.3
132 ; The first store is out of order. Because of the order of combines,
133 ; merging the consecutive stores fails: only some of the stores have been
134 ; replaced with integer constant stores, and the rest won't merge because
135 ; the types are different.
; Four constant float stores covering elements 0..3 of %out, issued in the
; order 1, 2, 3, 0. The active checks expect four separate buffer_store_dword
; instructions. The X-prefixed dwordx4 line uses a prefix that is not enabled
; by the FileCheck invocations at the top of the file, so it is inactive.
137 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
138 ; XGCN: buffer_store_dwordx4
139 ; GCN: buffer_store_dword v
140 ; GCN: buffer_store_dword v
141 ; GCN: buffer_store_dword v
142 ; GCN: buffer_store_dword v
143 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
144 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
145 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
146 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
148 store float 1.0, float addrspace(1)* %out.gep.1
149 store float 2.0, float addrspace(1)* %out.gep.2
150 store float 4.0, float addrspace(1)* %out.gep.3
; The base element is stored last.
151 store float 8.0, float addrspace(1)* %out
; Three constant i32 stores covering elements 0..2 of %out, issued in the
; order 1, 2, 0. The DAG checks expect one buffer_store_dwordx2 and one
; buffer_store_dword in either order, and the NOT check forbids any further
; buffer_store_dword after them.
; NOTE(review): the plain `buffer_store_dword` pattern is also a prefix of
; `buffer_store_dwordx2`, so it is looser than the ` v`-suffixed patterns used
; by other tests in this file - confirm this is intended.
155 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
156 ; SI-DAG: buffer_store_dwordx2
157 ; SI-DAG: buffer_store_dword
158 ; SI-NOT: buffer_store_dword
160 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
161 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
162 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
164 store i32 123, i32 addrspace(1)* %out.gep.1
165 store i32 456, i32 addrspace(1)* %out.gep.2
166 store i32 1234, i32 addrspace(1)* %out
; Two constant i64 stores to adjacent global elements (element 1, then
; element 0). The active checks expect two buffer_store_dwordx2 instructions.
; The X-prefixed dwordx4 line uses a prefix that is not enabled by the
; FileCheck invocations at the top of the file, so it is inactive.
170 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
171 ; XGCN: buffer_store_dwordx4
172 ; GCN: buffer_store_dwordx2
173 ; GCN: buffer_store_dwordx2
174 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
; Note the i64 index type here; the i32 tests index with i32.
175 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
177 store i64 123, i64 addrspace(1)* %out.gep.1
178 store i64 456, i64 addrspace(1)* %out
; Four constant i64 stores covering elements 0..3 of %out, issued in the
; order 1, 2, 3, 0. The active checks expect four buffer_store_dwordx2
; instructions. The two X-prefixed dwordx4 lines use a prefix that is not
; enabled by the FileCheck invocations at the top of the file, so they are
; inactive.
182 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
183 ; XGCN: buffer_store_dwordx4
184 ; XGCN: buffer_store_dwordx4
186 ; GCN: buffer_store_dwordx2
187 ; GCN: buffer_store_dwordx2
188 ; GCN: buffer_store_dwordx2
189 ; GCN: buffer_store_dwordx2
190 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
191 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
192 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
193 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
195 store i64 123, i64 addrspace(1)* %out.gep.1
196 store i64 456, i64 addrspace(1)* %out.gep.2
197 store i64 333, i64 addrspace(1)* %out.gep.3
; The base element is stored last.
198 store i64 1234, i64 addrspace(1)* %out
; Copies two adjacent i32 elements from %in to the same positions in %out.
; The checks expect one buffer_load_dwordx2 whose destination register range
; (captured as LOAD) is then written by one buffer_store_dwordx2.
202 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
203 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
204 ; GCN: buffer_store_dwordx2 [[LOAD]]
205 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
206 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
207 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
209 %lo = load i32, i32 addrspace(1)* %in
210 %hi = load i32, i32 addrspace(1)* %in.gep.1
; Element order is preserved: in[0] -> out[0], in[1] -> out[1].
212 store i32 %lo, i32 addrspace(1)* %out
213 store i32 %hi, i32 addrspace(1)* %out.gep.1
; Copies i32 elements 2 and 3 of %in to elements 2 and 3 of %out. The checks
; expect one dwordx2 load and one dwordx2 store of the same registers, both
; with an immediate `offset:8` (element index 2 times 4 bytes).
217 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
218 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
219 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
220 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
221 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
222 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
224 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
225 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
226 %lo = load i32, i32 addrspace(1)* %in.gep.0
227 %hi = load i32, i32 addrspace(1)* %in.gep.1
229 store i32 %lo, i32 addrspace(1)* %out.gep.0
230 store i32 %hi, i32 addrspace(1)* %out.gep.1
; Loads two adjacent i32 elements from %in and stores them to %out with the
; two values swapped. The checks expect two separate buffer_load_dword and
; two separate buffer_store_dword instructions.
234 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
235 ; GCN: buffer_load_dword v
236 ; GCN: buffer_load_dword v
237 ; GCN: buffer_store_dword v
238 ; GCN: buffer_store_dword v
239 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
240 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
241 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
243 %lo = load i32, i32 addrspace(1)* %in
244 %hi = load i32, i32 addrspace(1)* %in.gep.1
; Swapped: in[1] -> out[0], in[0] -> out[1].
246 store i32 %hi, i32 addrspace(1)* %out
247 store i32 %lo, i32 addrspace(1)* %out.gep.1
; Copies four adjacent i32 elements from %in to the same positions in %out.
; The checks expect one buffer_load_dwordx4 whose destination register range
; (captured as LOAD) is then written by one buffer_store_dwordx4.
251 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
252 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
253 ; GCN: buffer_store_dwordx4 [[LOAD]]
254 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
255 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
256 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
257 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
258 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
259 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
260 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
262 %x = load i32, i32 addrspace(1)* %in
263 %y = load i32, i32 addrspace(1)* %in.gep.1
264 %z = load i32, i32 addrspace(1)* %in.gep.2
265 %w = load i32, i32 addrspace(1)* %in.gep.3
; Element order is preserved: in[i] -> out[i] for i = 0..3.
267 store i32 %x, i32 addrspace(1)* %out
268 store i32 %y, i32 addrspace(1)* %out.gep.1
269 store i32 %z, i32 addrspace(1)* %out.gep.2
270 store i32 %w, i32 addrspace(1)* %out.gep.3
; Copies three adjacent i32 elements from %in to the same positions in %out.
; The DAG checks expect a dwordx2 load plus a dword load, and a dword store
; plus a dwordx2 store, without fixing their relative order.
274 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
275 ; SI-DAG: buffer_load_dwordx2
276 ; SI-DAG: buffer_load_dword v
278 ; SI-DAG: buffer_store_dword v
279 ; SI-DAG: buffer_store_dwordx2 v
281 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
282 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
283 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
284 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
285 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
287 %x = load i32, i32 addrspace(1)* %in
288 %y = load i32, i32 addrspace(1)* %in.gep.1
289 %z = load i32, i32 addrspace(1)* %in.gep.2
291 store i32 %x, i32 addrspace(1)* %out
292 store i32 %y, i32 addrspace(1)* %out.gep.1
293 store i32 %z, i32 addrspace(1)* %out.gep.2
; Float version of the four-element copy: four adjacent float elements are
; loaded from %in and stored to the same positions in %out. The checks expect
; one buffer_load_dwordx4 and one buffer_store_dwordx4 of the same registers.
297 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
298 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
299 ; GCN: buffer_store_dwordx4 [[LOAD]]
300 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
301 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
302 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
303 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
304 %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
305 %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
306 %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
308 %x = load float, float addrspace(1)* %in
309 %y = load float, float addrspace(1)* %in.gep.1
310 %z = load float, float addrspace(1)* %in.gep.2
311 %w = load float, float addrspace(1)* %in.gep.3
313 store float %x, float addrspace(1)* %out
314 store float %y, float addrspace(1)* %out.gep.1
315 store float %z, float addrspace(1)* %out.gep.2
316 store float %w, float addrspace(1)* %out.gep.3
; Copies i32 elements 11..14 of %in to elements 7..10 of %out. The checks
; expect one dwordx4 load with `offset:44` (11 * 4 bytes) and one dwordx4
; store of the same registers with `offset:28` (7 * 4 bytes).
320 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
321 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
322 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
323 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
324 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
325 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
326 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
327 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
328 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
329 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
330 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
331 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
333 %x = load i32, i32 addrspace(1)* %in.gep.0
334 %y = load i32, i32 addrspace(1)* %in.gep.1
335 %z = load i32, i32 addrspace(1)* %in.gep.2
336 %w = load i32, i32 addrspace(1)* %in.gep.3
338 store i32 %x, i32 addrspace(1)* %out.gep.0
339 store i32 %y, i32 addrspace(1)* %out.gep.1
340 store i32 %z, i32 addrspace(1)* %out.gep.2
341 store i32 %w, i32 addrspace(1)* %out.gep.3
; Copies four adjacent i32 elements from %in to the same positions in %out,
; but issues the stores in descending address order (3, 2, 1, 0) and places a
; barrier call between the loads and the stores. The checks still expect one
; buffer_load_dwordx4 and one buffer_store_dwordx4 of the same registers.
345 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
346 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
348 ; GCN: buffer_store_dwordx4 [[LOAD]]
349 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
350 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
351 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
352 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
353 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
354 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
355 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
357 %x = load i32, i32 addrspace(1)* %in
358 %y = load i32, i32 addrspace(1)* %in.gep.1
359 %z = load i32, i32 addrspace(1)* %in.gep.2
360 %w = load i32, i32 addrspace(1)* %in.gep.3
362 ; Make sure the barrier doesn't stop this
363 tail call void @llvm.AMDGPU.barrier.local() #1
; Each value still goes to its own index (in[i] -> out[i]); only the order in
; which the stores are issued is reversed.
365 store i32 %w, i32 addrspace(1)* %out.gep.3
366 store i32 %z, i32 addrspace(1)* %out.gep.2
367 store i32 %y, i32 addrspace(1)* %out.gep.1
368 store i32 %x, i32 addrspace(1)* %out
373 ; TODO: Re-packing of the loaded registers is required. Maybe an IR pass should do this?
; Loads four adjacent i32 elements from %in and stores them to %out in
; reversed element order (in[3] -> out[0], ..., in[0] -> out[3]), with a
; barrier call between the loads and the stores. The checks expect four
; separate buffer_load_dword and four separate buffer_store_dword
; instructions.
376 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
377 ; GCN: buffer_load_dword v
378 ; GCN: buffer_load_dword v
379 ; GCN: buffer_load_dword v
380 ; GCN: buffer_load_dword v
382 ; GCN: buffer_store_dword v
383 ; GCN: buffer_store_dword v
384 ; GCN: buffer_store_dword v
385 ; GCN: buffer_store_dword v
386 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
387 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
388 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
389 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
390 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
391 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
392 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
394 %x = load i32, i32 addrspace(1)* %in
395 %y = load i32, i32 addrspace(1)* %in.gep.1
396 %z = load i32, i32 addrspace(1)* %in.gep.2
397 %w = load i32, i32 addrspace(1)* %in.gep.3
; NOTE(review): the comment below matches the one in the inverse-order test,
; but the checks in this test expect unmerged accesses - presumably it was
; copied along with the barrier call; confirm.
399 ; Make sure the barrier doesn't stop this
400 tail call void @llvm.AMDGPU.barrier.local() #1
; Values are permuted across indices here, unlike the inverse-order test.
402 store i32 %w, i32 addrspace(1)* %out
403 store i32 %z, i32 addrspace(1)* %out.gep.1
404 store i32 %y, i32 addrspace(1)* %out.gep.2
405 store i32 %x, i32 addrspace(1)* %out.gep.3
; Copies four adjacent i8 elements from %in to the same positions in %out.
; The first load and the first store carry an explicit `align 4`. The checks
; expect one buffer_load_dword whose destination register (captured as LOAD)
; is then written by one buffer_store_dword.
410 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
411 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
412 ; GCN: buffer_store_dword [[LOAD]]
414 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
; These GEPs use an i8 index type rather than the i32 used by most tests here.
415 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
416 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
417 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
418 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
419 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
420 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
; Only the access to the base address carries the 4-byte alignment.
422 %x = load i8, i8 addrspace(1)* %in, align 4
423 %y = load i8, i8 addrspace(1)* %in.gep.1
424 %z = load i8, i8 addrspace(1)* %in.gep.2
425 %w = load i8, i8 addrspace(1)* %in.gep.3
427 store i8 %x, i8 addrspace(1)* %out, align 4
428 store i8 %y, i8 addrspace(1)* %out.gep.1
429 store i8 %z, i8 addrspace(1)* %out.gep.2
430 store i8 %w, i8 addrspace(1)* %out.gep.3
; Same four-byte copy as the aligned i8 test, but with no explicit alignment
; on any load or store. The checks expect four separate buffer_load_ubyte and
; four separate buffer_store_byte instructions.
434 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
435 ; GCN: buffer_load_ubyte
436 ; GCN: buffer_load_ubyte
437 ; GCN: buffer_load_ubyte
438 ; GCN: buffer_load_ubyte
439 ; GCN: buffer_store_byte
440 ; GCN: buffer_store_byte
441 ; GCN: buffer_store_byte
442 ; GCN: buffer_store_byte
444 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
445 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
446 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
447 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
448 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
449 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
450 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
452 %x = load i8, i8 addrspace(1)* %in
453 %y = load i8, i8 addrspace(1)* %in.gep.1
454 %z = load i8, i8 addrspace(1)* %in.gep.2
455 %w = load i8, i8 addrspace(1)* %in.gep.3
457 store i8 %x, i8 addrspace(1)* %out
458 store i8 %y, i8 addrspace(1)* %out.gep.1
459 store i8 %z, i8 addrspace(1)* %out.gep.2
460 store i8 %w, i8 addrspace(1)* %out.gep.3
; Loads one <4 x i32> vector from %in, extracts its four elements, and stores
; them as scalars to elements 0..3 of %out. The active checks expect one
; buffer_load_dwordx4 followed by four separate buffer_store_dword
; instructions. The X-prefixed dwordx4 store line uses a prefix that is not
; enabled by the FileCheck invocations at the top of the file, so it is
; inactive.
464 ; This works once AA is enabled on the subtarget
465 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
466 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
467 ; XGCN: buffer_store_dwordx4 [[LOAD]]
468 ; GCN: buffer_store_dword v
469 ; GCN: buffer_store_dword v
470 ; GCN: buffer_store_dword v
471 ; GCN: buffer_store_dword v
472 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
473 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
474 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
475 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
476 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
; Split the loaded vector into scalars; lane i is stored to out[i].
478 %x = extractelement <4 x i32> %vec, i32 0
479 %y = extractelement <4 x i32> %vec, i32 1
480 %z = extractelement <4 x i32> %vec, i32 2
481 %w = extractelement <4 x i32> %vec, i32 3
483 store i32 %x, i32 addrspace(1)* %out
484 store i32 %y, i32 addrspace(1)* %out.gep.1
485 store i32 %z, i32 addrspace(1)* %out.gep.2
486 store i32 %w, i32 addrspace(1)* %out.gep.3
; Two constant i8 stores to adjacent addrspace(3) addresses: byte 1 first,
; then byte 0 with an explicit `align 2`. Only the label check is visible for
; this function here.
; The second constant is 200 (0xc8): the 456 (0x1c8) used by the wider tests
; in this file does not fit in an i8, and 200 is its low 8 bits.
490 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
494 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
495 %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
497 store i8 123, i8 addrspace(3)* %out.gep.1
498 store i8 200, i8 addrspace(3)* %out, align 2
; Two constant i32 stores to adjacent addrspace(3) elements (element 1, then
; element 0). The checks expect 0x1c8 (456) and 0x7b (123) to be materialized
; in scalar registers, copied into vector registers LO and HI, and written by
; one ds_write2_b32 with `offset1:1` and nothing after it on the line.
502 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
503 ; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
504 ; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
505 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
506 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
507 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
508 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
509 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
; 123 goes to element 1 (HI), 456 to element 0 (LO).
511 store i32 123, i32 addrspace(3)* %out.gep.1
512 store i32 456, i32 addrspace(3)* %out
; Four constant i32 stores covering elements 0..3 of an addrspace(3) pointer,
; issued in the order 1, 2, 3, 0. Only the label check is visible for this
; function here.
516 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
521 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
522 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
523 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
524 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
526 store i32 123, i32 addrspace(3)* %out.gep.1
527 store i32 456, i32 addrspace(3)* %out.gep.2
528 store i32 333, i32 addrspace(3)* %out.gep.3
; The base element is stored last.
529 store i32 1234, i32 addrspace(3)* %out
; Barrier intrinsic called between the loads and stores of the inverse and
; four-element shuffle tests.
533 declare void @llvm.AMDGPU.barrier.local() #1
; Attribute group #0 is attached to every test function defined in this file.
535 attributes #0 = { nounwind }
; Attribute group #1 is attached to the barrier declaration and its call sites.
536 attributes #1 = { noduplicate nounwind }