1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
4 ; Run on devices that have different unaligned load restrictions.
6 ; TODO: Vector element tests
7 ; TODO: Non-zero base offset for load and store combinations
8 ; TODO: Same base addrspacecasted
; Two constant i8 stores to adjacent bytes in addrspace(1). The store to the
; higher address comes first in the IR; the store to %out is marked align 2.
; The checks below expect two separate buffer_store_byte instructions.
; NOTE(review): the constant 456 does not fit in i8 -- presumably the IR
; parser wraps it; confirm this is intentional.
11 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
12 ; GCN: buffer_store_byte
13 ; GCN: buffer_store_byte
15 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
16 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
18 store i8 123, i8 addrspace(1)* %out.gep.1
19 store i8 456, i8 addrspace(1)* %out, align 2
; Same pair of adjacent constant i8 stores as the align-2 variant, but neither
; store carries an explicit alignment. The checks expect two separate
; buffer_store_byte instructions.
; NOTE(review): the constant 456 does not fit in i8 -- presumably the IR
; parser wraps it; confirm this is intentional.
23 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
24 ; GCN: buffer_store_byte
25 ; GCN: buffer_store_byte
27 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
28 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
30 store i8 123, i8 addrspace(1)* %out.gep.1
31 store i8 456, i8 addrspace(1)* %out
; Two constant i16 stores to adjacent elements in addrspace(1); the store to
; %out is marked align 4. The check expects a single buffer_store_dword.
35 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
36 ; GCN: buffer_store_dword v
37 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
38 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
40 store i16 123, i16 addrspace(1)* %out.gep.1
41 store i16 456, i16 addrspace(1)* %out, align 4
; Variant of the align-4 i16 pair where both stored constants are zero.
; The check expects a single buffer_store_dword.
45 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
46 ; GCN: buffer_store_dword v
47 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
48 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
50 store i16 0, i16 addrspace(1)* %out.gep.1
51 store i16 0, i16 addrspace(1)* %out, align 4
; Two adjacent constant i16 stores with no explicit alignment on either store.
; The checks expect two separate buffer_store_short instructions.
55 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
56 ; GCN: buffer_store_short
57 ; GCN: buffer_store_short
59 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
60 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
62 store i16 123, i16 addrspace(1)* %out.gep.1
63 store i16 456, i16 addrspace(1)* %out
; Two constant i32 stores to adjacent elements: 456 (0x1c8) at %out and
; 123 (0x7b) at element 1. The checks expect both constants to be materialized
; in scalar registers, copied into the low and high registers of a vector
; register pair, and written with a single buffer_store_dwordx2.
67 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
68 ; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
69 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
70 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
71 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
72 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
73 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
74 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
76 store i32 123, i32 addrspace(1)* %out.gep.1
77 store i32 456, i32 addrspace(1)* %out
; Mixed-type pair: float 1.0 is stored to element 1 through a bitcast of the
; i32 pointer, and i32 456 is stored to element 0. The check expects a single
; buffer_store_dwordx2.
81 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
82 ; GCN: buffer_store_dwordx2
83 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
84 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
85 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
86 store float 1.0, float addrspace(1)* %out.gep.1.bc
87 store i32 456, i32 addrspace(1)* %out
; Mixed-type pair with a float base pointer: i32 123 (0x7b) is stored to
; element 1 through a bitcast, and float 4.0 is stored to element 0. The checks
; pin 4.0 as the low half and 0x7b as the high half of the register pair
; written by a single buffer_store_dwordx2.
91 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
92 ; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
93 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
94 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
95 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
96 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
97 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
98 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
99 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
100 store i32 123, i32 addrspace(1)* %out.gep.1.bc
101 store float 4.0, float addrspace(1)* %out
; Four constant i32 stores to adjacent elements; elements 1..3 are written
; first and element 0 last. The checks expect a single buffer_store_dwordx4
; whose lowest register holds 1234 (0x4d2, element 0) and whose highest
; register holds 333 (0x14d, element 3).
105 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
106 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
107 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
108 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
109 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
110 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
111 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
112 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
113 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
114 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
116 store i32 123, i32 addrspace(1)* %out.gep.1
117 store i32 456, i32 addrspace(1)* %out.gep.2
118 store i32 333, i32 addrspace(1)* %out.gep.3
119 store i32 1234, i32 addrspace(1)* %out
; Four constant float stores to adjacent elements, issued in ascending address
; order (element 0 first). The check expects a single buffer_store_dwordx4.
123 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
124 ; GCN: buffer_store_dwordx4
125 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
126 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
127 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
128 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
130 store float 8.0, float addrspace(1)* %out
131 store float 1.0, float addrspace(1)* %out.gep.1
132 store float 2.0, float addrspace(1)* %out.gep.2
133 store float 4.0, float addrspace(1)* %out.gep.3
; Four constant float stores to adjacent elements where the store to element 0
; (%out) is issued last, after elements 1..3. The check still expects a single
; buffer_store_dwordx4.
137 ; First store is out of order.
138 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
139 ; GCN: buffer_store_dwordx4
140 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
141 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
142 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
143 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
145 store float 1.0, float addrspace(1)* %out.gep.1
146 store float 2.0, float addrspace(1)* %out.gep.2
147 store float 4.0, float addrspace(1)* %out.gep.3
148 store float 8.0, float addrspace(1)* %out
; Four adjacent 32-bit constant stores that alternate between i32 (elements 1
; and 3, through bitcast pointers) and float (elements 2 and 0). The active
; checks expect four separate buffer_store_dword instructions; the X-prefixed
; line is inactive and records the desired single buffer_store_dwordx4.
152 ; FIXME: Should be able to merge this
153 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
154 ; XGCN: buffer_store_dwordx4
155 ; GCN: buffer_store_dword
156 ; GCN: buffer_store_dword
157 ; GCN: buffer_store_dword
158 ; GCN: buffer_store_dword
160 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
161 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
162 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
163 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
165 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
166 %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
168 store i32 11, i32 addrspace(1)* %out.gep.1.bc
169 store float 2.0, float addrspace(1)* %out.gep.2
170 store i32 17, i32 addrspace(1)* %out.gep.3.bc
171 store float 8.0, float addrspace(1)* %out
; Three constant i32 stores to adjacent elements (element 0 written last).
; The checks expect one buffer_store_dwordx2 and one buffer_store_dword in
; either order, and no further buffer_store_dword after them.
175 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
176 ; SI-DAG: buffer_store_dwordx2
177 ; SI-DAG: buffer_store_dword
178 ; SI-NOT: buffer_store_dword
180 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
181 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
182 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
184 store i32 123, i32 addrspace(1)* %out.gep.1
185 store i32 456, i32 addrspace(1)* %out.gep.2
186 store i32 1234, i32 addrspace(1)* %out
; Two constant i64 stores to adjacent elements. The active checks expect two
; buffer_store_dwordx2 instructions; the X-prefixed line is inactive and
; records a single buffer_store_dwordx4 as the alternative.
190 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
191 ; XGCN: buffer_store_dwordx4
192 ; GCN: buffer_store_dwordx2
193 ; GCN: buffer_store_dwordx2
194 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
195 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
197 store i64 123, i64 addrspace(1)* %out.gep.1
198 store i64 456, i64 addrspace(1)* %out
; Four constant i64 stores to adjacent elements (element 0 written last).
; The active checks expect four buffer_store_dwordx2 instructions; the two
; X-prefixed lines are inactive and record two buffer_store_dwordx4 as the
; alternative.
202 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
203 ; XGCN: buffer_store_dwordx4
204 ; XGCN: buffer_store_dwordx4
206 ; GCN: buffer_store_dwordx2
207 ; GCN: buffer_store_dwordx2
208 ; GCN: buffer_store_dwordx2
209 ; GCN: buffer_store_dwordx2
210 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
211 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
212 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
213 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
215 store i64 123, i64 addrspace(1)* %out.gep.1
216 store i64 456, i64 addrspace(1)* %out.gep.2
217 store i64 333, i64 addrspace(1)* %out.gep.3
218 store i64 1234, i64 addrspace(1)* %out
; Copies two adjacent i32 elements from %in to the same element positions of
; %out. The checks expect one buffer_load_dwordx2 whose result register pair
; is stored directly by one buffer_store_dwordx2.
222 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
223 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
224 ; GCN: buffer_store_dwordx2 [[LOAD]]
225 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
226 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
227 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
229 %lo = load i32, i32 addrspace(1)* %in
230 %hi = load i32, i32 addrspace(1)* %in.gep.1
232 store i32 %lo, i32 addrspace(1)* %out
233 store i32 %hi, i32 addrspace(1)* %out.gep.1
; Same two-element copy, but both the loads and the stores start at element
; index 2 of their pointer. The checks expect the dwordx2 load and the dwordx2
; store to each carry an immediate offset of 8 bytes.
237 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
238 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
239 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
240 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
241 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
242 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
244 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
245 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
246 %lo = load i32, i32 addrspace(1)* %in.gep.0
247 %hi = load i32, i32 addrspace(1)* %in.gep.1
249 store i32 %lo, i32 addrspace(1)* %out.gep.0
250 store i32 %hi, i32 addrspace(1)* %out.gep.1
; Loads two adjacent i32 elements but stores them swapped: the value from
; element 1 goes to %out and the value from element 0 goes to element 1.
; The checks expect two single-dword loads and two single-dword stores.
254 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
255 ; GCN: buffer_load_dword v
256 ; GCN: buffer_load_dword v
257 ; GCN: buffer_store_dword v
258 ; GCN: buffer_store_dword v
259 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
260 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
261 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
263 %lo = load i32, i32 addrspace(1)* %in
264 %hi = load i32, i32 addrspace(1)* %in.gep.1
266 store i32 %hi, i32 addrspace(1)* %out
267 store i32 %lo, i32 addrspace(1)* %out.gep.1
; Copies four adjacent i32 elements from %in to the same element positions of
; %out, in ascending order. The checks expect one buffer_load_dwordx4 whose
; result registers are stored directly by one buffer_store_dwordx4.
271 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
272 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
273 ; GCN: buffer_store_dwordx4 [[LOAD]]
274 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
275 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
276 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
277 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
278 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
279 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
280 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
282 %x = load i32, i32 addrspace(1)* %in
283 %y = load i32, i32 addrspace(1)* %in.gep.1
284 %z = load i32, i32 addrspace(1)* %in.gep.2
285 %w = load i32, i32 addrspace(1)* %in.gep.3
287 store i32 %x, i32 addrspace(1)* %out
288 store i32 %y, i32 addrspace(1)* %out.gep.1
289 store i32 %z, i32 addrspace(1)* %out.gep.2
290 store i32 %w, i32 addrspace(1)* %out.gep.3
; Copies three adjacent i32 elements from %in to %out. The order-independent
; checks expect the loads to appear as one dwordx2 plus one dword, and the
; stores as one dword plus one dwordx2.
294 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
295 ; SI-DAG: buffer_load_dwordx2
296 ; SI-DAG: buffer_load_dword v
298 ; SI-DAG: buffer_store_dword v
299 ; SI-DAG: buffer_store_dwordx2 v
301 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
302 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
303 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
304 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
305 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
307 %x = load i32, i32 addrspace(1)* %in
308 %y = load i32, i32 addrspace(1)* %in.gep.1
309 %z = load i32, i32 addrspace(1)* %in.gep.2
311 store i32 %x, i32 addrspace(1)* %out
312 store i32 %y, i32 addrspace(1)* %out.gep.1
313 store i32 %z, i32 addrspace(1)* %out.gep.2
; Float version of the four-element copy: four adjacent float loads from %in
; are stored to the same element positions of %out. The checks expect one
; buffer_load_dwordx4 feeding one buffer_store_dwordx4.
317 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
318 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
319 ; GCN: buffer_store_dwordx4 [[LOAD]]
320 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
321 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
322 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
323 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
324 %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
325 %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
326 %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
328 %x = load float, float addrspace(1)* %in
329 %y = load float, float addrspace(1)* %in.gep.1
330 %z = load float, float addrspace(1)* %in.gep.2
331 %w = load float, float addrspace(1)* %in.gep.3
333 store float %x, float addrspace(1)* %out
334 store float %y, float addrspace(1)* %out.gep.1
335 store float %z, float addrspace(1)* %out.gep.2
336 store float %w, float addrspace(1)* %out.gep.3
; Four-element copy with different non-zero bases: the loads read elements
; 11..14 of %in and the stores write elements 7..10 of %out. The checks expect
; a dwordx4 load at byte offset 44 and a dwordx4 store at byte offset 28.
340 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
341 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
342 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
343 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
344 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
345 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
346 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
347 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
348 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
349 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
350 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
351 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
353 %x = load i32, i32 addrspace(1)* %in.gep.0
354 %y = load i32, i32 addrspace(1)* %in.gep.1
355 %z = load i32, i32 addrspace(1)* %in.gep.2
356 %w = load i32, i32 addrspace(1)* %in.gep.3
358 store i32 %x, i32 addrspace(1)* %out.gep.0
359 store i32 %y, i32 addrspace(1)* %out.gep.1
360 store i32 %z, i32 addrspace(1)* %out.gep.2
361 store i32 %w, i32 addrspace(1)* %out.gep.3
; Four adjacent i32 loads in ascending order, then a barrier call, then the
; four stores issued in descending address order. Each value still goes to the
; element it was loaded from, and the checks expect one buffer_load_dwordx4
; feeding one buffer_store_dwordx4.
365 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
366 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
368 ; GCN: buffer_store_dwordx4 [[LOAD]]
369 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
370 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
371 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
372 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
373 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
374 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
375 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
377 %x = load i32, i32 addrspace(1)* %in
378 %y = load i32, i32 addrspace(1)* %in.gep.1
379 %z = load i32, i32 addrspace(1)* %in.gep.2
380 %w = load i32, i32 addrspace(1)* %in.gep.3
382 ; Make sure the barrier doesn't stop this
383 tail call void @llvm.AMDGPU.barrier.local() #1
385 store i32 %w, i32 addrspace(1)* %out.gep.3
386 store i32 %z, i32 addrspace(1)* %out.gep.2
387 store i32 %y, i32 addrspace(1)* %out.gep.1
388 store i32 %x, i32 addrspace(1)* %out
; Four adjacent i32 loads, a barrier call, then the values stored in reversed
; element order (%w to element 0, %z to 1, %y to 2, %x to 3). The checks expect
; four single-dword loads followed by four single-dword stores.
393 ; TODO: Re-packing of loaded register required. Maybe an IR pass
396 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
397 ; GCN: buffer_load_dword v
398 ; GCN: buffer_load_dword v
399 ; GCN: buffer_load_dword v
400 ; GCN: buffer_load_dword v
402 ; GCN: buffer_store_dword v
403 ; GCN: buffer_store_dword v
404 ; GCN: buffer_store_dword v
405 ; GCN: buffer_store_dword v
406 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
407 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
408 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
409 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
410 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
411 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
412 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
414 %x = load i32, i32 addrspace(1)* %in
415 %y = load i32, i32 addrspace(1)* %in.gep.1
416 %z = load i32, i32 addrspace(1)* %in.gep.2
417 %w = load i32, i32 addrspace(1)* %in.gep.3
; NOTE(review): the comment below matches the one in the inverse test, but
; the checks here expect unmerged accesses -- confirm it still applies.
419 ; Make sure the barrier doesn't stop this
420 tail call void @llvm.AMDGPU.barrier.local() #1
422 store i32 %w, i32 addrspace(1)* %out
423 store i32 %z, i32 addrspace(1)* %out.gep.1
424 store i32 %y, i32 addrspace(1)* %out.gep.2
425 store i32 %x, i32 addrspace(1)* %out.gep.3
; Copies four adjacent i8 elements from %in to %out. The first load and the
; first store are marked align 4. The checks expect one buffer_load_dword whose
; result is stored directly by one buffer_store_dword.
430 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
431 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
432 ; GCN: buffer_store_dword [[LOAD]]
434 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
435 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
436 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
437 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
438 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
439 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
440 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
442 %x = load i8, i8 addrspace(1)* %in, align 4
443 %y = load i8, i8 addrspace(1)* %in.gep.1
444 %z = load i8, i8 addrspace(1)* %in.gep.2
445 %w = load i8, i8 addrspace(1)* %in.gep.3
447 store i8 %x, i8 addrspace(1)* %out, align 4
448 store i8 %y, i8 addrspace(1)* %out.gep.1
449 store i8 %z, i8 addrspace(1)* %out.gep.2
450 store i8 %w, i8 addrspace(1)* %out.gep.3
; Same four-byte copy as the align-4 variant, but with no explicit alignment
; on any load or store. The checks expect four buffer_load_ubyte instructions
; followed by four buffer_store_byte instructions.
454 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
455 ; GCN: buffer_load_ubyte
456 ; GCN: buffer_load_ubyte
457 ; GCN: buffer_load_ubyte
458 ; GCN: buffer_load_ubyte
459 ; GCN: buffer_store_byte
460 ; GCN: buffer_store_byte
461 ; GCN: buffer_store_byte
462 ; GCN: buffer_store_byte
464 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
465 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
466 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
467 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
468 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
469 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
470 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
472 %x = load i8, i8 addrspace(1)* %in
473 %y = load i8, i8 addrspace(1)* %in.gep.1
474 %z = load i8, i8 addrspace(1)* %in.gep.2
475 %w = load i8, i8 addrspace(1)* %in.gep.3
477 store i8 %x, i8 addrspace(1)* %out
478 store i8 %y, i8 addrspace(1)* %out.gep.1
479 store i8 %z, i8 addrspace(1)* %out.gep.2
480 store i8 %w, i8 addrspace(1)* %out.gep.3
; Loads one <4 x i32> vector, extracts its four elements, and stores them to
; four adjacent i32 elements of %out in order. The active checks expect one
; buffer_load_dwordx4 and four single-dword stores; the X-prefixed line is
; inactive and records the desired single buffer_store_dwordx4.
484 ; This works once AA is enabled on the subtarget
485 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
486 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
487 ; XGCN: buffer_store_dwordx4 [[LOAD]]
488 ; GCN: buffer_store_dword v
489 ; GCN: buffer_store_dword v
490 ; GCN: buffer_store_dword v
491 ; GCN: buffer_store_dword v
492 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
493 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
494 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
495 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
496 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
498 %x = extractelement <4 x i32> %vec, i32 0
499 %y = extractelement <4 x i32> %vec, i32 1
500 %z = extractelement <4 x i32> %vec, i32 2
501 %w = extractelement <4 x i32> %vec, i32 3
503 store i32 %x, i32 addrspace(1)* %out
504 store i32 %y, i32 addrspace(1)* %out.gep.1
505 store i32 %z, i32 addrspace(1)* %out.gep.2
506 store i32 %w, i32 addrspace(1)* %out.gep.3
; addrspace(3) counterpart of the two-constant i8 test: two constant i8 stores
; to adjacent bytes, with the store to %out marked align 2.
; NOTE(review): the constant 456 does not fit in i8 -- presumably the IR
; parser wraps it; confirm this is intentional.
510 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
514 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
515 %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
517 store i8 123, i8 addrspace(3)* %out.gep.1
518 store i8 456, i8 addrspace(3)* %out, align 2
; Two constant i32 stores to adjacent elements in addrspace(3): 456 (0x1c8) at
; %out and 123 (0x7b) at element 1. The checks expect a single ds_write2_b32
; that writes both values, with offset1:1 and no other trailing operand.
522 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
523 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
524 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
525 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
526 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
527 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
529 store i32 123, i32 addrspace(3)* %out.gep.1
530 store i32 456, i32 addrspace(3)* %out
; Four constant i32 stores to adjacent elements in addrspace(3); elements 1..3
; are written first and element 0 (%out) last.
534 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
539 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
540 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
541 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
542 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
544 store i32 123, i32 addrspace(3)* %out.gep.1
545 store i32 456, i32 addrspace(3)* %out.gep.2
546 store i32 333, i32 addrspace(3)* %out.gep.3
547 store i32 1234, i32 addrspace(3)* %out
; Five constant i32 stores to consecutive elements in ascending order, each
; marked align 4. The checks expect the first four (9 in the lowest register,
; -12 in the highest) to be written by one buffer_store_dwordx4, followed by a
; separate single-dword store of the fifth value, 11.
; Unlike most functions in this file, the definition has no attribute group.
551 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
552 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
553 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
554 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
555 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
556 ; GCN: buffer_store_dword v[[HI]]
557 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
558 store i32 9, i32 addrspace(1)* %out, align 4
559 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
560 store i32 12, i32 addrspace(1)* %idx1, align 4
561 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
562 store i32 16, i32 addrspace(1)* %idx2, align 4
563 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
564 store i32 -12, i32 addrspace(1)* %idx3, align 4
565 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
566 store i32 11, i32 addrspace(1)* %idx4, align 4
; Six constant i32 stores to consecutive elements in ascending order, each
; marked align 4. The checks expect one buffer_store_dwordx4 followed by one
; buffer_store_dwordx2.
570 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
571 ; GCN: buffer_store_dwordx4
572 ; GCN: buffer_store_dwordx2
573 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
574 store i32 13, i32 addrspace(1)* %out, align 4
575 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
576 store i32 15, i32 addrspace(1)* %idx1, align 4
577 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
578 store i32 62, i32 addrspace(1)* %idx2, align 4
579 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
580 store i32 63, i32 addrspace(1)* %idx3, align 4
581 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
582 store i32 11, i32 addrspace(1)* %idx4, align 4
583 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
584 store i32 123, i32 addrspace(1)* %idx5, align 4
; Seven constant i32 stores to consecutive elements in ascending order, each
; marked align 4. The checks expect one buffer_store_dwordx4, then one
; buffer_store_dwordx2, then one single-dword store.
588 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
589 ; GCN: buffer_store_dwordx4
590 ; GCN: buffer_store_dwordx2
591 ; GCN: buffer_store_dword v
592 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
593 store i32 34, i32 addrspace(1)* %out, align 4
594 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
595 store i32 999, i32 addrspace(1)* %idx1, align 4
596 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
597 store i32 65, i32 addrspace(1)* %idx2, align 4
598 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
599 store i32 33, i32 addrspace(1)* %idx3, align 4
600 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
601 store i32 98, i32 addrspace(1)* %idx4, align 4
602 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
603 store i32 91, i32 addrspace(1)* %idx5, align 4
604 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
605 store i32 212, i32 addrspace(1)* %idx6, align 4
; Eight constant i32 stores to consecutive elements in ascending order, each
; marked align 4. The active checks expect eight single-dword stores; the two
; X-prefixed lines are inactive and record two buffer_store_dwordx4 as the
; alternative.
609 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
610 ; XGCN: buffer_store_dwordx4
611 ; XGCN: buffer_store_dwordx4
613 ; GCN: buffer_store_dword v
614 ; GCN: buffer_store_dword v
615 ; GCN: buffer_store_dword v
616 ; GCN: buffer_store_dword v
617 ; GCN: buffer_store_dword v
618 ; GCN: buffer_store_dword v
619 ; GCN: buffer_store_dword v
620 ; GCN: buffer_store_dword v
621 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
622 store i32 34, i32 addrspace(1)* %out, align 4
623 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
624 store i32 999, i32 addrspace(1)* %idx1, align 4
625 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
626 store i32 65, i32 addrspace(1)* %idx2, align 4
627 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
628 store i32 33, i32 addrspace(1)* %idx3, align 4
629 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
630 store i32 98, i32 addrspace(1)* %idx4, align 4
631 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
632 store i32 91, i32 addrspace(1)* %idx5, align 4
633 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
634 store i32 212, i32 addrspace(1)* %idx6, align 4
635 %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
636 store i32 999, i32 addrspace(1)* %idx7, align 4
; Barrier intrinsic called between the loads and the stores in the
; 4-element inverse and shuffle tests.
640 declare void @llvm.AMDGPU.barrier.local() #1
; Attribute group #0 is attached to most test functions in this file;
; #1 is attached to the barrier declaration and its call sites.
642 attributes #0 = { nounwind }
643 attributes #1 = { noduplicate nounwind }