1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
4 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
5 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
7 ; Run on devices with different unaligned load restrictions.
9 ; TODO: Vector element tests
10 ; TODO: Non-zero base offset for load and store combinations
11 ; TODO: Same base addrspacecasted
14 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
15 ; GCN: buffer_store_byte
16 ; GCN: buffer_store_byte
; Writes two i8 constants to %out+1 and then %out in global memory
; (addrspace(1)). Only the store to %out carries an explicit alignment (align 2).
; NOTE(review): 456 is out of range for i8 -- presumably truncated when the IR
; is parsed; confirm this is intentional.
18 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
19 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
21 store i8 123, i8 addrspace(1)* %out.gep.1
22 store i8 456, i8 addrspace(1)* %out, align 2
26 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
27 ; GCN: buffer_store_byte
28 ; GCN: buffer_store_byte
; Same two i8 constant stores as the align-2 variant, but neither store has an
; explicit alignment. The checks above expect two separate byte stores.
; NOTE(review): 456 is out of range for i8 -- presumably truncated when the IR
; is parsed; confirm this is intentional.
30 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
31 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
33 store i8 123, i8 addrspace(1)* %out.gep.1
34 store i8 456, i8 addrspace(1)* %out
38 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
39 ; GCN: buffer_store_dword v
; Writes two i16 constants to element 1 and then element 0 of %out. The store
; to the base pointer is marked align 4; the checks above expect a single dword
; store.
40 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
41 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
43 store i16 123, i16 addrspace(1)* %out.gep.1
44 store i16 456, i16 addrspace(1)* %out, align 4
48 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
49 ; GCN: buffer_store_dword v
; Variant of the two-element i16 test in which both stored constants are zero.
; The base store is marked align 4; the checks above expect a single dword
; store.
50 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
51 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
53 store i16 0, i16 addrspace(1)* %out.gep.1
54 store i16 0, i16 addrspace(1)* %out, align 4
58 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
59 ; GCN: buffer_store_short
60 ; GCN: buffer_store_short
; Two i16 constant stores with no explicit alignment on either store. The
; checks above expect two separate short stores.
62 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
63 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
65 store i16 123, i16 addrspace(1)* %out.gep.1
66 store i16 456, i16 addrspace(1)* %out
70 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
71 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
72 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
73 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Writes i32 123 (0x7b) to element 1 and then i32 456 (0x1c8) to element 0.
; The checks above expect one dwordx2 store with 0x1c8 in the low register and
; 0x7b in the high register.
74 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
75 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
77 store i32 123, i32 addrspace(1)* %out.gep.1
78 store i32 456, i32 addrspace(1)* %out
82 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
83 ; GCN: buffer_store_dwordx2
; Mixed-type pair: element 1 is written as float 1.0 through a bitcast of the
; i32 pointer, element 0 is written as i32 456. The checks above expect a
; single dwordx2 store.
84 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
85 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
86 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
87 store float 1.0, float addrspace(1)* %out.gep.1.bc
88 store i32 456, i32 addrspace(1)* %out
92 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
93 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
94 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
95 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
; Mirror of the i32_f32 test: element 1 is written as i32 123 (0x7b) through a
; bitcast of the float pointer, element 0 is written as float 4.0. The checks
; above expect 4.0 in the low register and 0x7b in the high register of one
; dwordx2 store.
96 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
97 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
98 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
99 store i32 123, i32 addrspace(1)* %out.gep.1.bc
100 store float 4.0, float addrspace(1)* %out
104 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
105 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
106 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
107 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
108 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
109 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Writes four i32 constants to elements 1, 2, 3 and finally 0 of %out.
; The checks above expect one dwordx4 store whose lowest register holds 0x4d2
; (1234, element 0) and whose highest register holds 0x14d (333, element 3).
110 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
111 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
112 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
113 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
115 store i32 123, i32 addrspace(1)* %out.gep.1
116 store i32 456, i32 addrspace(1)* %out.gep.2
117 store i32 333, i32 addrspace(1)* %out.gep.3
118 store i32 1234, i32 addrspace(1)* %out
122 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
123 ; GCN: buffer_store_dwordx4
; Writes four float constants to elements 0..3 in ascending address order.
; The checks above expect a single dwordx4 store.
124 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
125 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
126 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
127 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
129 store float 8.0, float addrspace(1)* %out
130 store float 1.0, float addrspace(1)* %out.gep.1
131 store float 2.0, float addrspace(1)* %out.gep.2
132 store float 4.0, float addrspace(1)* %out.gep.3
136 ; First store is out of order.
137 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
138 ; GCN: buffer_store_dwordx4
; Same four float constants as the _order variant, but the store to element 0
; is issued last (after elements 1, 2 and 3). The checks above still expect a
; single dwordx4 store.
139 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
140 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
141 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
142 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
144 store float 1.0, float addrspace(1)* %out.gep.1
145 store float 2.0, float addrspace(1)* %out.gep.2
146 store float 4.0, float addrspace(1)* %out.gep.3
147 store float 8.0, float addrspace(1)* %out
151 ; FIXME: Should be able to merge this
152 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
153 ; GCN-NOAA: buffer_store_dword v
154 ; GCN-NOAA: buffer_store_dword v
155 ; GCN-NOAA: buffer_store_dword v
156 ; GCN-NOAA: buffer_store_dword v
158 ; GCN-AA: buffer_store_dwordx2
159 ; GCN-AA: buffer_store_dword v
160 ; GCN-AA: buffer_store_dword v
; Four adjacent 32-bit stores that alternate between i32 and float: elements 1
; and 3 are written as i32 through bitcast pointers, elements 2 and 0 as float.
; Element 0 is stored last. The checks above differ between the runs with and
; without -combiner-alias-analysis.
163 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
164 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
165 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
166 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
168 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
169 %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
171 store i32 11, i32 addrspace(1)* %out.gep.1.bc
172 store float 2.0, float addrspace(1)* %out.gep.2
173 store i32 17, i32 addrspace(1)* %out.gep.3.bc
174 store float 8.0, float addrspace(1)* %out
178 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
179 ; SI-DAG: buffer_store_dwordx2
180 ; SI-DAG: buffer_store_dword
181 ; SI-NOT: buffer_store_dword
; Writes three i32 constants to elements 1, 2 and finally 0 of %out.
; The checks above expect one dwordx2 store plus one dword store and no
; further dword stores.
183 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
184 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
185 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
187 store i32 123, i32 addrspace(1)* %out.gep.1
188 store i32 456, i32 addrspace(1)* %out.gep.2
189 store i32 1234, i32 addrspace(1)* %out
193 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
194 ; XGCN: buffer_store_dwordx4
195 ; GCN: buffer_store_dwordx2
196 ; GCN: buffer_store_dwordx2
; Writes two i64 constants to element 1 and then element 0 of %out.
; The active checks above expect two dwordx2 stores; the dwordx4 expectation
; uses a prefix that none of the test invocations enable.
197 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
198 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
200 store i64 123, i64 addrspace(1)* %out.gep.1
201 store i64 456, i64 addrspace(1)* %out
205 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
206 ; XGCN: buffer_store_dwordx4
207 ; XGCN: buffer_store_dwordx4
209 ; GCN: buffer_store_dwordx2
210 ; GCN: buffer_store_dwordx2
211 ; GCN: buffer_store_dwordx2
212 ; GCN: buffer_store_dwordx2
; Writes four i64 constants to elements 1, 2, 3 and finally 0 of %out.
; The active checks above expect four dwordx2 stores; the dwordx4 expectations
; use a prefix that none of the test invocations enable.
213 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
214 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
215 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
216 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
218 store i64 123, i64 addrspace(1)* %out.gep.1
219 store i64 456, i64 addrspace(1)* %out.gep.2
220 store i64 333, i64 addrspace(1)* %out.gep.3
221 store i64 1234, i64 addrspace(1)* %out
225 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
226 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
227 ; GCN: buffer_store_dwordx2 [[LOAD]]
; Copies two adjacent i32 values from %in to %out, keeping element order
; (element 0 -> element 0, element 1 -> element 1). The checks above expect a
; dwordx2 load whose result register pair feeds a dwordx2 store.
228 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
229 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
230 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
232 %lo = load i32, i32 addrspace(1)* %in
233 %hi = load i32, i32 addrspace(1)* %in.gep.1
235 store i32 %lo, i32 addrspace(1)* %out
236 store i32 %hi, i32 addrspace(1)* %out.gep.1
240 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
241 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
242 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; Two-element i32 copy where both the source and destination pairs start at
; element index 2 (byte offset 8, matching the offset:8 in the checks above)
; rather than at the base pointer.
243 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
244 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
245 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
247 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
248 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
249 %lo = load i32, i32 addrspace(1)* %in.gep.0
250 %hi = load i32, i32 addrspace(1)* %in.gep.1
252 store i32 %lo, i32 addrspace(1)* %out.gep.0
253 store i32 %hi, i32 addrspace(1)* %out.gep.1
257 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
258 ; GCN: buffer_load_dword v
259 ; GCN: buffer_load_dword v
260 ; GCN: buffer_store_dword v
261 ; GCN: buffer_store_dword v
; Two-element i32 copy that swaps the elements: the value loaded from element 1
; (%hi) is stored to element 0 and the value from element 0 (%lo) to element 1.
; The checks above expect two separate dword loads and two separate dword
; stores.
262 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
263 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
264 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
266 %lo = load i32, i32 addrspace(1)* %in
267 %hi = load i32, i32 addrspace(1)* %in.gep.1
269 store i32 %hi, i32 addrspace(1)* %out
270 store i32 %lo, i32 addrspace(1)* %out.gep.1
274 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
275 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
276 ; GCN: buffer_store_dwordx4 [[LOAD]]
; Copies four adjacent i32 values from %in to %out, keeping element order.
; All four loads are issued before the four stores. The checks above expect a
; dwordx4 load whose result registers feed a dwordx4 store.
277 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
278 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
279 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
280 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
281 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
282 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
283 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
285 %x = load i32, i32 addrspace(1)* %in
286 %y = load i32, i32 addrspace(1)* %in.gep.1
287 %z = load i32, i32 addrspace(1)* %in.gep.2
288 %w = load i32, i32 addrspace(1)* %in.gep.3
290 store i32 %x, i32 addrspace(1)* %out
291 store i32 %y, i32 addrspace(1)* %out.gep.1
292 store i32 %z, i32 addrspace(1)* %out.gep.2
293 store i32 %w, i32 addrspace(1)* %out.gep.3
297 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
298 ; SI-DAG: buffer_load_dwordx2
299 ; SI-DAG: buffer_load_dword v
301 ; SI-DAG: buffer_store_dword v
302 ; SI-DAG: buffer_store_dwordx2 v
; Copies three adjacent i32 values from %in to %out, keeping element order.
; The checks above expect the accesses to be split into one dwordx2 and one
; dword operation for both the loads and the stores.
304 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
305 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
306 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
307 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
308 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
310 %x = load i32, i32 addrspace(1)* %in
311 %y = load i32, i32 addrspace(1)* %in.gep.1
312 %z = load i32, i32 addrspace(1)* %in.gep.2
314 store i32 %x, i32 addrspace(1)* %out
315 store i32 %y, i32 addrspace(1)* %out.gep.1
316 store i32 %z, i32 addrspace(1)* %out.gep.2
320 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
321 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
322 ; GCN: buffer_store_dwordx4 [[LOAD]]
; Float counterpart of the four-element i32 copy: loads four adjacent floats
; from %in and stores them to %out in the same element order. The checks above
; expect a dwordx4 load feeding a dwordx4 store.
323 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
324 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
325 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
326 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
327 %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
328 %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
329 %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
331 %x = load float, float addrspace(1)* %in
332 %y = load float, float addrspace(1)* %in.gep.1
333 %z = load float, float addrspace(1)* %in.gep.2
334 %w = load float, float addrspace(1)* %in.gep.3
336 store float %x, float addrspace(1)* %out
337 store float %y, float addrspace(1)* %out.gep.1
338 store float %z, float addrspace(1)* %out.gep.2
339 store float %w, float addrspace(1)* %out.gep.3
343 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
344 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
345 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
; Four-element i32 copy with different non-zero bases: the source run starts at
; element 11 of %in (byte offset 44) and the destination run at element 7 of
; %out (byte offset 28), matching the offsets in the checks above.
346 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
347 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
348 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
349 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
350 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
351 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
352 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
353 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
354 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
356 %x = load i32, i32 addrspace(1)* %in.gep.0
357 %y = load i32, i32 addrspace(1)* %in.gep.1
358 %z = load i32, i32 addrspace(1)* %in.gep.2
359 %w = load i32, i32 addrspace(1)* %in.gep.3
361 store i32 %x, i32 addrspace(1)* %out.gep.0
362 store i32 %y, i32 addrspace(1)* %out.gep.1
363 store i32 %z, i32 addrspace(1)* %out.gep.2
364 store i32 %w, i32 addrspace(1)* %out.gep.3
368 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
369 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
371 ; GCN: buffer_store_dwordx4 [[LOAD]]
; Four-element i32 copy in which the stores are issued in descending address
; order (element 3 first, element 0 last); each value still lands at the same
; element index it was loaded from. A barrier call separates the loads from
; the stores. The checks above expect a dwordx4 load feeding a dwordx4 store.
372 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
373 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
374 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
375 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
376 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
377 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
378 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
380 %x = load i32, i32 addrspace(1)* %in
381 %y = load i32, i32 addrspace(1)* %in.gep.1
382 %z = load i32, i32 addrspace(1)* %in.gep.2
383 %w = load i32, i32 addrspace(1)* %in.gep.3
385 ; Make sure the barrier doesn't stop this
386 tail call void @llvm.AMDGPU.barrier.local() #1
388 store i32 %w, i32 addrspace(1)* %out.gep.3
389 store i32 %z, i32 addrspace(1)* %out.gep.2
390 store i32 %y, i32 addrspace(1)* %out.gep.1
391 store i32 %x, i32 addrspace(1)* %out
396 ; TODO: Re-packing of the loaded registers is required; maybe an IR pass should handle this.
399 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
400 ; GCN: buffer_load_dword v
401 ; GCN: buffer_load_dword v
402 ; GCN: buffer_load_dword v
403 ; GCN: buffer_load_dword v
405 ; GCN: buffer_store_dword v
406 ; GCN: buffer_store_dword v
407 ; GCN: buffer_store_dword v
408 ; GCN: buffer_store_dword v
; Four-element i32 copy that reverses the element order: the value loaded from
; element 3 (%w) is stored to element 0, and so on down to %x at element 3.
; A barrier call separates the loads from the stores. The checks above expect
; four separate dword loads and four separate dword stores.
; NOTE(review): the in-body barrier comment appears to be copied from the
; inverse_i32 test; here the checks expect the accesses to stay unmerged.
409 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
410 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
411 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
412 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
413 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
414 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
415 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
417 %x = load i32, i32 addrspace(1)* %in
418 %y = load i32, i32 addrspace(1)* %in.gep.1
419 %z = load i32, i32 addrspace(1)* %in.gep.2
420 %w = load i32, i32 addrspace(1)* %in.gep.3
422 ; Make sure the barrier doesn't stop this
423 tail call void @llvm.AMDGPU.barrier.local() #1
425 store i32 %w, i32 addrspace(1)* %out
426 store i32 %z, i32 addrspace(1)* %out.gep.1
427 store i32 %y, i32 addrspace(1)* %out.gep.2
428 store i32 %x, i32 addrspace(1)* %out.gep.3
433 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
434 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
435 ; GCN: buffer_store_dword [[LOAD]]
; Copies four adjacent i8 values from %in to %out, keeping element order.
; Only the first load and the first store carry an explicit alignment
; (align 4). The GEP indices are typed i8. The checks above expect a single
; dword load feeding a single dword store.
437 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
438 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
439 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
440 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
441 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
442 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
443 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
445 %x = load i8, i8 addrspace(1)* %in, align 4
446 %y = load i8, i8 addrspace(1)* %in.gep.1
447 %z = load i8, i8 addrspace(1)* %in.gep.2
448 %w = load i8, i8 addrspace(1)* %in.gep.3
450 store i8 %x, i8 addrspace(1)* %out, align 4
451 store i8 %y, i8 addrspace(1)* %out.gep.1
452 store i8 %z, i8 addrspace(1)* %out.gep.2
453 store i8 %w, i8 addrspace(1)* %out.gep.3
457 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
458 ; GCN: buffer_load_ubyte
459 ; GCN: buffer_load_ubyte
460 ; GCN: buffer_load_ubyte
461 ; GCN: buffer_load_ubyte
462 ; GCN: buffer_store_byte
463 ; GCN: buffer_store_byte
464 ; GCN: buffer_store_byte
465 ; GCN: buffer_store_byte
; Same four-element i8 copy as the align-4 variant, but with no explicit
; alignment on any load or store. The checks above expect four separate byte
; loads and four separate byte stores.
467 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
468 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
469 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
470 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
471 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
472 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
473 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
475 %x = load i8, i8 addrspace(1)* %in
476 %y = load i8, i8 addrspace(1)* %in.gep.1
477 %z = load i8, i8 addrspace(1)* %in.gep.2
478 %w = load i8, i8 addrspace(1)* %in.gep.3
480 store i8 %x, i8 addrspace(1)* %out
481 store i8 %y, i8 addrspace(1)* %out.gep.1
482 store i8 %z, i8 addrspace(1)* %out.gep.2
483 store i8 %w, i8 addrspace(1)* %out.gep.3
487 ; This works once AA is enabled on the subtarget
488 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
489 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
491 ; GCN-NOAA: buffer_store_dword v
492 ; GCN-NOAA: buffer_store_dword v
493 ; GCN-NOAA: buffer_store_dword v
494 ; GCN-NOAA: buffer_store_dword v
496 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
; Loads one <4 x i32> vector from %in, extracts its four lanes, and stores them
; as scalars to elements 0..3 of %out in lane order. The checks above expect a
; dwordx4 load in every configuration; the store side is expected to become a
; single dwordx4 only in the runs that pass -combiner-alias-analysis.
499 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
500 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
501 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
502 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
503 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
505 %x = extractelement <4 x i32> %vec, i32 0
506 %y = extractelement <4 x i32> %vec, i32 1
507 %z = extractelement <4 x i32> %vec, i32 2
508 %w = extractelement <4 x i32> %vec, i32 3
510 store i32 %x, i32 addrspace(1)* %out
511 store i32 %y, i32 addrspace(1)* %out.gep.1
512 store i32 %z, i32 addrspace(1)* %out.gep.2
513 store i32 %w, i32 addrspace(1)* %out.gep.3
517 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; addrspace(3) counterpart of the two-element i8 constant test: writes two i8
; constants to %out+1 and then %out, with align 2 on the base store only.
; NOTE(review): 456 is out of range for i8 -- presumably truncated when the IR
; is parsed; confirm this is intentional.
521 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
522 %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
524 store i8 123, i8 addrspace(3)* %out.gep.1
525 store i8 456, i8 addrspace(3)* %out, align 2
529 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
530 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
531 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
532 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
; addrspace(3) counterpart of the two-element i32 constant test: writes 123
; (0x7b) to element 1 and then 456 (0x1c8) to element 0. The checks above
; expect a single ds_write2_b32 with 0x1c8 as the first data operand and 0x7b
; as the second.
533 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
534 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
536 store i32 123, i32 addrspace(3)* %out.gep.1
537 store i32 456, i32 addrspace(3)* %out
541 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; addrspace(3) counterpart of the four-element i32 constant test: writes
; constants to elements 1, 2, 3 and finally 0 of %out.
546 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
547 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
548 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
549 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
551 store i32 123, i32 addrspace(3)* %out.gep.1
552 store i32 456, i32 addrspace(3)* %out.gep.2
553 store i32 333, i32 addrspace(3)* %out.gep.3
554 store i32 1234, i32 addrspace(3)* %out
558 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
559 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
560 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
561 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
562 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
563 ; GCN: buffer_store_dword v[[HI]]
; Writes five i32 constants to elements 0..4 of %out in ascending order, each
; store marked align 4 and each address computed with an inbounds GEP using an
; i64 index. This function has no attribute group attached. The checks above
; expect a dwordx4 store covering 9 .. -12 followed by a dword store of 11.
564 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
565 store i32 9, i32 addrspace(1)* %out, align 4
566 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
567 store i32 12, i32 addrspace(1)* %idx1, align 4
568 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
569 store i32 16, i32 addrspace(1)* %idx2, align 4
570 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
571 store i32 -12, i32 addrspace(1)* %idx3, align 4
572 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
573 store i32 11, i32 addrspace(1)* %idx4, align 4
577 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
578 ; GCN: buffer_store_dwordx4
579 ; GCN: buffer_store_dwordx2
; Writes six i32 constants to elements 0..5 of %out in ascending order, each
; store marked align 4. The checks above expect a dwordx4 store followed by a
; dwordx2 store.
580 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
581 store i32 13, i32 addrspace(1)* %out, align 4
582 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
583 store i32 15, i32 addrspace(1)* %idx1, align 4
584 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
585 store i32 62, i32 addrspace(1)* %idx2, align 4
586 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
587 store i32 63, i32 addrspace(1)* %idx3, align 4
588 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
589 store i32 11, i32 addrspace(1)* %idx4, align 4
590 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
591 store i32 123, i32 addrspace(1)* %idx5, align 4
595 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
596 ; GCN: buffer_store_dwordx4
597 ; GCN: buffer_store_dwordx2
598 ; GCN: buffer_store_dword v
; Writes seven i32 constants to elements 0..6 of %out in ascending order, each
; store marked align 4. The checks above expect a dwordx4 store, then a
; dwordx2 store, then a single dword store.
599 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
600 store i32 34, i32 addrspace(1)* %out, align 4
601 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
602 store i32 999, i32 addrspace(1)* %idx1, align 4
603 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
604 store i32 65, i32 addrspace(1)* %idx2, align 4
605 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
606 store i32 33, i32 addrspace(1)* %idx3, align 4
607 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
608 store i32 98, i32 addrspace(1)* %idx4, align 4
609 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
610 store i32 91, i32 addrspace(1)* %idx5, align 4
611 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
612 store i32 212, i32 addrspace(1)* %idx6, align 4
616 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
617 ; GCN: buffer_store_dwordx4
618 ; GCN: buffer_store_dwordx4
; Writes eight i32 constants to elements 0..7 of %out in ascending order, each
; store marked align 4. The checks above expect two dwordx4 stores.
620 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
621 store i32 34, i32 addrspace(1)* %out, align 4
622 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
623 store i32 999, i32 addrspace(1)* %idx1, align 4
624 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
625 store i32 65, i32 addrspace(1)* %idx2, align 4
626 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
627 store i32 33, i32 addrspace(1)* %idx3, align 4
628 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
629 store i32 98, i32 addrspace(1)* %idx4, align 4
630 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
631 store i32 91, i32 addrspace(1)* %idx5, align 4
632 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
633 store i32 212, i32 addrspace(1)* %idx6, align 4
634 %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
635 store i32 999, i32 addrspace(1)* %idx7, align 4
; Barrier intrinsic called between the loads and the stores in the
; four-element inverse_i32 and shuffle_i32 tests.
639 declare void @llvm.AMDGPU.barrier.local() #1
; Attribute group #0 is attached to the test functions that name it in their
; signature; #1 is used by the barrier declaration and its call sites.
641 attributes #0 = { nounwind }
642 attributes #1 = { noduplicate nounwind }