test/CodeGen/AMDGPU/merge-stores.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
   2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
   3
   4 ; Run with devices with different unaligned load restrictions.
   5
   6 ; TODO: Vector element tests
   7 ; TODO: Non-zero base offset for load and store combinations
   8 ; TODO: Same base addrspacecasted
   9
  10
  11 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
  12 ; GCN: buffer_store_byte
  13 ; GCN: buffer_store_byte
  14 ; GCN: s_endpgm
     ; Two constant i8 stores to adjacent addrspace(1) bytes: %out+1 first, then
     ; %out (the latter marked align 2). The checks above look for two byte
     ; stores followed by the end of the program.
  15 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  16   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  17
     ; NOTE(review): 456 is outside the i8 range; presumably the IR parser
     ; truncates it -- confirm before relying on the stored value.
  18   store i8 123, i8 addrspace(1)* %out.gep.1
  19   store i8 456, i8 addrspace(1)* %out, align 2
  20   ret void
  21 }
  22
  23 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
  24 ; GCN: buffer_store_byte
  25 ; GCN: buffer_store_byte
  26 ; GCN: s_endpgm
     ; Same pair of constant i8 stores as the align-2 variant, but neither store
     ; carries an explicit alignment. The checks above look for two byte stores
     ; followed by the end of the program.
  27 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  28   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  29
     ; NOTE(review): 456 is outside the i8 range; presumably the IR parser
     ; truncates it -- confirm before relying on the stored value.
  30   store i8 123, i8 addrspace(1)* %out.gep.1
  31   store i8 456, i8 addrspace(1)* %out
  32   ret void
  33 }
  34
  35 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
  36 ; GCN: buffer_store_dword v
     ; Two constant i16 stores to adjacent elements: element 1 first, then
     ; element 0 with an explicit align 4. The check above looks for a dword
     ; store.
  37 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  38   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  39
  40   store i16 123, i16 addrspace(1)* %out.gep.1
  41   store i16 456, i16 addrspace(1)* %out, align 4
  42   ret void
  43 }
  44
  45 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
  46 ; GCN: buffer_store_dword v
     ; Same shape as the non-zero i16 constant test, but both stored constants
     ; are zero. The store to element 0 is marked align 4 and the check above
     ; looks for a dword store.
  47 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  48   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  49
  50   store i16 0, i16 addrspace(1)* %out.gep.1
  51   store i16 0, i16 addrspace(1)* %out, align 4
  52   ret void
  53 }
  54
  55 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
  56 ; GCN: buffer_store_short
  57 ; GCN: buffer_store_short
  58 ; GCN: s_endpgm
     ; Two constant i16 stores to adjacent elements with no explicit alignment
     ; on either store. The checks above look for two short stores followed by
     ; the end of the program.
  59 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  60   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  61
  62   store i16 123, i16 addrspace(1)* %out.gep.1
  63   store i16 456, i16 addrspace(1)* %out
  64   ret void
  65 }
  66
  67 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
  68 ; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
  69 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
  70 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
  71 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
  72 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
     ; Two constant i32 stores to adjacent elements: 123 (0x7b) to element 1,
     ; then 456 (0x1c8) to element 0. The checks above bind the low register to
     ; 0x1c8 and the high register to 0x7b, then look for one dwordx2 store of
     ; that register pair.
  73 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  74   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  75
  76   store i32 123, i32 addrspace(1)* %out.gep.1
  77   store i32 456, i32 addrspace(1)* %out
  78   ret void
  79 }
  80
  81 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
  82 ; GCN: buffer_store_dwordx2
     ; Adjacent constant stores of mixed type: a float constant to element 1
     ; (through a bitcast of the i32 pointer), then an i32 constant to element
     ; 0. The check above looks for a dwordx2 store.
  83 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  84   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  85   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  86   store float 1.0, float addrspace(1)* %out.gep.1.bc
  87   store i32 456, i32 addrspace(1)* %out
  88   ret void
  89 }
  90
  91 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
  92 ; GCN: buffer_store_dwordx2
     ; Mirror of the i32_f32 test: an i32 constant goes to element 1 (through a
     ; bitcast of the float pointer), then a float constant to element 0. The
     ; check above looks for a dwordx2 store.
  93 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  94   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  95   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  96   store i32 123, i32 addrspace(1)* %out.gep.1.bc
  97   store float 4.0, float addrspace(1)* %out
  98   ret void
  99 }
 100
 101 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
 102 ; GCN: buffer_store_dwordx4
     ; Four constant i32 stores covering elements 0..3 of %out. The check above
     ; looks for a dwordx4 store.
 103 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
 104   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 105   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 106   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 107
     ; Elements 1, 2 and 3 are written first; element 0 is written last.
 108   store i32 123, i32 addrspace(1)* %out.gep.1
 109   store i32 456, i32 addrspace(1)* %out.gep.2
 110   store i32 333, i32 addrspace(1)* %out.gep.3
 111   store i32 1234, i32 addrspace(1)* %out
 112   ret void
 113 }
 114
 115 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 116 ; XGCN: buffer_store_dwordx4
 117 ; GCN: buffer_store_dword v
 118 ; GCN: buffer_store_dword v
 119 ; GCN: buffer_store_dwordx2 v
     ; Four constant float stores to elements 0..3, issued in ascending address
     ; order. The active checks above look for two dword stores and then a
     ; dwordx2 store. The XGCN line uses a prefix that neither run line
     ; enables; presumably it records the desired single dwordx4 store --
     ; confirm.
 120 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
 121   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 122   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 123   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 124
 125   store float 8.0, float addrspace(1)* %out
 126   store float 1.0, float addrspace(1)* %out.gep.1
 127   store float 2.0, float addrspace(1)* %out.gep.2
 128   store float 4.0, float addrspace(1)* %out.gep.3
 129   ret void
 130 }
 131
 132 ; The first store is out of order. Because of the order of the combines,
 133 ; merging the consecutive stores fails: only some of the stores have been
 134 ; replaced with integer constant stores, so they won't merge because
 135 ; the types are different.
 136
 137 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 138 ; XGCN: buffer_store_dwordx4
 139 ; GCN: buffer_store_dword v
 140 ; GCN: buffer_store_dword v
 141 ; GCN: buffer_store_dword v
 142 ; GCN: buffer_store_dword v
     ; Four constant float stores to elements 0..3, with element 0 written
     ; last. The active checks above look for four separate dword stores. The
     ; XGCN line uses a prefix that neither run line enables; presumably it
     ; records the desired single dwordx4 store -- confirm.
 143 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 144   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 145   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 146   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 147
 148   store float 1.0, float addrspace(1)* %out.gep.1
 149   store float 2.0, float addrspace(1)* %out.gep.2
 150   store float 4.0, float addrspace(1)* %out.gep.3
 151   store float 8.0, float addrspace(1)* %out
 152   ret void
 153 }
 154
 155 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
 156 ; SI-DAG: buffer_store_dwordx2
 157 ; SI-DAG: buffer_store_dword
 158 ; SI-NOT: buffer_store_dword
 159 ; GCN: s_endpgm
     ; Three constant i32 stores covering elements 0..2, with element 0 written
     ; last. The checks above accept a dwordx2 store and a dword store in
     ; either order, then require no further buffer_store_dword text before the
     ; end of the program.
 160 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
 161   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 162   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 163
 164   store i32 123, i32 addrspace(1)* %out.gep.1
 165   store i32 456, i32 addrspace(1)* %out.gep.2
 166   store i32 1234, i32 addrspace(1)* %out
 167   ret void
 168 }
 169
 170 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 171 ; XGCN: buffer_store_dwordx4
 172 ; GCN: buffer_store_dwordx2
 173 ; GCN: buffer_store_dwordx2
     ; Two constant i64 stores to adjacent elements: element 1 first, then
     ; element 0. The active checks above look for two dwordx2 stores. The XGCN
     ; line uses a prefix that neither run line enables; presumably it records
     ; the desired single dwordx4 store -- confirm.
 174 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
 175   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 176
 177   store i64 123, i64 addrspace(1)* %out.gep.1
 178   store i64 456, i64 addrspace(1)* %out
 179   ret void
 180 }
 181
 182 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 183 ; XGCN: buffer_store_dwordx4
 184 ; XGCN: buffer_store_dwordx4
 185
 186 ; GCN: buffer_store_dwordx2
 187 ; GCN: buffer_store_dwordx2
 188 ; GCN: buffer_store_dwordx2
 189 ; GCN: buffer_store_dwordx2
     ; Four constant i64 stores covering elements 0..3, with element 0 written
     ; last. The active checks above look for four dwordx2 stores. The two XGCN
     ; lines use a prefix that neither run line enables; presumably they record
     ; the desired pair of dwordx4 stores -- confirm.
 190 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
 191   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 192   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 193   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 194
 195   store i64 123, i64 addrspace(1)* %out.gep.1
 196   store i64 456, i64 addrspace(1)* %out.gep.2
 197   store i64 333, i64 addrspace(1)* %out.gep.3
 198   store i64 1234, i64 addrspace(1)* %out
 199   ret void
 200 }
 201
 202 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 203 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 204 ; GCN: buffer_store_dwordx2 [[LOAD]]
     ; Copies two adjacent i32 elements from %in to the same positions in %out.
     ; The checks above look for one dwordx2 load and one dwordx2 store of the
     ; same register pair.
 205 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 206   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 207   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 208
 209   %lo = load i32, i32 addrspace(1)* %in
 210   %hi = load i32, i32 addrspace(1)* %in.gep.1
 211
 212   store i32 %lo, i32 addrspace(1)* %out
 213   store i32 %hi, i32 addrspace(1)* %out.gep.1
 214   ret void
 215 }
 216
 217 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 218 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 219 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
     ; Copies i32 elements 2 and 3 of %in to elements 2 and 3 of %out. The
     ; checks above look for a dwordx2 load and a dwordx2 store of the same
     ; register pair, each with offset:8 (element index 2 times 4 bytes).
 220 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 221   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 222   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 223
 224   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 225   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 226   %lo = load i32, i32 addrspace(1)* %in.gep.0
 227   %hi = load i32, i32 addrspace(1)* %in.gep.1
 228
 229   store i32 %lo, i32 addrspace(1)* %out.gep.0
 230   store i32 %hi, i32 addrspace(1)* %out.gep.1
 231   ret void
 232 }
 233
 234 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
 235 ; GCN: buffer_load_dword v
 236 ; GCN: buffer_load_dword v
 237 ; GCN: buffer_store_dword v
 238 ; GCN: buffer_store_dword v
     ; Loads two adjacent i32 elements from %in and stores them to %out with
     ; their positions swapped. The checks above look for two dword loads
     ; followed by two dword stores.
 239 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 240   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 241   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 242
 243   %lo = load i32, i32 addrspace(1)* %in
 244   %hi = load i32, i32 addrspace(1)* %in.gep.1
 245
     ; %hi (from element 1) goes to element 0 and %lo to element 1.
 246   store i32 %hi, i32 addrspace(1)* %out
 247   store i32 %lo, i32 addrspace(1)* %out.gep.1
 248   ret void
 249 }
 250
 251 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 252 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 253 ; GCN: buffer_store_dwordx4 [[LOAD]]
     ; Copies four adjacent i32 elements from %in to the same positions in
     ; %out, in ascending order. The checks above look for one dwordx4 load and
     ; one dwordx4 store of the same register range.
 254 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 255   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 256   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 257   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 258   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 259   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 260   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 261
 262   %x = load i32, i32 addrspace(1)* %in
 263   %y = load i32, i32 addrspace(1)* %in.gep.1
 264   %z = load i32, i32 addrspace(1)* %in.gep.2
 265   %w = load i32, i32 addrspace(1)* %in.gep.3
 266
 267   store i32 %x, i32 addrspace(1)* %out
 268   store i32 %y, i32 addrspace(1)* %out.gep.1
 269   store i32 %z, i32 addrspace(1)* %out.gep.2
 270   store i32 %w, i32 addrspace(1)* %out.gep.3
 271   ret void
 272 }
 273
 274 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
 275 ; SI-DAG: buffer_load_dwordx2
 276 ; SI-DAG: buffer_load_dword v
 277 ; GCN: s_waitcnt
 278 ; SI-DAG: buffer_store_dword v
 279 ; SI-DAG: buffer_store_dwordx2 v
 280 ; GCN: s_endpgm
     ; Copies three adjacent i32 elements from %in to the same positions in
     ; %out. The checks above accept a dwordx2 load and a dword load in either
     ; order, then an s_waitcnt, then a dword store and a dwordx2 store in
     ; either order before the end of the program.
 281 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 282   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 283   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 284   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 285   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 286
 287   %x = load i32, i32 addrspace(1)* %in
 288   %y = load i32, i32 addrspace(1)* %in.gep.1
 289   %z = load i32, i32 addrspace(1)* %in.gep.2
 290
 291   store i32 %x, i32 addrspace(1)* %out
 292   store i32 %y, i32 addrspace(1)* %out.gep.1
 293   store i32 %z, i32 addrspace(1)* %out.gep.2
 294   ret void
 295 }
 296
 297 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 298 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 299 ; GCN: buffer_store_dwordx4 [[LOAD]]
     ; Float counterpart of the four-element i32 copy test: four adjacent float
     ; elements are copied from %in to %out in ascending order. The checks
     ; above look for one dwordx4 load and one dwordx4 store of the same
     ; register range.
 300 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 301   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 302   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 303   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 304   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 305   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 306   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 307
 308   %x = load float, float addrspace(1)* %in
 309   %y = load float, float addrspace(1)* %in.gep.1
 310   %z = load float, float addrspace(1)* %in.gep.2
 311   %w = load float, float addrspace(1)* %in.gep.3
 312
 313   store float %x, float addrspace(1)* %out
 314   store float %y, float addrspace(1)* %out.gep.1
 315   store float %z, float addrspace(1)* %out.gep.2
 316   store float %w, float addrspace(1)* %out.gep.3
 317   ret void
 318 }
 319
 320 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 321 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 322 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
     ; Copies i32 elements 11..14 of %in to elements 7..10 of %out. The checks
     ; above look for a dwordx4 load at offset:44 (11 * 4 bytes) and a dwordx4
     ; store of the same register range at offset:28 (7 * 4 bytes).
 323 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 324   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
 325   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 326   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 327   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 328   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
 329   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 330   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 331   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 332
 333   %x = load i32, i32 addrspace(1)* %in.gep.0
 334   %y = load i32, i32 addrspace(1)* %in.gep.1
 335   %z = load i32, i32 addrspace(1)* %in.gep.2
 336   %w = load i32, i32 addrspace(1)* %in.gep.3
 337
 338   store i32 %x, i32 addrspace(1)* %out.gep.0
 339   store i32 %y, i32 addrspace(1)* %out.gep.1
 340   store i32 %z, i32 addrspace(1)* %out.gep.2
 341   store i32 %w, i32 addrspace(1)* %out.gep.3
 342   ret void
 343 }
 344
 345 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
 346 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 347 ; GCN: s_barrier
 348 ; GCN: buffer_store_dwordx4 [[LOAD]]
     ; Loads four adjacent i32 elements in ascending order, calls the barrier
     ; intrinsic, then stores each value to its matching position in %out in
     ; descending address order. The checks above look for a dwordx4 load, an
     ; s_barrier, and a dwordx4 store of the same register range.
 349 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 350   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 351   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 352   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 353   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 354   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 355   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 356
 357   %x = load i32, i32 addrspace(1)* %in
 358   %y = load i32, i32 addrspace(1)* %in.gep.1
 359   %z = load i32, i32 addrspace(1)* %in.gep.2
 360   %w = load i32, i32 addrspace(1)* %in.gep.3
 361
 362   ; Make sure the barrier doesn't stop this
 363   tail call void @llvm.AMDGPU.barrier.local() #1
 364
     ; Element 3 is written first and element 0 last.
 365   store i32 %w, i32 addrspace(1)* %out.gep.3
 366   store i32 %z, i32 addrspace(1)* %out.gep.2
 367   store i32 %y, i32 addrspace(1)* %out.gep.1
 368   store i32 %x, i32 addrspace(1)* %out
 369
 370   ret void
 371 }
 372
 373 ; TODO: Re-packing of loaded register required. Maybe an IR pass
 374 ; should catch this?
 375
 376 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
 377 ; GCN: buffer_load_dword v
 378 ; GCN: buffer_load_dword v
 379 ; GCN: buffer_load_dword v
 380 ; GCN: buffer_load_dword v
 381 ; GCN: s_barrier
 382 ; GCN: buffer_store_dword v
 383 ; GCN: buffer_store_dword v
 384 ; GCN: buffer_store_dword v
 385 ; GCN: buffer_store_dword v
     ; Loads four adjacent i32 elements, calls the barrier intrinsic, then
     ; stores the values to %out in reversed element order. The checks above
     ; look for four dword loads, an s_barrier, and four dword stores.
 386 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 387   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 388   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 389   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 390   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 391   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 392   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 393
 394   %x = load i32, i32 addrspace(1)* %in
 395   %y = load i32, i32 addrspace(1)* %in.gep.1
 396   %z = load i32, i32 addrspace(1)* %in.gep.2
 397   %w = load i32, i32 addrspace(1)* %in.gep.3
 398
 399   ; Make sure the barrier doesn't stop this
 400   tail call void @llvm.AMDGPU.barrier.local() #1
 401
     ; Input element 3 lands in output element 0, element 2 in 1, and so on.
 402   store i32 %w, i32 addrspace(1)* %out
 403   store i32 %z, i32 addrspace(1)* %out.gep.1
 404   store i32 %y, i32 addrspace(1)* %out.gep.2
 405   store i32 %x, i32 addrspace(1)* %out.gep.3
 406
 407   ret void
 408 }
 409
 410 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
 411 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 412 ; GCN: buffer_store_dword [[LOAD]]
 413 ; GCN: s_endpgm
     ; Copies four adjacent i8 elements from %in to the same positions in %out.
     ; The load and the store of element 0 are both marked align 4. The checks
     ; above look for one dword load and one dword store of the same register
     ; before the end of the program.
 414 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 415   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 416   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 417   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 418   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 419   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 420   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 421
 422   %x = load i8, i8 addrspace(1)* %in, align 4
 423   %y = load i8, i8 addrspace(1)* %in.gep.1
 424   %z = load i8, i8 addrspace(1)* %in.gep.2
 425   %w = load i8, i8 addrspace(1)* %in.gep.3
 426
 427   store i8 %x, i8 addrspace(1)* %out, align 4
 428   store i8 %y, i8 addrspace(1)* %out.gep.1
 429   store i8 %z, i8 addrspace(1)* %out.gep.2
 430   store i8 %w, i8 addrspace(1)* %out.gep.3
 431   ret void
 432 }
 433
 434 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
 435 ; GCN: buffer_load_ubyte
 436 ; GCN: buffer_load_ubyte
 437 ; GCN: buffer_load_ubyte
 438 ; GCN: buffer_load_ubyte
 439 ; GCN: buffer_store_byte
 440 ; GCN: buffer_store_byte
 441 ; GCN: buffer_store_byte
 442 ; GCN: buffer_store_byte
 443 ; GCN: s_endpgm
     ; Same four-element i8 copy as the align-4 variant, but no load or store
     ; carries an explicit alignment. The checks above look for four ubyte
     ; loads followed by four byte stores before the end of the program.
 444 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 445   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 446   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 447   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 448   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 449   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 450   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 451
 452   %x = load i8, i8 addrspace(1)* %in
 453   %y = load i8, i8 addrspace(1)* %in.gep.1
 454   %z = load i8, i8 addrspace(1)* %in.gep.2
 455   %w = load i8, i8 addrspace(1)* %in.gep.3
 456
 457   store i8 %x, i8 addrspace(1)* %out
 458   store i8 %y, i8 addrspace(1)* %out.gep.1
 459   store i8 %z, i8 addrspace(1)* %out.gep.2
 460   store i8 %w, i8 addrspace(1)* %out.gep.3
 461   ret void
 462 }
 463
 464 ; This works once AA is enabled on the subtarget
 465 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 466 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 467 ; XGCN: buffer_store_dwordx4 [[LOAD]]
 468 ; GCN: buffer_store_dword v
 469 ; GCN: buffer_store_dword v
 470 ; GCN: buffer_store_dword v
 471 ; GCN: buffer_store_dword v
     ; Loads one <4 x i32> vector from %in, extracts its four elements, and
     ; stores them to consecutive i32 elements of %out in ascending order. The
     ; active checks above look for a dwordx4 load followed by four dword
     ; stores. The XGCN line uses a prefix that neither run line enables;
     ; presumably it records the desired single dwordx4 store -- confirm.
 472 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 473   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 474   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 475   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 476   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
 477
 478   %x = extractelement <4 x i32> %vec, i32 0
 479   %y = extractelement <4 x i32> %vec, i32 1
 480   %z = extractelement <4 x i32> %vec, i32 2
 481   %w = extractelement <4 x i32> %vec, i32 3
 482
 483   store i32 %x, i32 addrspace(1)* %out
 484   store i32 %y, i32 addrspace(1)* %out.gep.1
 485   store i32 %z, i32 addrspace(1)* %out.gep.2
 486   store i32 %w, i32 addrspace(1)* %out.gep.3
 487   ret void
 488 }
 489
 490 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
 491 ; GCN: ds_write_b8
 492 ; GCN: ds_write_b8
 493 ; GCN: s_endpgm
     ; addrspace(3) counterpart of the two-constant i8 test: %out+1 is written
     ; first, then %out with align 2. The checks above look for two ds_write_b8
     ; instructions followed by the end of the program.
 494 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
 495   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 496
     ; NOTE(review): 456 is outside the i8 range; presumably the IR parser
     ; truncates it -- confirm before relying on the stored value.
 497   store i8 123, i8 addrspace(3)* %out.gep.1
 498   store i8 456, i8 addrspace(3)* %out, align 2
 499   ret void
 500 }
 501
 502 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
 503 ; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
 504 ; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
 505 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
 506 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
 507 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
     ; Two constant i32 stores to adjacent addrspace(3) elements: 123 (0x7b) to
     ; element 1, then 456 (0x1c8) to element 0. The checks above bind the low
     ; register to 0x1c8 and the high register to 0x7b, then look for a single
     ; ds_write2_b32 of both registers with offset1:1 ending the line.
 508 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
 509   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 510
 511   store i32 123, i32 addrspace(3)* %out.gep.1
 512   store i32 456, i32 addrspace(3)* %out
 513   ret void
 514 }
 515
 516 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
 517 ; GCN: ds_write_b32
 518 ; GCN: ds_write_b32
 519 ; GCN: ds_write_b32
 520 ; GCN: ds_write_b32
     ; Four constant i32 stores covering addrspace(3) elements 0..3, with
     ; element 0 written last. The checks above look for four ds_write_b32
     ; instructions.
 521 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 522   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 523   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 524   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 525
 526   store i32 123, i32 addrspace(3)* %out.gep.1
 527   store i32 456, i32 addrspace(3)* %out.gep.2
 528   store i32 333, i32 addrspace(3)* %out.gep.3
 529   store i32 1234, i32 addrspace(3)* %out
 530   ret void
 531 }
 532
 533 declare void @llvm.AMDGPU.barrier.local() #1
     ; The declaration above is the barrier intrinsic called between the loads
     ; and the stores of the inverse_i32 and four-element shuffle_i32 tests.
 534
     ; Attribute group #0 is attached to every function defined in this file;
     ; group #1 is attached to the barrier declaration and its call sites.
 535 attributes #0 = { nounwind }
 536 attributes #1 = { noduplicate nounwind }